analyzer_name, analyzer, tokenizer, stop words, stemming 'ar' => array( 'name' => 'ar_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_arabic_', 'stemming' => null ), 'bg' => array( 'name' => 'bg_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_bulgarian_', 'stemming' => null ), //Not detected by ES langdetect, but left in (detected as spanish) 'ca' => array( 'name' => 'ca_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_catalan_', 'stemming' => null ), 'cs' => array( 'name' => 'cs_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_czech_', 'stemming' => null ), 'da' => array( 'name' => 'da_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_danish_', 'stemming' => null ), 'de' => array( //removes plurals only 'name' => 'de_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_german_', 'stemming' => 'minimal_german' ), 'el' => array( 'name' => 'el_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_greek_', 'stemming' => null ), 'en' => array( //removes plurals only 'name' => 'en_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_english_', 'stemming' => 'minimal_english' ), 'es' => array( //removes plurals and masc/fem 'name' => 'es_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_spanish_', 'stemming' => 'light_spanish' ), //Not detected by ES langdetect, but left in 'eu' => array( 'name' => 'eu_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_basque_', 'stemming' => null ), 'fa' => array( 'name' => 'fa_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_persian_', 'stemming' => null ), 'fi' => array( //removes plurals and masc/fem 'name' => 'fi_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_finnish_', 'stemming' => 'light_finish' //sic in ES ), 'fr' => array( //removes plurals only 'name' => 'fr_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_french_', 'stemming' => 'minimal_french' ), 'he' => array( //stopwords added by get_hebrew_stopwords() 'name' => 'he_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => null, 'stemming' => null, ), 'hi' => array( 'name' => 'hi_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_hindi_', 'stemming' => null ), 'hu' => array( //removes plurals and masc/fem? 'name' => 'hu_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_hungarian_', 'stemming' => 'light_hungarian' ), //Not detected by ES langdetect 'hy' => array( 'name' => 'hy_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_armenian_', 'stemming' => null ), 'id' => array( 'name' => 'id_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_indonesian_', 'stemming' => null ), 'it' => array( //removes plurals and masc/fem 'name' => 'it_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_italian_', 'stemming' => 'light_italian' ), 'ja' => array( 'name' => 'ja_analyzer', 'analyzer' => 'custom', 'tokenizer' => "kuromoji_tokenizer", 'stopwords' => null, 'stemming' => null ), 'ko' => array( 'name' => 'ko_analyzer', 'analyzer' => 'cjk', 'tokenizer' => null, 'stopwords' => null, 'stemming' => null ), 'nl' => array( 'name' => 'nl_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_dutch_', 'stemming' => null ), 'no' => array( 'name' => 'no_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_norwegian_', 'stemming' => null ), 'pt' => array( //removes plurals only 'name' => 'pt_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_portuguese_', 'stemming' => 'minimal_portuguese' ), 'ro' => array( 'name' => 'ro_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_romanian_', 'stemming' => null ), 'ru' => array( //removes plurals and masc/fem 'name' => 'ru_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_russian_', 'stemming' => 'light_russian' ), 'sv' => array( //removes plurals and masc/fem? 'name' => 'sv_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_swedish_', 'stemming' => 'light_swedish' ), 'tr' => array( 'name' => 'tr_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => '_turkish_', 'stemming' => null ), 'zh' => array( //uses a hidden markov model and dictionary to segment into words 'name' => 'zh_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'smartcn_tokenizer', 'stopwords' => null, 'stemming' => null ), // Non Language based 'lowercase' => array( 'name' => 'lowercase_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'keyword', 'stopwords' => null, 'stemming' => null, ), 'ngram' => array( 'name' => 'ngram_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => null, 'stemming' => null, ), 'edgengram' => array( 'name' => 'edgengram_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => null, 'stemming' => null, ), 'edgengram_raw' => array( 'name' => 'edgengram_raw_analyzer', 'analyzer' => 'custom', 'tokenizer' => 'keyword', 'stopwords' => null, 'stemming' => null, ), 'default' => array( //general analyzer used as the default for search (works pretty well across langs) 'name' => 'default', 'analyzer' => 'custom', 'tokenizer' => 'icu_tokenizer', 'stopwords' => null, 'stemming' => null ), ); //The language is supported by the elasticsearch-langdetect plugin // see: https://github.com/jprante/elasticsearch-langdetect var $has_lang_detection = array( 'af' => true, 'ar' => true, 'bg' => true, 'bn' => true, 'cs' => true, 'da' => true, 'de' => true, 'el' => true, 'en' => true, 'es' => true, 'et' => true, 'fa' => true, 'fi' => true, 'fr' => true, 'gu' => true, 'he' => true, 'hi' => true, 'hr' => true, 'hu' => true, 'id' => true, 'it' => true, 'ja' => true, 'kn' => true, 'ko' => true, 'lt' => true, 'lv' => true, 'mk' => true, 'ml' => true, 'mr' => true, 'ne' => true, 'nl' => true, 'no' => true, 'pa' => true, 'pl' => true, 'pt' => true, 'ro' => true, 'ru' => true, 'sk' => true, 'sl' => true, 'so' => true, 'sq' => true, 'sv' => true, 'sw' => true, 'ta' => true, 'te' => true, 'th' => true, 'tl' => true, 'tr' => true, 'uk' => true, 'ur' => true, 'vi' => true, 'zh' => true ); static $instance; function __construct() { } function set_es_ver( $ver ) { $ver = (int)$ver; // Reset ver flags $this->_is_es5plus = false; if ( $ver >= 5 ) { $this->_is_es5plus = true; } return $this; } function init() { if ( ! self::$instance ) { self::$instance = new Jetpack__To_UTF8; } return self::$instance; } function get_analyzer_name( $lang ) { if ( isset( $this->supported_languages[$lang] ) ) return $this->supported_languages[$lang]['name']; //handle special re-mappings switch( $lang ) { case 'zh-cn': //Simplified Chinese case 'zh-tw': //Traditional Chinese (or Taiwan?) case 'zh-hk': //Hong Kong Chinese return $this->supported_languages['zh']['name']; //Traditional Chinese } return $this->supported_languages['default']['name']; } function get_analyzer_id( $lang ) { // An ID is the first two characters of a $lang_code. The ID must be present // in supported_languages or be a special re-mapping. if ( isset( $this->supported_languages[$lang] ) ) return $lang; //handle special re-mappings switch( $lang ) { case 'zh-cn': //Simplified Chinese case 'zh-tw': //Traditional Chinese (or Taiwan?) case 'zh-hk': //Hong Kong Chinese return 'zh'; //Traditional Chinese } return 'default'; } function can_detect_lang( $lang ) { $normalized = strtok( $lang, '-_' ); return isset( $this->has_lang_detection[$normalized] ) && $this->has_lang_detection[$normalized]; } //build a list of analyzers for an ES index function build_analyzers( $langs = array(), $with_stop_words = true ) { if ( empty( $langs ) ) { $langs = array_keys( $this->supported_languages ); } $settings = array( 'filter' => array(), 'analyzer' => array() ); //$settings['filter']['bigram_filter'] = array( // 'type' => 'shingle', // 'min_shingle_size' => 2, // 'max_shingle_size' => 2, // 'output_unigrams' => true //); //japanese needs custom tokenizer if ( in_array( 'ja', $langs ) ) { $settings['tokenizer'] = array( 'kuromoji' => array( 'type' => 'kuromoji_tokenizer', 'mode' => 'search' ) ); } //ngram needs custom filters if ( in_array( 'ngram', $langs ) ) { $settings['filter']['unique_filter'] = array( 'type' => 'unique', 'only_on_same_position' => true ); $settings['filter']['ngram_filter'] = array( 'type' => 'nGram', 'min_gram' => '3', 'max_gram' => '5', ); } if ( in_array( 'edgengram', $langs ) ) { $settings['filter']['unique_filter'] = array( 'type' => 'unique', 'only_on_same_position' => true ); $settings['filter']['edgengram_filter'] = array( 'type' => 'edgeNGram', 'min_gram' => '3', 'max_gram' => '15', ); } foreach ( $langs as $lang ) { $config = $this->supported_languages[$lang]; $settings['analyzer'][ $config['name'] ] = array( 'type' => $config['analyzer'], 'filter' => array(), ); //////////////////////////////// // Lang specific customizations if ( 'ja' == $lang ) { ////From: http://tech.gmo-media.jp/post/70245090007/elasticsearch-kuromoji-japanese-fulltext-search $settings['analyzer'][ $config['name'] ]['tokenizer'] = $config['tokenizer']; $settings['filter'][$lang . '_pos_filter'] = array( 'type' => "kuromoji_part_of_speech", 'stoptags' => array( "助詞-格助詞-一般", "助詞-終助詞" ), ); $settings['analyzer'][ $config['name'] ]['filter'][] = 'kuromoji_baseform'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'ja_pos_filter'; //stopwords $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_normalizer'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_folding'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'cjk_width'; continue; } if ( 'de' == $lang ) { ////From: http://gibrown.wordpress.com/2013/05/01/three-principles-for-multilingal-indexing-in-elasticsearch/#comment-857 $settings['analyzer'][ $config['name'] ]['tokenizer'] = $config['tokenizer']; if ( $with_stop_words ) { $settings['filter'][$lang . '_stop_filter'] = array( 'type' => 'stop', 'stopwords' => array( $config['stopwords'] ) ); } $settings['filter'][ $lang . '_stem_filter' ] = array( 'type' => 'stemmer', 'name' => $config['stemming'] ); $settings['char_filter'][$lang . '_char_filter'] = array( 'type' => "mapping", 'mappings' => array( 'ß=>ss', 'Ä=>ae', 'ä=>ae', 'Ö=>oe', 'ö=>oe', 'Ü=>ue', 'ü=>ue', 'ph=>f' ), ); $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_normalizer'; if ( $with_stop_words ) { $settings['analyzer'][ $config['name'] ]['filter'][] = $lang . '_stop_filter'; } $settings['analyzer'][ $config['name'] ]['filter'][] = $lang . '_stem_filter'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_folding'; $settings['analyzer'][ $config['name'] ]['char_filter'] = array( $lang . '_char_filter' ); continue; } if ( preg_match( "/ngram/", $lang ) ) { // Based partly off of https://qbox.io/blog/an-introduction-to-ngrams-in-elasticsearch // but we need Asian langs, so we want to tokenize with icu tokenizer // and then duplicate those tokens before creating ngrams so we can match on single chars // which may be words // We use ngrams rather than edge-ngrams when we want to look within urls, usernames, etc // eg photomatt (match Matt), www.nytimes.com (match nytimes), etc $settings['analyzer'][ $config['name'] ]['tokenizer'] = $config['tokenizer']; $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_normalizer'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_folding'; $settings['analyzer'][ $config['name'] ]['filter'][] = 'keyword_repeat'; if ( 'ngram' === $lang ) { $settings['analyzer'][ $config['name'] ]['filter'][] = 'ngram_filter'; } elseif ( ( 'edgengram' === $lang ) || ( 'edgengram_raw' === $lang ) ) { $settings['analyzer'][ $config['name'] ]['filter'][] = 'edgengram_filter'; } $settings['analyzer'][ $config['name'] ]['filter'][] = 'unique_filter'; //remove dupes at the same location continue; } ///////////////////////////////////////////////// //First filter is normalization // normalization needs to be before stopwords so we combine UTF-8 characters (eg ê) if ( $config['tokenizer'] ) { $settings['analyzer'][ $config['name'] ]['tokenizer'] = $config['tokenizer']; $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_normalizer'; } ////////////// //Stopwords if ( 'he' == $lang && $with_stop_words ) { //hebrew has its own custom stopword list (no built in ES one) $settings['filter'][$lang . '_stop_filter'] = array( 'type' => 'stop', 'stopwords' => $this->get_hebrew_stopwords() ); $settings['analyzer'][ $config['name'] ]['filter'][] = $lang . '_stop_filter'; } if ( 'fr' == $lang ) { //French has elision's that need to be removed $settings['analyzer'][ $config['name'] ]['filter'][] = 'elision'; } if ( 'fr' == $lang ) { //French has elision's that need to be removed $settings['analyzer'][ $config['name'] ]['filter'][] = 'elision'; } if ( $config['stopwords'] && $with_stop_words ) { $settings['filter'][$lang . '_stop_filter'] = array( 'type' => 'stop', 'stopwords' => array( $config['stopwords'] ) ); $settings['analyzer'][ $config['name'] ]['filter'][] = $lang . '_stop_filter'; } //////////////// // Stemming if ( $config['stemming'] ) { $settings['filter'][ $lang . '_stem_filter' ] = array( 'type' => 'stemmer', 'name' => $config['stemming'] ); $settings['analyzer'][ $config['name'] ]['filter'][] = $lang . '_stem_filter'; } ///////////////////////////////////////////////// //final filters (character folding and bigrams) // character folding must be after stopwords if ( $config['tokenizer'] ) { $settings['analyzer'][ $config['name'] ]['filter'][] = 'icu_folding'; //if ( 'lowercase' != $config['name'] ) // $settings['analyzer'][ $config['name'] ]['filter'][] = 'bigram_filter'; } } if ( $this->_is_es5plus ) { $settings['normalizer'] = array( 'lowercase_normalizer' => array( "type" => "custom", "char_filter" => array(), "filter" => array( "lowercase" ), ) ); } return $settings; } //build a list of experimental analyzers for JP Search function build_jp_search_analyzers() { $langs = $this->supported_languages; unset( $langs['ngram'] ); unset( $langs['edgengram'] ); unset( $langs['edgengram_raw'] ); $settings = [ 'filter' => [ 'possessive_filter' => [ "type" => "stemmer", "name" => "possessive_english" ], 'unique_filter' => [ 'type' => 'unique', 'only_on_same_position' => true ], 'edgengram_1_10_filter' => [ 'type' => 'edgeNGram', 'min_gram' => '1', 'max_gram' => '10', ], 'edgengram_1_3_filter' => [ 'type' => 'edgeNGram', 'min_gram' => '1', 'max_gram' => '3', ], 'edgengram_3_15_filter' => [ 'type' => 'edgeNGram', 'min_gram' => '3', 'max_gram' => '15', ], 'ja_pos_filter' => [ 'type' => "kuromoji_part_of_speech", 'stoptags' => array( "助詞-格助詞-一般", "助詞-終助詞" ), ], 'greek_lowercase_filter' => [ "type" => "lowercase", "language" => "greek" //use greek so it will remove accents, etc ], 'he_stop_filter' => [ //hebrew has its own custom stopword list (no built in ES one) 'type' => 'stop', 'stopwords' => $this->get_hebrew_stopwords() ], ], 'char_filter' => [ 'de_char_filter' => [ 'type' => "mapping", 'mappings' => array( 'ß=>ss', 'Ä=>ae', 'ä=>ae', 'Ö=>oe', 'ö=>oe', 'Ü=>ue', 'ü=>ue', 'ph=>f' ), ], ], 'tokenizer' => [ 'kuromoji' => [ 'type' => 'kuromoji_tokenizer', 'mode' => 'search' ], ], 'analyzer' => [ ] ]; foreach ( $langs as $lang => $config ) { $name = $lang . '_analyzer'; $ename = $lang . '_edge_analyzer'; if ( $lang == 'default' ) { $name = 'default'; } $settings['analyzer'][ $name ] = [ 'type' => $config['analyzer'], 'filter' => [], ]; $settings['analyzer'][ $ename ] = [ 'type' => 'custom', 'filter' => [], ]; //add per lang filters if ( ! empty( $config['stopwords'] ) ) { $settings['filter'][$lang . '_stop_filter'] = array( 'type' => 'stop', 'stopwords' => array( $config['stopwords'] ) ); } if ( ! empty( $config['stemming'] ) ) { $settings['filter'][ $lang . '_stem_filter' ] = array( 'type' => 'stemmer', 'name' => $config['stemming'] ); } //////////////////////////////// // Lang specific customizations if ( 'ja' == $lang ) { ////From: http://tech.gmo-media.jp/post/70245090007/elasticsearch-kuromoji-japanese-fulltext-search $settings['analyzer'][ $name ]['tokenizer'] = 'kuromoji'; $settings['analyzer'][ $name ]['filter'][] = 'kuromoji_baseform'; $settings['analyzer'][ $name ]['filter'][] = 'ja_pos_filter'; //stopwords //full ICU will remove too much from ja chars $settings['analyzer'][ $name ]['filter'][] = 'greek_lowercase_filter'; $settings['analyzer'][ $name ]['filter'][] = 'cjk_width'; //edgengram identical except one extra filter $settings['analyzer'][ $ename ] = $settings['analyzer'][ $name ]; $settings['analyzer'][ $ename ]['filter'][] = 'edgengram_1_10_filter'; continue; //very different from others, so break the loop } /////////////////////////// // Custom Char Filters if ( 'de' == $lang ) { ////From: http://gibrown.wordpress.com/2013/05/01/three-principles-for-multilingal-indexing-in-elasticsearch/#comment-857 $settings['analyzer'][ $name ]['char_filter'] = array( $lang . '_char_filter' ); } ///////////////////////////////////////////////// //First filter is normalization // normalization needs to be before stopwords so we combine UTF-8 characters (eg ê) if ( $config['tokenizer'] ) { $settings['analyzer'][ $name ]['tokenizer'] = $config['tokenizer']; $settings['analyzer'][ $name ]['filter'][] = 'icu_normalizer'; } //////////////////// //Custom Filters if ( 'fr' == $lang ) { //French has elision's that need to be removed $settings['analyzer'][ $name ]['filter'][] = 'elision'; } ////////////// //Stopwords if ( 'he' == $lang ) { $settings['analyzer'][ $name ]['filter'][] = $lang . '_stop_filter'; } if ( $config['stopwords'] ) { $settings['analyzer'][ $name ]['filter'][] = $lang . '_stop_filter'; } ////////////// //Stemming if ( in_array( $lang, [ 'en', 'default' ] ) ) { $settings['analyzer'][ $name ]['filter'][] = 'possessive_filter'; } if ( $config['stemming'] ) { $settings['analyzer'][ $name ]['filter'][] = $lang . '_stem_filter'; } ///////////////////////////////////////////////// //final filters (character folding and bigrams) // character folding must be after stopwords so we correctly filter out words with accents if ( $lang != 'ko' ) { $settings['analyzer'][ $name ]['filter'][] = 'icu_folding'; } //////////////// // Add edgengram //edgengram identical except one extra filter switch( $lang ) { case 'ko': // this is almost certainly wrong, probably need nori analyzer $settings['analyzer'][ $ename ] = $settings['analyzer'][ $name ]; break; case 'zh': // probably wrong $settings['analyzer'][ $ename ] = $settings['analyzer'][ $name ]; $settings['analyzer'][ $ename ]['filter'][] = 'edgengram_1_3_filter'; break; default: $settings['analyzer'][ $ename ] = $settings['analyzer'][ $name ]; $settings['analyzer'][ $ename ]['filter'][] = 'edgengram_3_15_filter'; } } if ( $this->_is_es5plus ) { $settings['normalizer'] = array( 'lowercase_normalizer' => array( "type" => "custom", "char_filter" => array(), "filter" => array( "lowercase" ), ) ); } return $settings; } //hebrew stopwords taken from http://wiki.korotkin.co.il/Hebrew_stopwords // removed the two word "stopwords" per @ranh function get_hebrew_stopwords() { $stopwords = array( 'אבל', 'או', 'אולי', 'אותה', 'אותה', 'אותו', 'אותו', 'אותו', 'אותי', 'אותך', 'אותם', 'אותן', 'אותנו', 'אז', 'אחר', 'אחר', 'אחרות', 'אחרי', 'אחרי', 'אחרי', 'אחרים', 'אחרת', 'אי', 'איזה', 'איך', 'אין', 'אין', 'איפה', 'איתה', 'איתו', 'איתי', 'איתך', 'איתכם', 'איתכן', 'איתם', 'איתן', 'איתנו', 'אך', 'אך', 'אל', 'אל', 'אלה', 'אלה', 'אלו', 'אלו', 'אם', 'אם', 'אנחנו', 'אני', 'אס', 'אף', 'אצל', 'אשר', 'אשר', 'את', 'את', 'אתה', 'אתכם', 'אתכן', 'אתם', 'אתן', 'באמצע', 'באמצעות', 'בגלל', 'בין', 'בלי', 'בלי', 'במידה', 'ברם', 'בשביל', 'בתוך', 'גם', 'דרך', 'הוא', 'היא', 'היה', 'היכן', 'היתה', 'היתי', 'הם', 'הן', 'הנה', 'הרי', 'ואילו', 'ואת', 'זאת', 'זה', 'זה', 'זות', 'יהיה', 'יוכל', 'יוכלו', 'יותר', 'יכול', 'יכולה', 'יכולות', 'יכולים', 'יכל', 'יכלה', 'יכלו', 'יש', 'כאן', 'כאשר', 'כולם', 'כולן', 'כזה', 'כי', 'כיצד', 'כך', 'ככה', 'כל', 'כלל', 'כמו', 'כמו', 'כן', 'כן', 'כפי', 'כש', 'לא', 'לאו', 'לאן', 'לבין', 'לה', 'להיות', 'להם', 'להן', 'לו', 'לי', 'לכם', 'לכן', 'לכן', 'למה', 'למטה', 'למעלה', 'למרות', 'למרות', 'לנו', 'לעבר', 'לעיכן', 'לפיכך', 'לפני', 'לפני', 'מאד', 'מאחורי', 'מאין', 'מאיפה', 'מבלי', 'מבעד', 'מדוע', 'מדי', 'מה', 'מהיכן', 'מול', 'מחוץ', 'מי', 'מכאן', 'מכיוון', 'מלבד', 'מן', 'מנין', 'מסוגל', 'מעט', 'מעטים', 'מעל', 'מעל', 'מצד', 'מתחת', 'מתחת', 'מתי', 'נגד', 'נגר', 'נו', 'עד', 'עד', 'עז', 'על', 'על', 'עלי', 'עליה', 'עליהם', 'עליהן', 'עליו', 'עליך', 'עליכם', 'עלינו', 'עם', 'עם', 'עצמה', 'עצמהם', 'עצמהן', 'עצמו', 'עצמי', 'עצמם', 'עצמן', 'עצמנו', 'פה', 'רק', 'רק', 'שוב', 'שוב', 'של', 'שלה', 'שלהם', 'שלהן', 'שלו', 'שלי', 'שלך', 'שלכם', 'שלכן', 'שלנו', 'שם', 'תהיה', 'תחת', ); return $stopwords; } }