Add elision filter to the French analyzer and fix missed config

This commit is contained in:
Damien Alexandre
2014-09-08 15:50:47 +02:00
parent c488e259f4
commit f535aaf3ad
2 changed files with 13 additions and 13 deletions

View File

@@ -345,7 +345,7 @@ class ElasticSearchEngine implements SearchEngineInterface
* {@inheritdoc} * {@inheritdoc}
* *
* @todo Allow multiple hosts! * @todo Allow multiple hosts!
* @return ElastcSearchEngine * @return \Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine
*/ */
public static function create(Application $app, array $options = []) public static function create(Application $app, array $options = [])
{ {

View File

@@ -49,7 +49,6 @@ class Indexer
$params['index'] = $this->options['index']; $params['index'] = $this->options['index'];
$params['body']['settings']['number_of_shards'] = $this->options['shards']; $params['body']['settings']['number_of_shards'] = $this->options['shards'];
$params['body']['settings']['number_of_replicas'] = $this->options['replicas']; $params['body']['settings']['number_of_replicas'] = $this->options['replicas'];
$params['body']['settings']['analysis'] = $this->getAnalysis();; $params['body']['settings']['analysis'] = $this->getAnalysis();;
if ($withMapping) { if ($withMapping) {
@@ -57,6 +56,7 @@ class Indexer
$params['body']['mappings'][self::TYPE_RECORD] = $this->getRecordMapping(); $params['body']['mappings'][self::TYPE_RECORD] = $this->getRecordMapping();
$params['body']['mappings'][self::TYPE_TERM] = $this->getTermMapping(); $params['body']['mappings'][self::TYPE_TERM] = $this->getTermMapping();
} }
$this->client->indices()->create($params); $this->client->indices()->create($params);
} }
@@ -261,7 +261,7 @@ class Indexer
// Business rules // Business rules
$field['private'] = $fieldStructure->isBusiness(); $field['private'] = $fieldStructure->isBusiness();
$field['indexable'] = $fieldStructure->is_indexable(); $field['indexable'] = $fieldStructure->is_indexable();
$field['to_aggregate'] = false; // @todo $field['to_aggregate'] = false; // @todo, dev in progress
$name = $fieldStructure->get_name(); $name = $fieldStructure->get_name();
@@ -359,54 +359,54 @@ class Indexer
'general_light' => [ 'general_light' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase'] 'filter' => ['nfkc_normalizer', 'asciifolding']
], ],
// Lang specific // Lang specific
'fr_full' => [ 'fr_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', // better support for some Asian languages and using custom rules to break Myanmar and Khmer text. 'tokenizer' => 'icu_tokenizer', // better support for some Asian languages and using custom rules to break Myanmar and Khmer text.
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_fr', 'stem_fr'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'elision', 'stop_fr', 'stem_fr']
], ],
'en_full' => [ 'en_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_en', 'stem_en'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_en', 'stem_en']
], ],
'de_full' => [ 'de_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_de', 'stem_de'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_de', 'stem_de']
], ],
'nl_full' => [ 'nl_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_nl', 'stem_nl_override', 'stem_nl'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_nl', 'stem_nl_override', 'stem_nl']
], ],
'es_full' => [ 'es_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_es', 'stem_es'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_es', 'stem_es']
], ],
'ar_full' => [ 'ar_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_ar', 'stem_ar'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ar', 'stem_ar']
], ],
'ru_full' => [ 'ru_full' => [
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase', 'stop_ru', 'stem_ru'] 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ru', 'stem_ru']
], ],
'cn_full' => [ // Standard chinese analyzer is not exposed 'cn_full' => [ // Standard chinese analyzer is not exposed
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'lowercase'] 'filter' => ['nfkc_normalizer', 'asciifolding']
] ]
], ],
'filter' => [ 'filter' => [
'nfkc_normalizer' => [ // weißkopfseeadler => weisskopfseeadler, ١٢٣٤٥ => 12345. 'nfkc_normalizer' => [ // weißkopfseeadler => weisskopfseeadler, ١٢٣٤٥ => 12345.
'type' => 'icu_normalizer', // œ => oe, and use the fewest bytes possible. 'type' => 'icu_normalizer', // œ => oe, and use the fewest bytes possible.
'name' => 'nfkc_cf' // nfkc_cf do the asciifolding job too. 'name' => 'nfkc_cf' // nfkc_cf do the lowercase job too.
], ],
'stop_fr' => [ 'stop_fr' => [