client = $client; $this->options = $options; $this->termIndexer = $termIndexer; $this->recordIndexer = $recordIndexer; $this->logger = $logger; } public function createIndex($withMapping = true) { $params = array(); $params['index'] = $this->options['index']; $params['body']['settings']['number_of_shards'] = $this->options['shards']; $params['body']['settings']['number_of_replicas'] = $this->options['replicas']; $params['body']['settings']['analysis'] = $this->getAnalysis();; if ($withMapping) { $params['body']['mappings'][RecordIndexer::TYPE_NAME] = $this->recordIndexer->getMapping(); $params['body']['mappings'][TermIndexer::TYPE_NAME] = $this->termIndexer->getMapping(); } $this->client->indices()->create($params); } public function updateMapping() { $params = array(); $params['index'] = $this->options['index']; $params['type'] = RecordIndexer::TYPE_NAME; $params['body'][RecordIndexer::TYPE_NAME] = $this->recordIndexer->getMapping(); // @todo Add term mapping // @todo This must throw a new indexation if a mapping is edited $this->client->indices()->putMapping($params); } public function deleteIndex() { $params = array('index' => $this->options['index']); $this->client->indices()->delete($params); } public function indexExists() { $params = array('index' => $this->options['index']); return $this->client->indices()->exists($params); } public function populateIndex() { $stopwatch = new Stopwatch(); $stopwatch->start('populate'); $this->disableShardRefreshing(); try { // Prepare the bulk operation $bulk = new BulkOperation($this->client); $bulk->setDefaultIndex($this->options['index']); $bulk->setAutoFlushLimit(1000); $this->termIndexer->populateIndex($bulk); // Record indexing depends on indexed terms so we need to flush // between the two operations $bulk->flush(); // Make everything ready to search $this->client->indices()->refresh(); $this->recordIndexer->populateIndex($bulk); // Final flush $bulk->flush(); // Optimize index $params = array('index' => $this->options['index']); $this->client->indices()->optimize($params); $this->restoreShardRefreshing(); } catch (\Exception $e) { $this->restoreShardRefreshing(); throw $e; } $event = $stopwatch->stop('populate'); printf("Indexation finished in %s min (Mem. %s)", ($event->getDuration()/1000/60), $event->getMemory()); } private function disableShardRefreshing() { $refreshInterval = $this->getSetting(self::REFRESH_INTERVAL_KEY); if (null !== $refreshInterval) { $this->previousRefreshInterval = $refreshInterval; } $this->setSetting(self::REFRESH_INTERVAL_KEY, "30s"); } private function restoreShardRefreshing() { $this->setSetting(self::REFRESH_INTERVAL_KEY, $this->previousRefreshInterval); $this->previousRefreshInterval = self::DEFAULT_REFRESH_INTERVAL; } private function getSetting($name) { $index = $this->options['index']; $params = array(); $params['index'] = $index; $params['name'] = $name; $params['flat_settings'] = true; $response = $this->client->indices()->getSettings($params); return igorw\get_in($response, [$index, 'settings', $name]); } private function setSetting($name, $value) { $index = $this->options['index']; $params = array(); $params['index'] = $index; $params['body'][$name] = $value; $response = $this->client->indices()->putSettings($params); return igorw\get_in($response, ['acknowledged']); } /** * Editing this configuration must be followed by a full re-indexation * @return array */ private function getAnalysis() { return [ 'analyzer' => [ // General purpose, without removing stop word or stem: improve meaning accuracy 'general_light' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding'] ], // Lang specific 'fr_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', // better support for some Asian languages and using custom rules to break Myanmar and Khmer text. 'filter' => ['nfkc_normalizer', 'asciifolding', 'elision', 'stop_fr', 'stem_fr'] ], 'en_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_en', 'stem_en'] ], 'de_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_de', 'stem_de'] ], 'nl_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_nl', 'stem_nl_override', 'stem_nl'] ], 'es_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_es', 'stem_es'] ], 'ar_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ar', 'stem_ar'] ], 'ru_full' => [ 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ru', 'stem_ru'] ], 'cn_full' => [ // Standard chinese analyzer is not exposed 'type' => 'custom', 'tokenizer' => 'icu_tokenizer', 'filter' => ['nfkc_normalizer', 'asciifolding'] ], // Thesaurus specific 'thesaurus_path' => [ 'type' => 'custom', 'tokenizer' => 'thesaurus_path' ] ], 'tokenizer' => [ 'thesaurus_path' => [ 'type' => 'path_hierarchy' ] ], 'filter' => [ 'nfkc_normalizer' => [ // weißkopfseeadler => weisskopfseeadler, ١٢٣٤٥ => 12345. 'type' => 'icu_normalizer', // œ => oe, and use the fewest bytes possible. 'name' => 'nfkc_cf' // nfkc_cf do the lowercase job too. ], 'stop_fr' => [ 'type' => 'stop', 'stopwords' => ['l', 'm', 't', 'qu', 'n', 's', 'j', 'd'], ], 'stop_en' => [ 'type' => 'stop', 'stopwords' => '_english_' // Use the Lucene default ], 'stop_de' => [ 'type' => 'stop', 'stopwords' => '_german_' // Use the Lucene default ], 'stop_nl' => [ 'type' => 'stop', 'stopwords' => '_dutch_' // Use the Lucene default ], 'stop_es' => [ 'type' => 'stop', 'stopwords' => '_spanish_' // Use the Lucene default ], 'stop_ar' => [ 'type' => 'stop', 'stopwords' => '_arabic_' // Use the Lucene default ], 'stop_ru' => [ 'type' => 'stop', 'stopwords' => '_russian_' // Use the Lucene default ], // See http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html 'stem_fr' => [ 'type' => 'stemmer', 'name' => 'light_french', ], 'stem_en' => [ 'type' => 'stemmer', 'name' => 'english', // Porter stemming algorithm ], 'stem_de' => [ 'type' => 'stemmer', 'name' => 'light_german', ], 'stem_nl' => [ 'type' => 'stemmer', 'name' => 'dutch', // Snowball algo ], 'stem_es' => [ 'type' => 'stemmer', 'name' => 'light_spanish', ], 'stem_ar' => [ 'type' => 'stemmer', 'name' => 'arabic', // Lucene Arabic stemmer ], 'stem_ru' => [ 'type' => 'stemmer', 'name' => 'russian', // Snowball algo ], // Some custom rules 'stem_nl_override' => [ 'type' => 'stemmer_override', 'rules' => [ "fiets=>fiets", "bromfiets=>bromfiets", "ei=>eier", "kind=>kinder" ] ] ], ]; } }