<?php

namespace Alchemy\Phrasea\SearchEngine\Elastic;

use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndex;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndex;

class Index
{
    /**
     * @var array
     */
    private $analysis;

    /**
     * @var ElasticsearchOptions
     */
    private $options;

    /**
     * @var IndexLocator
     */
    private $indexLocator;

    /**
     * @param ElasticsearchOptions $options
     * @param IndexLocator $indexLocator
     */
    public function __construct(
        ElasticsearchOptions $options,
        IndexLocator $indexLocator
    ) {
        $this->options = $options;
        $this->indexLocator = $indexLocator;

        $this->buildDefaultAnalysis();
    }

    /**
     * Returns the index name (this is the same value as defined in ElasticsearchOptions).
     *
     * @return string
     */
    public function getName()
    {
        return $this->options->getIndexName();
    }

    /**
     * @return array
     */
    public function getAnalysis()
    {
        return $this->analysis;
    }

    /**
     * @return ElasticsearchOptions
     */
    public function getOptions()
    {
        return $this->options;
    }

    /**
     * @return RecordIndex
     */
    public function getRecordIndex()
    {
        return $this->indexLocator->getRecordIndex();
    }

    /**
     * @return TermIndex
     */
    public function getTermIndex()
    {
        return $this->indexLocator->getTermIndex();
    }

    /**
     * Builds the default "analysis" settings (analyzers, tokenizers and token filters)
     * used when the Elasticsearch index is created.
     */
    private function buildDefaultAnalysis()
    {
        $this->analysis = [
            'analyzer' => [
                // General purpose: no stop-word removal and no stemming, to improve meaning accuracy
                'general_light' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    // TODO Maybe replace nfkc_normalizer + asciifolding with icu_folding
                    'filter' => ['nfkc_normalizer', 'asciifolding']
                ],
                // Language-specific analyzers
                // (icu_tokenizer gives better support for some Asian languages and uses custom rules to break Myanmar and Khmer text)
                'fr_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'elision', 'stop_fr', 'stem_fr']
                ],
                'en_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_en', 'stem_en']
                ],
                'de_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_de', 'stem_de']
                ],
                'nl_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_nl', 'stem_nl_override', 'stem_nl']
                ],
                'es_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_es', 'stem_es']
                ],
                'ar_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ar', 'stem_ar']
                ],
                'ru_full' => [
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding', 'stop_ru', 'stem_ru']
                ],
                'cn_full' => [ // The standard Chinese analyzer is not exposed
                    'type' => 'custom',
                    'tokenizer' => 'icu_tokenizer',
                    'filter' => ['nfkc_normalizer', 'asciifolding']
                ],
                // Thesaurus specific
                'thesaurus_path' => [
                    'type' => 'custom',
                    'tokenizer' => 'thesaurus_path'
                ],
                // Thesaurus strict term lookup
                'thesaurus_term_strict' => [
                    'type' => 'custom',
                    'tokenizer' => 'keyword',
                    'filter' => 'nfkc_normalizer'
                ]
            ],
            'tokenizer' => [
                'thesaurus_path' => [
                    'type' => 'path_hierarchy'
                ]
            ],
            'filter' => [
                'nfkc_normalizer' => [ // weißkopfseeadler => weisskopfseeadler, ١٢٣٤٥ => 12345
                    'type' => 'icu_normalizer', // œ => oe, using the fewest bytes possible
                    'name' => 'nfkc_cf' // nfkc_cf also does the lowercasing
                ],
                'stop_fr' => [
                    'type' => 'stop',
                    'stopwords' => ['l', 'm', 't', 'qu', 'n', 's', 'j', 'd'],
                ],
                'stop_en' => [
                    'type' => 'stop',
                    'stopwords' => '_english_' // Use the Lucene default
                ],
                'stop_de' => [
                    'type' => 'stop',
                    'stopwords' => '_german_' // Use the Lucene default
                ],
                'stop_nl' => [
                    'type' => 'stop',
                    'stopwords' => '_dutch_' // Use the Lucene default
                ],
                'stop_es' => [
                    'type' => 'stop',
                    'stopwords' => '_spanish_' // Use the Lucene default
                ],
                'stop_ar' => [
                    'type' => 'stop',
                    'stopwords' => '_arabic_' // Use the Lucene default
                ],
                'stop_ru' => [
                    'type' => 'stop',
                    'stopwords' => '_russian_' // Use the Lucene default
                ],
                // See http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/analysis-stemmer-tokenfilter.html
                'stem_fr' => [
                    'type' => 'stemmer',
                    'name' => 'light_french',
                ],
                'stem_en' => [
                    'type' => 'stemmer',
                    'name' => 'english', // Porter stemming algorithm
                ],
                'stem_de' => [
                    'type' => 'stemmer',
                    'name' => 'light_german',
                ],
                'stem_nl' => [
                    'type' => 'stemmer',
                    'name' => 'dutch', // Snowball algorithm
                ],
                'stem_es' => [
                    'type' => 'stemmer',
                    'name' => 'light_spanish',
                ],
                'stem_ar' => [
                    'type' => 'stemmer',
                    'name' => 'arabic', // Lucene Arabic stemmer
                ],
                'stem_ru' => [
                    'type' => 'stemmer',
                    'name' => 'russian', // Snowball algorithm
                ],
                // Some custom stemming override rules
                'stem_nl_override' => [
                    'type' => 'stemmer_override',
                    'rules' => [
                        "fiets=>fiets",
                        "bromfiets=>bromfiets",
                        "ei=>eier",
                        "kind=>kinder"
                    ]
                ]
            ],
        ];
    }
}
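
// Usage sketch (illustrative only, not part of the class): the analysis settings built
// above are intended to be sent as the index "settings.analysis" when the Elasticsearch
// index is created. The client variable and request layout below are assumptions based
// on the official elasticsearch-php client; the actual wiring in Phraseanet may differ.
//
//     $index = new Index($options, $indexLocator);
//
//     $params = [
//         'index' => $index->getName(),
//         'body'  => [
//             'settings' => [
//                 'analysis' => $index->getAnalysis(),
//             ],
//         ],
//     ];
//
//     $client->indices()->create($params); // $client: an \Elasticsearch\Client instance (assumed)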