mirror of
https://github.com/alchemy-fr/Phraseanet.git
synced 2025-10-24 10:23:17 +00:00
Merge pull request #23 from mdarse/thesaurus-relevance
Enhance thesaurus search relevance
This commit is contained in:
@@ -13,6 +13,8 @@ namespace Alchemy\Phrasea\Command\Thesaurus;
|
||||
|
||||
use Alchemy\Phrasea\Command\Command;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
|
||||
use Symfony\Component\Console\Input\InputArgument;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
@@ -30,12 +32,23 @@ class FindConceptsCommand extends Command
|
||||
InputArgument::REQUIRED,
|
||||
'Reverse search a term to infer concepts'
|
||||
)
|
||||
->addArgument(
|
||||
'context',
|
||||
InputArgument::OPTIONAL,
|
||||
'Restrict search to a specific term context'
|
||||
)
|
||||
->addOption(
|
||||
'locale',
|
||||
null,
|
||||
InputOption::VALUE_REQUIRED,
|
||||
'Specify input locale'
|
||||
)
|
||||
->addOption(
|
||||
'broad',
|
||||
null,
|
||||
InputOption::VALUE_NONE,
|
||||
'Keep broad concepts (discards narrower concepts)'
|
||||
)
|
||||
->addOption(
|
||||
'raw',
|
||||
null,
|
||||
@@ -48,16 +61,26 @@ class FindConceptsCommand extends Command
|
||||
protected function doExecute(InputInterface $input, OutputInterface $output)
|
||||
{
|
||||
$term = $input->getArgument('term');
|
||||
$context = $input->getArgument('context');
|
||||
$raw = $input->getOption('raw');
|
||||
|
||||
if (!$raw) {
|
||||
$output->writeln(sprintf('Finding linked concepts: <comment>%s</comment>', $term));
|
||||
$message = sprintf('Finding linked concepts: <comment>%s</comment>', $term);
|
||||
if ($context) {
|
||||
$message .= sprintf(' (with context <comment>%s</comment>)', $context);
|
||||
}
|
||||
$output->writeln($message);
|
||||
$output->writeln(str_repeat('-', 20));
|
||||
}
|
||||
|
||||
$thesaurus = $this->container['thesaurus'];
|
||||
$term = new Term($term, $context);
|
||||
$locale = $input->getOption('locale');
|
||||
$concepts = $thesaurus->findConcepts($term, null, $locale);
|
||||
$concepts = $thesaurus->findConcepts($term, $locale);
|
||||
|
||||
if ($input->getOption('broad')) {
|
||||
$concepts = Concept::pruneNarrowConcepts($concepts);
|
||||
}
|
||||
|
||||
if (count($concepts)) {
|
||||
$output->writeln($concepts);
|
||||
|
||||
@@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine;
|
||||
@@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
|
||||
|
||||
$app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) {
|
||||
return new RecordIndexer(
|
||||
$app['elasticsearch.record_helper'],
|
||||
$app['thesaurus'],
|
||||
$app['elasticsearch.engine'],
|
||||
$app['phraseanet.appbox'],
|
||||
@@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
|
||||
);
|
||||
});
|
||||
|
||||
$app['elasticsearch.record_helper'] = $app->share(function ($app) {
|
||||
return new RecordHelper($app['phraseanet.appbox']);
|
||||
});
|
||||
|
||||
$app['elasticsearch.client'] = $app->share(function($app) {
|
||||
$options = $app['elasticsearch.options'];
|
||||
$clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]];
|
||||
|
||||
@@ -2,14 +2,16 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class AndExpression extends BinaryOperator
|
||||
{
|
||||
protected $operator = 'AND';
|
||||
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
$left = $this->left->getQuery($fields);
|
||||
$right = $this->right->getQuery($fields);
|
||||
$left = $this->left->buildQuery($context);
|
||||
$right = $this->right->buildQuery($context);
|
||||
|
||||
return array(
|
||||
'bool' => array(
|
||||
|
||||
@@ -19,9 +19,11 @@ abstract class BinaryOperator extends Node
|
||||
return sprintf('(%s %s %s)', $this->left, $this->operator, $this->right);
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
public function getTextNodes()
|
||||
{
|
||||
return $this->left->isFullTextOnly()
|
||||
&& $this->right->isFullTextOnly();
|
||||
return array_merge(
|
||||
$this->left->getTextNodes(),
|
||||
$this->right->getTextNodes()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,14 +2,16 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class ExceptExpression extends BinaryOperator
|
||||
{
|
||||
protected $operator = 'EXCEPT';
|
||||
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
$left = $this->left->getQuery($fields);
|
||||
$right = $this->right->getQuery($fields);
|
||||
$left = $this->left->buildQuery($context);
|
||||
$right = $this->right->buildQuery($context);
|
||||
|
||||
return array(
|
||||
'bool' => array(
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class FieldNode extends Node
|
||||
{
|
||||
protected $keyword;
|
||||
@@ -16,18 +18,18 @@ class FieldNode extends Node
|
||||
return $this->keyword;
|
||||
}
|
||||
|
||||
public function getQuery()
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
throw new \LogicException("A keyword can't be converted to a query.");
|
||||
}
|
||||
|
||||
public function __toString()
|
||||
public function getTextNodes()
|
||||
{
|
||||
return sprintf('<%s>', $this->keyword);
|
||||
throw new \LogicException("A keyword can't contain text nodes.");
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
public function __toString()
|
||||
{
|
||||
return false;
|
||||
return sprintf('<field:%s>', $this->keyword);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class InExpression extends Node
|
||||
{
|
||||
protected $field;
|
||||
@@ -13,19 +15,20 @@ class InExpression extends Node
|
||||
$this->expression = $expression;
|
||||
}
|
||||
|
||||
public function getQuery()
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
return $this->expression->getQuery($this->field->getValue());
|
||||
$fields = array($this->field->getValue());
|
||||
|
||||
return $this->expression->buildQuery($context->narrowToFields($fields));
|
||||
}
|
||||
|
||||
/**
 * Returns the text nodes of the wrapped expression.
 *
 * Delegates directly to the inner expression's getTextNodes(); the field
 * part of the IN expression is not consulted here.
 */
public function getTextNodes()
|
||||
{
|
||||
return $this->expression->getTextNodes();
|
||||
}
|
||||
|
||||
public function __toString()
|
||||
{
|
||||
return sprintf('(%s IN %s)', $this->expression, $this->field);
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
{
|
||||
// In expressions are never full-text
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,15 +2,14 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
abstract class Node
|
||||
{
|
||||
/**
|
||||
* @return array The Elasticsearch formatted query
|
||||
*/
|
||||
abstract public function getQuery();
|
||||
abstract public function buildQuery(QueryContext $context);
|
||||
|
||||
/**
|
||||
* @return bool Whether the node and its children are full-text queries only
|
||||
*/
|
||||
abstract public function isFullTextOnly();
|
||||
abstract public function getTextNodes();
|
||||
}
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class NullQueryNode extends Node
|
||||
{
|
||||
public function getQuery()
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
return array('match_all' => array());
|
||||
}
|
||||
@@ -18,9 +20,4 @@ class NullQueryNode extends Node
|
||||
{
|
||||
return '<NULL>';
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,14 +2,16 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class OrExpression extends BinaryOperator
|
||||
{
|
||||
protected $operator = 'OR';
|
||||
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
$left = $this->left->getQuery($fields);
|
||||
$right = $this->right->getQuery($fields);
|
||||
$left = $this->left->buildQuery($context);
|
||||
$right = $this->right->buildQuery($context);
|
||||
|
||||
return array(
|
||||
'bool' => array(
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class PrefixNode extends Node
|
||||
{
|
||||
protected $prefix;
|
||||
@@ -11,11 +13,11 @@ class PrefixNode extends Node
|
||||
$this->prefix = $prefix;
|
||||
}
|
||||
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
return array(
|
||||
'multi_match' => array(
|
||||
'fields' => $fields,
|
||||
'fields' => $context->getLocalizedFields(),
|
||||
'query' => $this->prefix,
|
||||
'type' => 'phrase_prefix'
|
||||
)
|
||||
@@ -26,9 +28,4 @@ class PrefixNode extends Node
|
||||
{
|
||||
return sprintf('prefix("%s")', $this->prefix);
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,22 +2,19 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
|
||||
class QuotedTextNode extends TextNode
|
||||
{
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
return array(
|
||||
'multi_match' => array(
|
||||
'type' => 'phrase',
|
||||
'fields' => $fields,
|
||||
'fields' => $context->getLocalizedFields(),
|
||||
'query' => $this->text,
|
||||
// 'operator' => 'and'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,23 +2,49 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
|
||||
|
||||
class TextNode extends Node
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
|
||||
|
||||
class TextNode extends Node implements TermInterface
|
||||
{
|
||||
protected $text;
|
||||
protected $concepts = array();
|
||||
|
||||
public function __construct($text)
|
||||
{
|
||||
$this->text = $text;
|
||||
}
|
||||
|
||||
public function getQuery($fields = ['_all'])
|
||||
public function setConcepts(array $concepts)
|
||||
{
|
||||
return array(
|
||||
$this->concepts = $concepts;
|
||||
}
|
||||
|
||||
public function buildQuery(QueryContext $context)
|
||||
{
|
||||
$query = array(
|
||||
'multi_match' => array(
|
||||
'fields' => $fields,
|
||||
'fields' => $context->getLocalizedFields(),
|
||||
'query' => $this->text,
|
||||
)
|
||||
);
|
||||
|
||||
if ($this->concepts) {
|
||||
$shoulds = array($query);
|
||||
foreach (Concept::pruneNarrowConcepts($this->concepts) as $concept) {
|
||||
$shoulds[]['term']['concept_paths'] = $concept->getPath();
|
||||
}
|
||||
$query = array();
|
||||
$query['bool']['should'] = $shoulds;
|
||||
}
|
||||
|
||||
return $query;
|
||||
}
|
||||
|
||||
/**
 * Returns the text nodes contained in this node.
 *
 * A text node is a leaf, so the result is a one-element array holding
 * this node itself.
 *
 * @return array
 */
public function getTextNodes()
|
||||
{
|
||||
return array($this);
|
||||
}
|
||||
|
||||
public function __toString()
|
||||
@@ -26,13 +52,22 @@ class TextNode extends Node
|
||||
return sprintf('"%s"', $this->text);
|
||||
}
|
||||
|
||||
public function isFullTextOnly()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
public function getText()
|
||||
// Implementation of TermInterface
|
||||
|
||||
public function getValue()
|
||||
{
|
||||
return $this->text;
|
||||
}
|
||||
|
||||
/**
 * Tells whether this term carries a context.
 *
 * Always false for now: getContext() on this class returns null.
 *
 * @return bool
 */
public function hasContext()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Returns the context of this term.
 *
 * Currently always null; see the TODO below.
 *
 * @return null
 */
public function getContext()
|
||||
{
|
||||
// TODO Insert context during parsing
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
|
||||
use Alchemy\Phrasea\SearchEngine\SearchEngineInterface;
|
||||
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
|
||||
use Alchemy\Phrasea\SearchEngine\SearchEngineResult;
|
||||
@@ -278,72 +280,27 @@ class ElasticSearchEngine implements SearchEngineInterface
|
||||
|
||||
$searchQuery = $this->app['query_parser']->parse($string);
|
||||
|
||||
// Contains the full thesaurus paths to search on
|
||||
$pathsToFilter = [];
|
||||
// Contains the thesaurus values by fields (synonyms, translations, etc)
|
||||
$collectFields = [];
|
||||
$query['_ast'] = $searchQuery->dump();
|
||||
|
||||
// Only search in thesaurus for full text search
|
||||
if ($searchQuery->isFullTextOnly()) {
|
||||
$termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']);
|
||||
$termsQuery = $searchQuery->getElasticsearchQuery($termFields);
|
||||
|
||||
$params = $this->createTermQueryParams($termsQuery, $options);
|
||||
$terms = $this->doExecute('search', $params);
|
||||
|
||||
foreach ($terms['hits']['hits'] as $term) {
|
||||
// Skip paths with very low score
|
||||
if ($term['_score'] < 1) {
|
||||
continue;
|
||||
$thesaurus = $this->app['thesaurus'];
|
||||
$textNodes = $searchQuery->getTextNodes();
|
||||
$concepts = $thesaurus->findConceptsBulk($textNodes);
|
||||
|
||||
foreach ($concepts as $index => $termConcepts) {
|
||||
$node = $textNodes[$index];
|
||||
$node->setConcepts($termConcepts);
|
||||
$term = Term::dump($node);
|
||||
$query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts);
|
||||
}
|
||||
|
||||
$pathsToFilter[$term['_source']['path']] = $term['_score'];
|
||||
$recordHelper = $this->app['elasticsearch.record_helper'];
|
||||
// TODO Pass options to getFields to include/exclude private fields
|
||||
$searchableFields = $recordHelper->getFields();
|
||||
$queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']);
|
||||
$recordQuery = $searchQuery->build($queryContext);
|
||||
|
||||
foreach ($term['_source']['fields'] as $field) {
|
||||
$collectFields['caption.'.$field][] = $term['_source']['value'];
|
||||
}
|
||||
}
|
||||
$pathsToFilter = array_unique($pathsToFilter);
|
||||
}
|
||||
|
||||
if (empty($collectFields)) {
|
||||
// @todo a list of field by default? all fields?
|
||||
$searchFieldNames = ['caption.*'];
|
||||
} else {
|
||||
$searchFieldNames = array_keys($collectFields);
|
||||
}
|
||||
|
||||
$recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']);
|
||||
|
||||
$recordQuery = [
|
||||
'bool' => [
|
||||
'should' => [
|
||||
$searchQuery->getElasticsearchQuery($recordFields)
|
||||
]
|
||||
]
|
||||
];
|
||||
|
||||
foreach ($pathsToFilter as $path => $score) {
|
||||
// Also match incomplete path. /a/b/c will return /a/b/c/d records
|
||||
$recordQuery['bool']['should'][] = [
|
||||
'match' => [
|
||||
'concept_paths' => array(
|
||||
'query' => $path,
|
||||
'boost' => $score,
|
||||
)
|
||||
]
|
||||
];
|
||||
|
||||
// Add signal for exact path only
|
||||
$recordQuery['bool']['should'][] = [
|
||||
'term' => [
|
||||
'concept_paths.raw' => array(
|
||||
'value' => $path,
|
||||
'boost' => $score,
|
||||
)
|
||||
]
|
||||
];
|
||||
}
|
||||
|
||||
$params = $this->createRecordQueryParams($recordQuery, $options, null);
|
||||
$params['body']['from'] = $offset;
|
||||
@@ -378,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface
|
||||
$results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++);
|
||||
}
|
||||
|
||||
$query['_searchable_fields'] = $searchableFields;
|
||||
$query['_ast'] = $searchQuery->dump();
|
||||
$query['_paths'] = $pathsToFilter;
|
||||
$query['_richFields'] = $collectFields;
|
||||
$query['query'] = json_encode($params);
|
||||
// $query['_paths'] = $pathsToFilter;
|
||||
// $query['_richFields'] = $collectFields;
|
||||
|
||||
$queryyy = $recordQuery;
|
||||
// $queryyy = $params['body'];
|
||||
$query['query'] = $queryyy;
|
||||
$query['query_as_string'] = json_encode($queryyy);
|
||||
|
||||
return new SearchEngineResult($results, json_encode($query), $res['took'], $offset,
|
||||
$res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [],
|
||||
@@ -572,36 +534,6 @@ class ElasticSearchEngine implements SearchEngineInterface
|
||||
return $res;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param array|string $fields
|
||||
* @param array|null $locales
|
||||
* @param null $currentLocale
|
||||
* @return array
|
||||
*/
|
||||
public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null)
|
||||
{
|
||||
$fieldsExpended = [];
|
||||
|
||||
if (!$locales) {
|
||||
$locales = $this->locales;
|
||||
}
|
||||
|
||||
foreach ((array) $fields as $field) {
|
||||
foreach ($locales as $locale) {
|
||||
$boost = "";
|
||||
|
||||
if ($locale === $currentLocale) {
|
||||
$boost = "^5";
|
||||
}
|
||||
|
||||
$fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost);
|
||||
}
|
||||
$fieldsExpended[] = sprintf('%s.%s', $field, 'light^10');
|
||||
}
|
||||
|
||||
return $fieldsExpended;
|
||||
}
|
||||
|
||||
private function getFlagsKey(\appbox $appbox)
|
||||
{
|
||||
$flags = [];
|
||||
|
||||
@@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
|
||||
use media_subdef;
|
||||
|
||||
class RecordIndexer
|
||||
{
|
||||
const TYPE_NAME = 'record';
|
||||
|
||||
private $helper;
|
||||
|
||||
private $thesaurus;
|
||||
|
||||
/**
|
||||
* @var \appbox
|
||||
*/
|
||||
@@ -42,10 +45,9 @@ class RecordIndexer
|
||||
*/
|
||||
private $locales;
|
||||
|
||||
private $dataStructure;
|
||||
|
||||
public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
|
||||
public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
|
||||
{
|
||||
$this->helper = $helper;
|
||||
$this->thesaurus = $thesaurus;
|
||||
$this->appbox = $appbox;
|
||||
$this->elasticSearchEngine = $elasticSearchEngine;
|
||||
@@ -54,11 +56,8 @@ class RecordIndexer
|
||||
|
||||
public function populateIndex(BulkOperation $bulk)
|
||||
{
|
||||
// Helper to fetch record related data
|
||||
$recordHelper = new RecordHelper($this->appbox);
|
||||
|
||||
foreach ($this->appbox->get_databoxes() as $databox) {
|
||||
$fetcher = new RecordFetcher($databox, $recordHelper);
|
||||
$fetcher = new RecordFetcher($databox, $this->helper);
|
||||
$fetcher->setBatchSize(200);
|
||||
while ($records = $fetcher->fetch()) {
|
||||
foreach ($records as $record) {
|
||||
@@ -74,9 +73,7 @@ class RecordIndexer
|
||||
|
||||
public function indexSingleRecord(\record_adapter $record_adapter, $indexName)
|
||||
{
|
||||
// Helper to fetch record related data
|
||||
$recordHelper = new RecordHelper($this->appbox);
|
||||
$fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper);
|
||||
$fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper);
|
||||
$record = $fetcher->fetchOne($record_adapter);
|
||||
|
||||
$params = array();
|
||||
@@ -140,13 +137,13 @@ class RecordIndexer
|
||||
->add('record_id', 'integer') // Compound primary key
|
||||
->add('databox_id', 'integer') // Compound primary key
|
||||
->add('base_id', 'integer') // Unique collection ID
|
||||
->add('collection_id', 'integer') // Useless collection ID (local to databox)
|
||||
->add('collection_name', 'string')->notAnalyzed() // Collection name
|
||||
->add('uuid', 'string')->notAnalyzed()
|
||||
->add('sha256', 'string')->notAnalyzed()
|
||||
->add('collection_id', 'integer')->notIndexed() // Useless collection ID (local to databox)
|
||||
->add('collection_name', 'string')->notIndexed() // Collection name
|
||||
->add('uuid', 'string')->notIndexed()
|
||||
->add('sha256', 'string')->notIndexed()
|
||||
// Mandatory metadata
|
||||
->add('original_name', 'string')->notAnalyzed()
|
||||
->add('mime', 'string')->notAnalyzed()
|
||||
->add('original_name', 'string')->notIndexed()
|
||||
->add('mime', 'string')->notIndexed()
|
||||
->add('type', 'string')->notAnalyzed()
|
||||
->add('record_type', 'string')->notAnalyzed() // record or story
|
||||
// Dates
|
||||
@@ -157,29 +154,15 @@ class RecordIndexer
|
||||
->analyzer('thesaurus_path', 'indexing')
|
||||
->analyzer('keyword', 'searching')
|
||||
->addRawVersion()
|
||||
// EXIF
|
||||
->add('exif', $this->getExifMapping())
|
||||
// Status
|
||||
->add('flags', $this->getFlagsMapping())
|
||||
// Keep some fields around for display purposes
|
||||
->add('subdefs', Mapping::disabledMapping())
|
||||
->add('title', Mapping::disabledMapping())
|
||||
;
|
||||
|
||||
// Index title
|
||||
$titleMapping = new Mapping();
|
||||
$titleMapping->add('default', 'string')->notAnalyzed()->notIndexed();
|
||||
foreach ($this->locales as $locale) {
|
||||
$titleMapping->add($locale, 'string')->notAnalyzed()->notIndexed();
|
||||
}
|
||||
$mapping->add('title', $titleMapping);
|
||||
|
||||
// Minimal subdefs mapping info for display purpose
|
||||
$subdefMapping = new Mapping();
|
||||
$subdefMapping->add('path', 'string')->notAnalyzed()->notIndexed();
|
||||
$subdefMapping->add('height', 'integer')->notIndexed();
|
||||
$subdefMapping->add('width', 'integer')->notIndexed();
|
||||
|
||||
$subdefsMapping = new Mapping();
|
||||
$subdefsMapping->add('thumbnail', $subdefMapping);
|
||||
$subdefsMapping->add('thumbnailgif', $subdefMapping);
|
||||
$subdefsMapping->add('preview', $subdefMapping);
|
||||
|
||||
$mapping->add('subdefs', $subdefsMapping);
|
||||
|
||||
// Caption mapping
|
||||
$captionMapping = new Mapping();
|
||||
$mapping->add('caption', $captionMapping);
|
||||
@@ -194,9 +177,9 @@ class RecordIndexer
|
||||
}
|
||||
|
||||
if ($params['type'] === Mapping::TYPE_STRING) {
|
||||
if (!$params['indexable'] && !$params['to_aggregate']) {
|
||||
if (!$params['searchable'] && !$params['to_aggregate']) {
|
||||
$m->notIndexed();
|
||||
} elseif (!$params['indexable'] && $params['to_aggregate']) {
|
||||
} elseif (!$params['searchable'] && $params['to_aggregate']) {
|
||||
$m->notAnalyzed();
|
||||
$m->addRawVersion();
|
||||
} else {
|
||||
@@ -206,89 +189,13 @@ class RecordIndexer
|
||||
}
|
||||
}
|
||||
|
||||
// EXIF
|
||||
$mapping->add('exif', $this->getExifMapping());
|
||||
|
||||
// Status
|
||||
$mapping->add('flags', $this->getFlagsMapping());
|
||||
|
||||
return $mapping->export();
|
||||
}
|
||||
|
||||
|
||||
private function getFieldsStructure()
|
||||
{
|
||||
if (!empty($this->dataStructure)) {
|
||||
return $this->dataStructure;
|
||||
}
|
||||
|
||||
$fields = array();
|
||||
|
||||
foreach ($this->appbox->get_databoxes() as $databox) {
|
||||
//printf("Databox %d\n", $databox->get_sbas_id());
|
||||
foreach ($databox->get_meta_structure() as $fieldStructure) {
|
||||
$field = array();
|
||||
// Field type
|
||||
switch ($fieldStructure->get_type()) {
|
||||
case \databox_field::TYPE_DATE:
|
||||
$field['type'] = 'date';
|
||||
break;
|
||||
case \databox_field::TYPE_NUMBER:
|
||||
$field['type'] = 'double';
|
||||
break;
|
||||
case \databox_field::TYPE_STRING:
|
||||
case \databox_field::TYPE_TEXT:
|
||||
$field['type'] = 'string';
|
||||
break;
|
||||
default:
|
||||
throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
|
||||
break;
|
||||
}
|
||||
|
||||
$name = $fieldStructure->get_name();
|
||||
|
||||
// Business rules
|
||||
$field['private'] = $fieldStructure->isBusiness();
|
||||
$field['indexable'] = $fieldStructure->is_indexable();
|
||||
$field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
|
||||
|
||||
// Thesaurus concept inference
|
||||
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
|
||||
$helper = new ThesaurusHelper();
|
||||
|
||||
// TODO Not the real option yet
|
||||
$field['thesaurus_concept_inference'] = $field['type'] === 'string';
|
||||
// TODO Find thesaurus path prefixes
|
||||
$field['thesaurus_prefix'] = '/categories';
|
||||
|
||||
//printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);
|
||||
|
||||
// Since mapping is merged between databoxes, two fields may
|
||||
// have conflicting names. Indexing is the same for a given
|
||||
// type so we reject only those with different types.
|
||||
if (isset($fields[$name])) {
|
||||
if ($fields[$name]['type'] !== $field['type']) {
|
||||
throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
|
||||
}
|
||||
|
||||
if ($fields[$name]['indexable'] !== $field['indexable']) {
|
||||
throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name));
|
||||
}
|
||||
|
||||
if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
|
||||
throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
|
||||
}
|
||||
// TODO other structure incompatibilities
|
||||
|
||||
//printf("Merged with previous \"%s\" field\n", $name);
|
||||
}
|
||||
|
||||
$fields[$name] = $field;
|
||||
}
|
||||
}
|
||||
|
||||
$this->dataStructure = $fields;
|
||||
return $this->dataStructure;
|
||||
return $this->helper->getFieldsStructure();
|
||||
}
|
||||
|
||||
// @todo Add call to addAnalyzedVersion ?
|
||||
@@ -374,7 +281,7 @@ class RecordIndexer
|
||||
}
|
||||
}
|
||||
|
||||
$record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
|
||||
// $record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
|
||||
|
||||
return $record;
|
||||
}
|
||||
|
||||
@@ -43,37 +43,26 @@ class TermIndexer
|
||||
/** @var databox $databox */
|
||||
$databoxId = $databox->get_sbas_id();
|
||||
|
||||
$document = self::thesaurusFromDatabox($databox);
|
||||
$dedicatedFieldTerms = $this->getDedicatedFieldTerms($databox, $document);
|
||||
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
|
||||
// Path and id are prefixed with a databox identifier so that they do not
|
||||
// collide with terms from other databoxes
|
||||
|
||||
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId, $dedicatedFieldTerms) {
|
||||
//printf("- %s (%s)\n", $term['path'], $term['value']);
|
||||
// Term structure
|
||||
$id = $term['id'];
|
||||
$id = sprintf('%s_%s', $databoxId, $term['id']);
|
||||
unset($term['id']);
|
||||
|
||||
$term['path'] = sprintf('/%s%s', $databoxId, $term['path']);
|
||||
$term['databox_id'] = $databoxId;
|
||||
$term['branch_id'] = $id;
|
||||
|
||||
// @todo move to the TermVisitor? dunno.
|
||||
$term['fields'] = null;
|
||||
foreach ($dedicatedFieldTerms as $partialId => $fields) {
|
||||
if (strpos($id, $partialId) === 0) {
|
||||
foreach ($fields as $field) {
|
||||
$term['fields'][] = $field;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Index request
|
||||
$params = array();
|
||||
$params['id'] = sprintf('%s_%s', $databoxId, $id);
|
||||
$params['id'] = $id;
|
||||
$params['type'] = self::TYPE_NAME;
|
||||
$params['body'] = $term;
|
||||
|
||||
$bulk->index($params);
|
||||
});
|
||||
|
||||
$document = self::thesaurusFromDatabox($databox);
|
||||
$this->navigator->walk($document, $visitor);
|
||||
}
|
||||
}
|
||||
@@ -88,38 +77,20 @@ class TermIndexer
|
||||
return $dom;
|
||||
}
|
||||
|
||||
private function getDedicatedFieldTerms(databox $databox, DOMDocument $document)
|
||||
{
|
||||
$xpath = new \DOMXpath($document);
|
||||
$dedicatedFieldTerms = [];
|
||||
|
||||
foreach ($databox->get_meta_structure() as $f) {
|
||||
if ($f->get_tbranch()) {
|
||||
$elements = $xpath->query($f->get_tbranch());
|
||||
|
||||
if ($elements) {
|
||||
foreach ($elements as $element) {
|
||||
$dedicatedFieldTerms[$element->getAttribute('id')][] = $f->get_name();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $dedicatedFieldTerms;
|
||||
}
|
||||
|
||||
public function getMapping()
|
||||
{
|
||||
$mapping = new Mapping();
|
||||
$mapping
|
||||
->add('raw_value', 'string')->notAnalyzed()
|
||||
->add('value', 'string')->addAnalyzedVersion($this->locales)
|
||||
->add('context', 'string')->addAnalyzedVersion($this->locales)
|
||||
->add('value', 'string')
|
||||
->analyzer('general_light')
|
||||
->addLocalizedSubfields($this->locales)
|
||||
->add('context', 'string')
|
||||
->analyzer('general_light')
|
||||
->addLocalizedSubfields($this->locales)
|
||||
->add('path', 'string')->notAnalyzed()
|
||||
->add('lang', 'string')->notAnalyzed()
|
||||
->add('branch_id', 'string')->notAnalyzed()
|
||||
->add('databox_id', 'integer')
|
||||
->add('fields', 'string')->notAnalyzed()
|
||||
;
|
||||
|
||||
return $mapping->export();
|
||||
|
||||
@@ -18,6 +18,7 @@ class Mapping
|
||||
{
|
||||
private $fields = array();
|
||||
private $current;
|
||||
private $enabled = true;
|
||||
|
||||
const DATE_FORMAT_MYSQL = 'yyyy-MM-dd HH:mm:ss';
|
||||
const DATE_FORMAT_CAPTION = 'yyyy/MM/dd'; // ES format
|
||||
@@ -54,7 +55,7 @@ class Mapping
|
||||
$field = array();
|
||||
if ($type instanceof self) {
|
||||
$field['type'] = self::TYPE_OBJECT;
|
||||
$field['properties'] = $type;
|
||||
$field['mapping'] = $type;
|
||||
}
|
||||
elseif (in_array($type, self::$types)) {
|
||||
$field['type'] = $type;
|
||||
@@ -74,20 +75,19 @@ class Mapping
|
||||
|
||||
public function export()
|
||||
{
|
||||
return ['properties' => $this->exportProperties()];
|
||||
}
|
||||
|
||||
public function exportProperties()
|
||||
{
|
||||
$properties = array();
|
||||
$mapping = array();
|
||||
foreach ($this->fields as $name => $field) {
|
||||
$properties[$name] = $field;
|
||||
if ($field['type'] === self::TYPE_OBJECT) {
|
||||
$properties[$name]['properties'] = $field['properties']->exportProperties();
|
||||
$field = $field['mapping']->export();
|
||||
}
|
||||
$mapping['properties'][$name] = $field;
|
||||
}
|
||||
|
||||
return $properties;
|
||||
if (!$this->enabled) {
|
||||
$mapping['enabled'] = false;
|
||||
}
|
||||
|
||||
return $mapping;
|
||||
}
|
||||
|
||||
public function analyzer($analyzer, $type = null)
|
||||
@@ -134,6 +134,23 @@ class Mapping
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
 * Creates a fresh mapping and switches its "enabled" flag off.
 *
 * @return self
 */
public static function disabledMapping()
{
    $mapping = new self();

    return $mapping->disable();
}

/**
 * Allows to disable parsing and indexing a named object completely.
 * This is handy when a portion of the JSON document contains arbitrary JSON
 * which should not be indexed, nor added to the mapping.
 *
 * @return $this
 */
private function disable()
{
    $this->enabled = false;

    return $this;
}
|
||||
|
||||
public function addRawVersion()
|
||||
{
|
||||
$field = &$this->currentField();
|
||||
@@ -146,22 +163,30 @@ class Mapping
|
||||
return $this;
|
||||
}
|
||||
|
||||
public function addAnalyzedVersion(array $langs)
|
||||
/**
|
||||
* @deprecated
|
||||
*/
|
||||
public function addAnalyzedVersion(array $locales)
|
||||
{
|
||||
$field = &$this->currentField();
|
||||
|
||||
foreach ($langs as $lang) {
|
||||
$field['fields'][$lang] = [
|
||||
'type' => $field['type'],
|
||||
'analyzer' => sprintf('%s_full', $lang)
|
||||
];
|
||||
}
|
||||
|
||||
$field['fields']['light'] = [
|
||||
'type' => $field['type'],
|
||||
'analyzer' => 'general_light'
|
||||
];
|
||||
|
||||
return $this->addLocalizedSubfields($locales);
|
||||
}
|
||||
|
||||
/**
 * Adds one sub-field per locale to the current field.
 *
 * Each sub-field reuses the type of the current field and is bound to the
 * "<locale>_full" analyzer.
 *
 * @param array $locales Locale codes used both as sub-field names and analyzer prefixes
 * @return $this
 */
public function addLocalizedSubfields(array $locales)
{
    // Work on the field by reference so sub-fields land in the mapping itself
    $current = &$this->currentField();

    foreach ($locales as $code) {
        $current['fields'][$code] = array(
            'type'     => $current['type'],
            'analyzer' => sprintf('%s_full', $code),
        );
    }

    return $this;
}
|
||||
|
||||
|
||||
@@ -11,18 +11,21 @@
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
|
||||
use appbox;
|
||||
use igorw;
|
||||
|
||||
class RecordHelper
|
||||
{
|
||||
private $connection;
|
||||
private $appbox;
|
||||
|
||||
// Computation caches
|
||||
private $collectionMap;
|
||||
private $fieldStructure;
|
||||
|
||||
public function __construct(appbox $appbox)
|
||||
{
|
||||
$this->connection = $appbox->get_connection();
|
||||
$this->appbox = $appbox;
|
||||
}
|
||||
|
||||
public function getUniqueRecordId($databoxId, $recordId)
|
||||
@@ -46,12 +49,13 @@ class RecordHelper
|
||||
private function collectionMap()
|
||||
{
|
||||
if (!$this->collectionMap) {
|
||||
$connection = $this->appbox->get_connection();
|
||||
$sql = 'SELECT
|
||||
sbas_id as databox_id,
|
||||
server_coll_id as collection_id,
|
||||
base_id
|
||||
FROM bas';
|
||||
$statement = $this->connection->query($sql);
|
||||
$statement = $connection->query($sql);
|
||||
|
||||
$map = array();
|
||||
while ($mapping = $statement->fetch()) {
|
||||
@@ -68,4 +72,97 @@ class RecordHelper
|
||||
{
|
||||
return StringUtils::slugify($key, '_');
|
||||
}
|
||||
|
||||
/**
 * Lists the names of the fields found in the merged field structure.
 *
 * @param bool $includePrivate When false, fields flagged "private" are left out
 * @param bool $onlySearchable When true, fields not flagged "searchable" are left out
 * @return array List of field names
 */
public function getFields($includePrivate = false, $onlySearchable = true)
{
    $names = array();

    foreach ($this->getFieldsStructure() as $name => $options) {
        $visible = !$options['private'] || $includePrivate;

        if ($visible && (!$onlySearchable || $options['searchable'])) {
            $names[] = $name;
        }
    }

    return $names;
}
|
||||
|
||||
/**
 * Returns the field structure merged across every databox of the appbox.
 *
 * The result is keyed by field name; each entry holds the keys "type",
 * "private", "searchable", "to_aggregate", "thesaurus_concept_inference"
 * and "thesaurus_prefix". It is computed on first call and then served from
 * the $fieldStructure cache property.
 *
 * @return array
 * @throws Exception      When a field has a type other than date, number, string or text
 * @throws MergeException When two databoxes define the same field name with a
 *                        different type, searchable state or to_aggregate state
 */
public function getFieldsStructure()
{
    // Use the property the class actually declares; the previous code wrote
    // to an undeclared "$fieldsStructure" dynamic property instead.
    if (null !== $this->fieldStructure) {
        return $this->fieldStructure;
    }

    $fields = array();

    foreach ($this->appbox->get_databoxes() as $databox) {
        foreach ($databox->get_meta_structure() as $fieldStructure) {
            $field = array();

            // Field type
            // NOTE(review): Exception and MergeException are not part of the
            // visible "use" block, so they resolve inside the current
            // namespace - confirm these classes exist there.
            switch ($fieldStructure->get_type()) {
                case \databox_field::TYPE_DATE:
                    $field['type'] = 'date';
                    break;
                case \databox_field::TYPE_NUMBER:
                    $field['type'] = 'double';
                    break;
                case \databox_field::TYPE_STRING:
                case \databox_field::TYPE_TEXT:
                    $field['type'] = 'string';
                    break;
                default:
                    throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
            }

            $name = $fieldStructure->get_name();

            // Business rules
            $field['private'] = $fieldStructure->isBusiness();
            $field['searchable'] = $fieldStructure->is_indexable();
            $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();

            // Thesaurus concept inference
            // TODO Not the real option yet
            $field['thesaurus_concept_inference'] = $field['type'] === 'string';
            // TODO Find thesaurus path prefixes
            $field['thesaurus_prefix'] = '/categories';

            // Since mapping is merged between databoxes, two fields may
            // have conflicting names. Indexing is the same for a given
            // type so we reject only those with different types.
            if (isset($fields[$name])) {
                if ($fields[$name]['type'] !== $field['type']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
                }

                if ($fields[$name]['searchable'] !== $field['searchable']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible searchable state", $name));
                }

                if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
                }
                // TODO other structure incompatibilities
            }

            $fields[$name] = $field;
        }
    }

    $this->fieldStructure = $fields;

    return $this->fieldStructure;
}
|
||||
}
|
||||
|
||||
@@ -18,19 +18,14 @@ class Query
|
||||
$this->root = $root;
|
||||
}
|
||||
|
||||
/*
|
||||
* This method seems weird to me, the implementation returns true when the
|
||||
* query doesn't contain IN statements, but that doesn't define a full text
|
||||
* search.
|
||||
*/
|
||||
public function isFullTextOnly()
|
||||
public function getTextNodes()
|
||||
{
|
||||
return $this->root->isFullTextOnly();
|
||||
return $this->root->getTextNodes();
|
||||
}
|
||||
|
||||
public function getElasticsearchQuery($fields = array())
|
||||
public function build(QueryContext $context)
|
||||
{
|
||||
return $this->root->getQuery($fields);
|
||||
return $this->root->buildQuery($context);
|
||||
}
|
||||
|
||||
public function dump()
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Search;
|
||||
|
||||
/**
 * Carries the field and locale restrictions applied while building a query.
 */
class QueryContext
{
    /** @var array Field names the query may target */
    private $fields;
    /** @var array Locales for which localized sub-fields are generated */
    private $locales;
    /** @var mixed Locale of the query itself; its sub-fields receive a boost */
    private $queryLocale;

    /**
     * @param array $fields      Field names the query may target
     * @param array $locales     Locales to expand each field into
     * @param mixed $queryLocale Locale compared strictly against each entry of $locales
     */
    public function __construct(array $fields, array $locales, $queryLocale)
    {
        $this->queryLocale = $queryLocale;
        $this->locales = $locales;
        $this->fields = $fields;
    }

    /**
     * Returns a new context limited to the given fields.
     *
     * Only fields already allowed by this context are kept, so narrowing
     * can never widen the original restriction.
     *
     * @param array $fields
     * @return static
     */
    public function narrowToFields(array $fields)
    {
        $allowed = array_intersect($this->fields, $fields);

        return new static($allowed, $this->locales, $this->queryLocale);
    }

    /**
     * Expands every field into its caption sub-field names.
     *
     * For each field, one "caption.<field>.<locale>" entry is produced per
     * locale (suffixed with "^5" for the query locale), followed by the
     * "caption.<field>.light^10" entry.
     *
     * @return array List of field specifications
     */
    public function getLocalizedFields()
    {
        $localized = array();

        foreach ($this->fields as $name) {
            foreach ($this->locales as $locale) {
                $boost = '';
                if ($locale === $this->queryLocale) {
                    $boost = '^5';
                }
                $localized[] = sprintf('caption.%s.%s%s', $name, $locale, $boost);
            }
            // TODO Put generic analyzers on main field instead of "light" sub-field
            $localized[] = sprintf('caption.%s.%s', $name, 'light^10');
        }

        return $localized;
    }
}
|
||||
@@ -119,7 +119,7 @@ class QueryVisitor implements Visit
|
||||
if ($root instanceof AST\TextNode &&
|
||||
!$root instanceof AST\QuotedTextNode &&
|
||||
!$node instanceof AST\QuotedTextNode) {
|
||||
$root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText()));
|
||||
$root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue()));
|
||||
} else {
|
||||
$root = new AST\AndExpression($root, $node);
|
||||
}
|
||||
|
||||
@@ -12,6 +12,9 @@
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
|
||||
use Elasticsearch\Client;
|
||||
|
||||
class Thesaurus
|
||||
@@ -19,34 +22,78 @@ class Thesaurus
|
||||
private $client;
|
||||
private $index;
|
||||
|
||||
const MIN_SCORE = 6;
|
||||
|
||||
/**
 * @param Client $client Elasticsearch client used to run the term searches
 * @param mixed  $index  Value sent as the "index" parameter of search requests
 */
public function __construct(Client $client, $index)
{
    $this->index = $index;
    $this->client = $client;
}
|
||||
|
||||
public function findConcepts($term, $context = null, $lang = null)
|
||||
/**
 * Runs findConcepts() for each term and collects the results in order.
 *
 * @param array $terms Terms to look up, each passed as-is to findConcepts()
 * @param mixed $lang  Forwarded unchanged to findConcepts()
 * @return array One findConcepts() result per input term, re-indexed from 0
 */
public function findConceptsBulk(array $terms, $lang = null)
{
    // TODO Use bulk queries for performance
    $results = array();

    foreach ($terms as $item) {
        $found = $this->findConcepts($item, $lang);
        $results[] = $found;
    }

    return $results;
}
|
||||
|
||||
public function findConcepts($term, $lang = null)
|
||||
{
|
||||
if (!($term instanceof TermInterface)) {
|
||||
$term = new Term($term);
|
||||
}
|
||||
|
||||
// TODO Check that term queries are ok with multiple words
|
||||
$query = array();
|
||||
$query['term']['value'] = $term;
|
||||
$field = $lang ? sprintf('value.%s', $lang) : 'value.light';
|
||||
$query['match'][$field]['query'] = $term->getValue();
|
||||
$query['match'][$field]['operator'] = 'and';
|
||||
// Allow 25% of non-matching tokens
|
||||
// (not exactly the same that 75% of matching tokens)
|
||||
// $query['match'][$field]['minimum_should_match'] = '-25%';
|
||||
|
||||
if ($context) {
|
||||
if ($term->hasContext()) {
|
||||
$term_query = $query;
|
||||
$query = array();
|
||||
$query['bool']['must'][0] = $term_query;
|
||||
$query['bool']['must'][1]['term']['context'] = $context;
|
||||
$query['bool']['must'][1]['term']['context'] = $term->getContext();
|
||||
}
|
||||
|
||||
if ($lang) {
|
||||
$term_query = $query;
|
||||
$query = array();
|
||||
$query['filtered']['query'] = $term_query;
|
||||
$query['filtered']['filter']['term']['lang'] = $lang;
|
||||
}
|
||||
|
||||
// TODO Only search in a specific databox
|
||||
// $term_query = $query;
|
||||
// $query = array();
|
||||
// $query['filtered']['query'] = $term_query;
|
||||
// $query['filtered']['filter']['term']['databox_id'] = $databox_id;
|
||||
|
||||
// Path deduplication
|
||||
$aggs = array();
|
||||
$aggs['dedup']['terms']['field'] = 'path';
|
||||
|
||||
// Search request
|
||||
$params = array();
|
||||
$params['index'] = $this->index;
|
||||
$params['type'] = TermIndexer::TYPE_NAME;
|
||||
$params['body']['query'] = $query;
|
||||
$params['body']['aggs'] = $aggs;
|
||||
// Arbitrary score low limit, we need find a more granular way to remove
|
||||
// inexact concepts.
|
||||
// We also need to disable TF/IDF on terms, and try to boost score only
|
||||
// when the search match nearly all tokens of term's value field.
|
||||
$params['body']['min_score'] = self::MIN_SCORE;
|
||||
// No need to get any hits since we extract data from aggs
|
||||
$params['body']['size'] = 0;
|
||||
|
||||
$response = $this->client->search($params);
|
||||
|
||||
// Extract concept paths from response
|
||||
@@ -54,7 +101,7 @@ class Thesaurus
|
||||
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []);
|
||||
foreach ($buckets as $bucket) {
|
||||
if (isset($bucket['key'])) {
|
||||
$concepts[] = $bucket['key'];
|
||||
$concepts[] = new Concept($bucket['key']);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
/**
 * A thesaurus concept identified by its slash-separated path.
 */
class Concept
{
    /** @var string */
    private $path;

    /**
     * @param mixed $path Concept path, cast to string
     */
    public function __construct($path)
    {
        $this->path = (string) $path;
    }

    /**
     * @return string
     */
    public function getPath()
    {
        return $this->path;
    }

    /**
     * Tells whether this concept is a descendant of the given one.
     *
     * @param Concept $other
     * @return bool True when this path starts with the other path followed by "/"
     */
    public function isNarrowerThan(Concept $other)
    {
        // A concept is the child of another if it begins with the other
        return 0 === strpos($this->getPath(), $other->getPath() . '/');
    }

    /**
     * @return string
     */
    public function __toString()
    {
        return $this->path;
    }

    /**
     * Replaces each concept of the array by its path, keeping the array keys.
     *
     * @param Concept[] $concepts
     * @return string[]
     */
    public static function toPathArray(array $concepts)
    {
        foreach ($concepts as $index => $concept) {
            $concepts[$index] = $concept->getPath();
        }

        return $concepts;
    }

    /**
     * Removes every concept that is narrower than another concept of the set.
     *
     * The previous implementation compared each concept only with the last
     * kept one after a key sort. A sibling such as "/a-b" or "/a b" sorts
     * between "/a" and "/a/b" ("-" and " " are lower than "/"), so "/a/b"
     * was wrongly kept. Each path is now checked against all of its possible
     * ancestors instead.
     *
     * @param Concept[] $concepts
     * @return Concept[] Remaining concepts, keyed by path and sorted by key
     */
    public static function pruneNarrowConcepts($concepts)
    {
        $paths = self::toPathArray($concepts);
        if (!$paths) {
            return array();
        }

        // Build a map with paths as keys (duplicated paths collapse here)
        $byPath = array_combine($paths, $concepts);
        // Keep the output ordered by path, as before
        ksort($byPath);

        $broad = array();
        foreach ($byPath as $path => $concept) {
            if (!self::hasBroaderPath((string) $path, $byPath)) {
                $broad[$path] = $concept;
            }
        }

        return $broad;
    }

    /**
     * Tells whether any ancestor of the path is a key of the given map.
     *
     * Ancestors are the prefixes ending right before each "/" of the path,
     * which mirrors the prefix test done by isNarrowerThan().
     *
     * @param string $path
     * @param array  $conceptsByPath Map keyed by concept path
     * @return bool
     */
    private static function hasBroaderPath($path, array $conceptsByPath)
    {
        $offset = 0;
        while (false !== ($position = strpos($path, '/', $offset))) {
            if (isset($conceptsByPath[substr($path, 0, $position)])) {
                return true;
            }
            $offset = $position + 1;
        }

        return false;
    }
}
|
||||
57
lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php
Normal file
57
lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php
Normal file
@@ -0,0 +1,57 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode;
|
||||
|
||||
/**
 * Plain value object holding a term value and an optional context.
 */
class Term implements TermInterface
{
    /** @var string */
    private $value;
    /** @var string|null Null when the term has no context */
    private $context;

    /**
     * @param mixed $value   Term value, cast to string
     * @param mixed $context Optional context; null, false and '' mean "no context"
     */
    public function __construct($value, $context = null)
    {
        $this->value = (string) $value;
        // Do not rely on truthiness here: the string '0' is falsy in PHP and
        // would silently be discarded although it is a real context.
        if (null !== $context && false !== $context && '' !== (string) $context) {
            $this->context = (string) $context;
        }
    }

    /**
     * @return string
     */
    public function getValue()
    {
        return $this->value;
    }

    /**
     * @return bool True when a context was given at construction
     */
    public function hasContext()
    {
        return $this->context !== null;
    }

    /**
     * @return string|null
     */
    public function getContext()
    {
        return $this->context;
    }

    /**
     * @return string Same representation as dump()
     */
    public function __toString()
    {
        return self::dump($this);
    }

    /**
     * Formats a term as "value (context)", or just "value" without context.
     *
     * @param TermInterface $term
     * @return string
     */
    public static function dump(TermInterface $term)
    {
        if ($term->hasContext()) {
            return sprintf('%s (%s)', $term->getValue(), $term->getContext());
        }

        return $term->getValue();
    }
}
|
||||
@@ -0,0 +1,19 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
/**
 * Contract of a term made of a value and an optional context.
 */
interface TermInterface
{
    /**
     * Returns the value of the term.
     */
    public function getValue();

    /**
     * Tells whether the term carries a context.
     */
    public function hasContext();

    /**
     * Returns the context of the term.
     */
    public function getContext();
}
|
||||
Reference in New Issue
Block a user