diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php
index 269f6700dd..ac5da01b6b 100644
--- a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php
+++ b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php
@@ -13,6 +13,8 @@ namespace Alchemy\Phrasea\Command\Thesaurus;
use Alchemy\Phrasea\Command\Command;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
@@ -30,12 +32,23 @@ class FindConceptsCommand extends Command
InputArgument::REQUIRED,
'Reverse search a term to infer concepts'
)
+ ->addArgument(
+ 'context',
+ InputArgument::OPTIONAL,
+ 'Restrict search to a specific term context'
+ )
->addOption(
'locale',
null,
InputOption::VALUE_REQUIRED,
'Specify input locale'
)
+ ->addOption(
+ 'broad',
+ null,
+ InputOption::VALUE_NONE,
+ 'Keep broad concepts (discards narrower concepts)'
+ )
->addOption(
'raw',
null,
@@ -48,16 +61,26 @@ class FindConceptsCommand extends Command
protected function doExecute(InputInterface $input, OutputInterface $output)
{
$term = $input->getArgument('term');
+ $context = $input->getArgument('context');
$raw = $input->getOption('raw');
if (!$raw) {
- $output->writeln(sprintf('Finding linked concepts: %s', $term));
+ $message = sprintf('Finding linked concepts: %s', $term);
+ if ($context) {
+ $message .= sprintf(' (with context %s)', $context);
+ }
+ $output->writeln($message);
$output->writeln(str_repeat('-', 20));
}
$thesaurus = $this->container['thesaurus'];
+ $term = new Term($term, $context);
$locale = $input->getOption('locale');
- $concepts = $thesaurus->findConcepts($term, null, $locale);
+ $concepts = $thesaurus->findConcepts($term, $locale);
+
+ if ($input->getOption('broad')) {
+ $concepts = Concept::pruneNarrowConcepts($concepts);
+ }
if (count($concepts)) {
$output->writeln($concepts);
diff --git a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php
index a3b428cc36..b0b34fcaf7 100644
--- a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php
+++ b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php
@@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
+use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine;
@@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
$app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) {
return new RecordIndexer(
+ $app['elasticsearch.record_helper'],
$app['thesaurus'],
$app['elasticsearch.engine'],
$app['phraseanet.appbox'],
@@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
);
});
+ $app['elasticsearch.record_helper'] = $app->share(function ($app) {
+ return new RecordHelper($app['phraseanet.appbox']);
+ });
+
$app['elasticsearch.client'] = $app->share(function($app) {
$options = $app['elasticsearch.options'];
$clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]];
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php
index 99e5579c64..3d890e1fa7 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php
@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class AndExpression extends BinaryOperator
{
protected $operator = 'AND';
- public function getQuery($fields = ['_all'])
+ public function buildQuery(QueryContext $context)
{
- $left = $this->left->getQuery($fields);
- $right = $this->right->getQuery($fields);
+ $left = $this->left->buildQuery($context);
+ $right = $this->right->buildQuery($context);
return array(
'bool' => array(
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php
index 54c41a4514..6876e16f8f 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php
@@ -19,9 +19,11 @@ abstract class BinaryOperator extends Node
return sprintf('(%s %s %s)', $this->left, $this->operator, $this->right);
}
- public function isFullTextOnly()
+ public function getTextNodes()
{
- return $this->left->isFullTextOnly()
- && $this->right->isFullTextOnly();
+ return array_merge(
+ $this->left->getTextNodes(),
+ $this->right->getTextNodes()
+ );
}
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php
index a1c812b444..2959ee4eec 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php
@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class ExceptExpression extends BinaryOperator
{
protected $operator = 'EXCEPT';
- public function getQuery($fields = ['_all'])
+ public function buildQuery(QueryContext $context)
{
- $left = $this->left->getQuery($fields);
- $right = $this->right->getQuery($fields);
+ $left = $this->left->buildQuery($context);
+ $right = $this->right->buildQuery($context);
return array(
'bool' => array(
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php
index f15de1b18e..5bd7aec253 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php
@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class FieldNode extends Node
{
protected $keyword;
@@ -16,18 +18,18 @@ class FieldNode extends Node
return $this->keyword;
}
- public function getQuery()
+ public function buildQuery(QueryContext $context)
{
throw new \LogicException("A keyword can't be converted to a query.");
}
- public function __toString()
+ public function getTextNodes()
{
- return sprintf('<%s>', $this->keyword);
+ throw new \LogicException("A keyword can't contain text nodes.");
}
- public function isFullTextOnly()
+ public function __toString()
{
- return false;
+ return sprintf('<%s>', $this->keyword);
}
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php
index fb8f8fad54..00f95afdc3 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php
@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class InExpression extends Node
{
protected $field;
@@ -13,19 +15,20 @@ class InExpression extends Node
$this->expression = $expression;
}
- public function getQuery()
+ public function buildQuery(QueryContext $context)
{
- return $this->expression->getQuery($this->field->getValue());
+ $fields = array($this->field->getValue());
+
+ return $this->expression->buildQuery($context->narrowToFields($fields));
+ }
+
+ public function getTextNodes()
+ {
+ return $this->expression->getTextNodes();
}
public function __toString()
{
return sprintf('(%s IN %s)', $this->expression, $this->field);
}
-
- public function isFullTextOnly()
- {
- // In expressions are never full-text
- return false;
- }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php
index b601f4188d..36467b9fb3 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php
@@ -2,15 +2,14 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
abstract class Node
{
/**
* @return array The Elasticsearch formatted query
*/
- abstract public function getQuery();
+ abstract public function buildQuery(QueryContext $context);
- /**
- * @return bool Tell if the node and it's child are full-text queries only
- */
- abstract public function isFullTextOnly();
+ abstract public function getTextNodes();
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php
index 54e294de86..00890f8c03 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php
@@ -2,9 +2,11 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class NullQueryNode extends Node
{
- public function getQuery()
+ public function buildQuery(QueryContext $context)
{
return array('match_all' => array());
}
@@ -18,9 +20,4 @@ class NullQueryNode extends Node
{
return '';
}
-
- public function isFullTextOnly()
- {
- return false;
- }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php
index 6a345688e5..61c58e2109 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php
@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class OrExpression extends BinaryOperator
{
protected $operator = 'OR';
- public function getQuery($fields = ['_all'])
+ public function buildQuery(QueryContext $context)
{
- $left = $this->left->getQuery($fields);
- $right = $this->right->getQuery($fields);
+ $left = $this->left->buildQuery($context);
+ $right = $this->right->buildQuery($context);
return array(
'bool' => array(
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php
index 8e63f9e586..c18754e58e 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php
@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class PrefixNode extends Node
{
protected $prefix;
@@ -11,11 +13,11 @@ class PrefixNode extends Node
$this->prefix = $prefix;
}
- public function getQuery($fields = ['_all'])
+ public function buildQuery(QueryContext $context)
{
return array(
'multi_match' => array(
- 'fields' => $fields,
+ 'fields' => $context->getLocalizedFields(),
'query' => $this->prefix,
'type' => 'phrase_prefix'
)
@@ -26,9 +28,4 @@ class PrefixNode extends Node
{
return sprintf('prefix("%s")', $this->prefix);
}
-
- public function isFullTextOnly()
- {
- return true;
- }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php
index 35df144d6c..48a5f913d3 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php
@@ -2,22 +2,19 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+
class QuotedTextNode extends TextNode
{
- public function getQuery($fields = ['_all'])
+ public function buildQuery(QueryContext $context)
{
return array(
'multi_match' => array(
'type' => 'phrase',
- 'fields' => $fields,
+ 'fields' => $context->getLocalizedFields(),
'query' => $this->text,
// 'operator' => 'and'
)
);
}
-
- public function isFullTextOnly()
- {
- return true;
- }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php
index c96aad7d4f..da7315385b 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php
@@ -2,23 +2,49 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
-class TextNode extends Node
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
+
+class TextNode extends Node implements TermInterface
{
protected $text;
+ protected $concepts = array();
public function __construct($text)
{
$this->text = $text;
}
- public function getQuery($fields = ['_all'])
+ public function setConcepts(array $concepts)
{
- return array(
+ $this->concepts = $concepts;
+ }
+
+ public function buildQuery(QueryContext $context)
+ {
+ $query = array(
'multi_match' => array(
- 'fields' => $fields,
+ 'fields' => $context->getLocalizedFields(),
'query' => $this->text,
)
);
+
+ if ($this->concepts) {
+ $shoulds = array($query);
+ foreach (Concept::pruneNarrowConcepts($this->concepts) as $concept) {
+ $shoulds[]['term']['concept_paths'] = $concept->getPath();
+ }
+ $query = array();
+ $query['bool']['should'] = $shoulds;
+ }
+
+ return $query;
+ }
+
+ public function getTextNodes()
+ {
+ return array($this);
}
public function __toString()
@@ -26,13 +52,22 @@ class TextNode extends Node
return sprintf('"%s"', $this->text);
}
- public function isFullTextOnly()
- {
- return true;
- }
- public function getText()
+ // Implementation of TermInterface
+
+ public function getValue()
{
return $this->text;
}
+
+ public function hasContext()
+ {
+ return false;
+ }
+
+ public function getContext()
+ {
+ // TODO Insert context during parsing
+ return null;
+ }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php
index 4a43151f5a..cfd2930649 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php
@@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
-use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery;
+use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\SearchEngineInterface;
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
use Alchemy\Phrasea\SearchEngine\SearchEngineResult;
@@ -278,72 +280,27 @@ class ElasticSearchEngine implements SearchEngineInterface
$searchQuery = $this->app['query_parser']->parse($string);
- // Contains the full thesaurus paths to search on
- $pathsToFilter = [];
- // Contains the thesaurus values by fields (synonyms, translations, etc)
- $collectFields = [];
+ $query['_ast'] = $searchQuery->dump();
- // Only search in thesaurus for full text search
- if ($searchQuery->isFullTextOnly()) {
- $termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']);
- $termsQuery = $searchQuery->getElasticsearchQuery($termFields);
- $params = $this->createTermQueryParams($termsQuery, $options);
- $terms = $this->doExecute('search', $params);
- foreach ($terms['hits']['hits'] as $term) {
- // Skip paths with very low score
- if ($term['_score'] < 1) {
- continue;
- }
+ $thesaurus = $this->app['thesaurus'];
+ $textNodes = $searchQuery->getTextNodes();
+ $concepts = $thesaurus->findConceptsBulk($textNodes);
- $pathsToFilter[$term['_source']['path']] = $term['_score'];
-
- foreach ($term['_source']['fields'] as $field) {
- $collectFields['caption.'.$field][] = $term['_source']['value'];
- }
- }
- $pathsToFilter = array_unique($pathsToFilter);
+ foreach ($concepts as $index => $termConcepts) {
+ $node = $textNodes[$index];
+ $node->setConcepts($termConcepts);
+ $term = Term::dump($node);
+ $query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts);
}
- if (empty($collectFields)) {
- // @todo a list of field by default? all fields?
- $searchFieldNames = ['caption.*'];
- } else {
- $searchFieldNames = array_keys($collectFields);
- }
+ $recordHelper = $this->app['elasticsearch.record_helper'];
+ // TODO Pass options to getFields to include/exclude private fields
+ $searchableFields = $recordHelper->getFields();
+ $queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']);
+ $recordQuery = $searchQuery->build($queryContext);
- $recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']);
-
- $recordQuery = [
- 'bool' => [
- 'should' => [
- $searchQuery->getElasticsearchQuery($recordFields)
- ]
- ]
- ];
-
- foreach ($pathsToFilter as $path => $score) {
- // Also match incomplete path. /a/b/c will return /a/b/c/d records
- $recordQuery['bool']['should'][] = [
- 'match' => [
- 'concept_paths' => array(
- 'query' => $path,
- 'boost' => $score,
- )
- ]
- ];
-
- // Add signal for exact path only
- $recordQuery['bool']['should'][] = [
- 'term' => [
- 'concept_paths.raw' => array(
- 'value' => $path,
- 'boost' => $score,
- )
- ]
- ];
- }
$params = $this->createRecordQueryParams($recordQuery, $options, null);
$params['body']['from'] = $offset;
@@ -378,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface
$results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++);
}
+ $query['_searchable_fields'] = $searchableFields;
$query['_ast'] = $searchQuery->dump();
- $query['_paths'] = $pathsToFilter;
- $query['_richFields'] = $collectFields;
- $query['query'] = json_encode($params);
+ // $query['_paths'] = $pathsToFilter;
+ // $query['_richFields'] = $collectFields;
+
+ $queryyy = $recordQuery;
+ // $queryyy = $params['body'];
+ $query['query'] = $queryyy;
+ $query['query_as_string'] = json_encode($queryyy);
return new SearchEngineResult($results, json_encode($query), $res['took'], $offset,
$res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [],
@@ -572,36 +534,6 @@ class ElasticSearchEngine implements SearchEngineInterface
return $res;
}
- /**
- * @param array|string $fields
- * @param array|null $locales
- * @param null $currentLocale
- * @return array
- */
- public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null)
- {
- $fieldsExpended = [];
-
- if (!$locales) {
- $locales = $this->locales;
- }
-
- foreach ((array) $fields as $field) {
- foreach ($locales as $locale) {
- $boost = "";
-
- if ($locale === $currentLocale) {
- $boost = "^5";
- }
-
- $fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost);
- }
- $fieldsExpended[] = sprintf('%s.%s', $field, 'light^10');
- }
-
- return $fieldsExpended;
- }
-
private function getFlagsKey(\appbox $appbox)
{
$flags = [];
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php
index 36006f325f..a2d8d05d61 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php
@@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
-use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use media_subdef;
class RecordIndexer
{
const TYPE_NAME = 'record';
+ private $helper;
+
+ private $thesaurus;
+
/**
* @var \appbox
*/
@@ -42,10 +45,9 @@ class RecordIndexer
*/
private $locales;
- private $dataStructure;
-
- public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
+ public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
{
+ $this->helper = $helper;
$this->thesaurus = $thesaurus;
$this->appbox = $appbox;
$this->elasticSearchEngine = $elasticSearchEngine;
@@ -54,11 +56,8 @@ class RecordIndexer
public function populateIndex(BulkOperation $bulk)
{
- // Helper to fetch record related data
- $recordHelper = new RecordHelper($this->appbox);
-
foreach ($this->appbox->get_databoxes() as $databox) {
- $fetcher = new RecordFetcher($databox, $recordHelper);
+ $fetcher = new RecordFetcher($databox, $this->helper);
$fetcher->setBatchSize(200);
while ($records = $fetcher->fetch()) {
foreach ($records as $record) {
@@ -74,9 +73,7 @@ class RecordIndexer
public function indexSingleRecord(\record_adapter $record_adapter, $indexName)
{
- // Helper to fetch record related data
- $recordHelper = new RecordHelper($this->appbox);
- $fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper);
+ $fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper);
$record = $fetcher->fetchOne($record_adapter);
$params = array();
@@ -140,13 +137,13 @@ class RecordIndexer
->add('record_id', 'integer') // Compound primary key
->add('databox_id', 'integer') // Compound primary key
->add('base_id', 'integer') // Unique collection ID
- ->add('collection_id', 'integer') // Useless collection ID (local to databox)
- ->add('collection_name', 'string')->notAnalyzed() // Collection name
- ->add('uuid', 'string')->notAnalyzed()
- ->add('sha256', 'string')->notAnalyzed()
+ ->add('collection_id', 'integer')->notIndexed() // Useless collection ID (local to databox)
+ ->add('collection_name', 'string')->notIndexed() // Collection name
+ ->add('uuid', 'string')->notIndexed()
+ ->add('sha256', 'string')->notIndexed()
// Mandatory metadata
- ->add('original_name', 'string')->notAnalyzed()
- ->add('mime', 'string')->notAnalyzed()
+ ->add('original_name', 'string')->notIndexed()
+ ->add('mime', 'string')->notIndexed()
->add('type', 'string')->notAnalyzed()
->add('record_type', 'string')->notAnalyzed() // record or story
// Dates
@@ -157,29 +154,15 @@ class RecordIndexer
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
->addRawVersion()
+ // EXIF
+ ->add('exif', $this->getExifMapping())
+ // Status
+ ->add('flags', $this->getFlagsMapping())
+ // Keep some fields around for display purposes
+ ->add('subdefs', Mapping::disabledMapping())
+ ->add('title', Mapping::disabledMapping())
;
- // Index title
- $titleMapping = new Mapping();
- $titleMapping->add('default', 'string')->notAnalyzed()->notIndexed();
- foreach ($this->locales as $locale) {
- $titleMapping->add($locale, 'string')->notAnalyzed()->notIndexed();
- }
- $mapping->add('title', $titleMapping);
-
- // Minimal subdefs mapping info for display purpose
- $subdefMapping = new Mapping();
- $subdefMapping->add('path', 'string')->notAnalyzed()->notIndexed();
- $subdefMapping->add('height', 'integer')->notIndexed();
- $subdefMapping->add('width', 'integer')->notIndexed();
-
- $subdefsMapping = new Mapping();
- $subdefsMapping->add('thumbnail', $subdefMapping);
- $subdefsMapping->add('thumbnailgif', $subdefMapping);
- $subdefsMapping->add('preview', $subdefMapping);
-
- $mapping->add('subdefs', $subdefsMapping);
-
// Caption mapping
$captionMapping = new Mapping();
$mapping->add('caption', $captionMapping);
@@ -194,9 +177,9 @@ class RecordIndexer
}
if ($params['type'] === Mapping::TYPE_STRING) {
- if (!$params['indexable'] && !$params['to_aggregate']) {
+ if (!$params['searchable'] && !$params['to_aggregate']) {
$m->notIndexed();
- } elseif (!$params['indexable'] && $params['to_aggregate']) {
+ } elseif (!$params['searchable'] && $params['to_aggregate']) {
$m->notAnalyzed();
$m->addRawVersion();
} else {
@@ -206,89 +189,13 @@ class RecordIndexer
}
}
- // EXIF
- $mapping->add('exif', $this->getExifMapping());
-
- // Status
- $mapping->add('flags', $this->getFlagsMapping());
-
return $mapping->export();
}
private function getFieldsStructure()
{
- if (!empty($this->dataStructure)) {
- return $this->dataStructure;
- }
-
- $fields = array();
-
- foreach ($this->appbox->get_databoxes() as $databox) {
- //printf("Databox %d\n", $databox->get_sbas_id());
- foreach ($databox->get_meta_structure() as $fieldStructure) {
- $field = array();
- // Field type
- switch ($fieldStructure->get_type()) {
- case \databox_field::TYPE_DATE:
- $field['type'] = 'date';
- break;
- case \databox_field::TYPE_NUMBER:
- $field['type'] = 'double';
- break;
- case \databox_field::TYPE_STRING:
- case \databox_field::TYPE_TEXT:
- $field['type'] = 'string';
- break;
- default:
- throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
- break;
- }
-
- $name = $fieldStructure->get_name();
-
- // Business rules
- $field['private'] = $fieldStructure->isBusiness();
- $field['indexable'] = $fieldStructure->is_indexable();
- $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
-
- // Thesaurus concept inference
- // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
- $helper = new ThesaurusHelper();
-
- // TODO Not the real option yet
- $field['thesaurus_concept_inference'] = $field['type'] === 'string';
- // TODO Find thesaurus path prefixes
- $field['thesaurus_prefix'] = '/categories';
-
- //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);
-
- // Since mapping is merged between databoxes, two fields may
- // have conflicting names. Indexing is the same for a given
- // type so we reject only those with different types.
- if (isset($fields[$name])) {
- if ($fields[$name]['type'] !== $field['type']) {
- throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
- }
-
- if ($fields[$name]['indexable'] !== $field['indexable']) {
- throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name));
- }
-
- if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
- throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
- }
- // TODO other structure incompatibilities
-
- //printf("Merged with previous \"%s\" field\n", $name);
- }
-
- $fields[$name] = $field;
- }
- }
-
- $this->dataStructure = $fields;
- return $this->dataStructure;
+ return $this->helper->getFieldsStructure();
}
// @todo Add call to addAnalyzedVersion ?
@@ -374,7 +281,7 @@ class RecordIndexer
}
}
- $record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
+ // $record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
return $record;
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php
index 99127db7b9..54f4e43b08 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php
@@ -41,39 +41,28 @@ class TermIndexer
{
foreach ($this->appbox->get_databoxes() as $databox) {
/** @var databox $databox */
- $databoxId = $databox->get_sbas_id();
+ $databoxId = $databox->get_sbas_id();
- $document = self::thesaurusFromDatabox($databox);
- $dedicatedFieldTerms = $this->getDedicatedFieldTerms($databox, $document);
+ $visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
+ // Path and id are prefixed with a databox identifier to not
+ // collide with other databoxes terms
- $visitor = new TermVisitor(function ($term) use ($bulk, $databoxId, $dedicatedFieldTerms) {
- //printf("- %s (%s)\n", $term['path'], $term['value']);
// Term structure
- $id = $term['id'];
+ $id = sprintf('%s_%s', $databoxId, $term['id']);
unset($term['id']);
-
+ $term['path'] = sprintf('/%s%s', $databoxId, $term['path']);
$term['databox_id'] = $databoxId;
- $term['branch_id'] = $id;
-
- // @todo move to the TermVisitor? dunno.
- $term['fields'] = null;
- foreach ($dedicatedFieldTerms as $partialId => $fields) {
- if (strpos($id, $partialId) === 0) {
- foreach ($fields as $field) {
- $term['fields'][] = $field;
- }
- }
- }
// Index request
$params = array();
- $params['id'] = sprintf('%s_%s', $databoxId, $id);
+ $params['id'] = $id;
$params['type'] = self::TYPE_NAME;
$params['body'] = $term;
$bulk->index($params);
});
+ $document = self::thesaurusFromDatabox($databox);
$this->navigator->walk($document, $visitor);
}
}
@@ -88,38 +77,20 @@ class TermIndexer
return $dom;
}
- private function getDedicatedFieldTerms(databox $databox, DOMDocument $document)
- {
- $xpath = new \DOMXpath($document);
- $dedicatedFieldTerms = [];
-
- foreach ($databox->get_meta_structure() as $f) {
- if ($f->get_tbranch()) {
- $elements = $xpath->query($f->get_tbranch());
-
- if ($elements) {
- foreach ($elements as $element) {
- $dedicatedFieldTerms[$element->getAttribute('id')][] = $f->get_name();
- }
- }
- }
- }
-
- return $dedicatedFieldTerms;
- }
-
public function getMapping()
{
$mapping = new Mapping();
$mapping
->add('raw_value', 'string')->notAnalyzed()
- ->add('value', 'string')->addAnalyzedVersion($this->locales)
- ->add('context', 'string')->addAnalyzedVersion($this->locales)
+ ->add('value', 'string')
+ ->analyzer('general_light')
+ ->addLocalizedSubfields($this->locales)
+ ->add('context', 'string')
+ ->analyzer('general_light')
+ ->addLocalizedSubfields($this->locales)
->add('path', 'string')->notAnalyzed()
->add('lang', 'string')->notAnalyzed()
- ->add('branch_id', 'string')->notAnalyzed()
->add('databox_id', 'integer')
- ->add('fields', 'string')->notAnalyzed()
;
return $mapping->export();
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php
index 2232d65d19..3093c96e9a 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php
@@ -18,6 +18,7 @@ class Mapping
{
private $fields = array();
private $current;
+ private $enabled = true;
const DATE_FORMAT_MYSQL = 'yyyy-MM-dd HH:mm:ss';
const DATE_FORMAT_CAPTION = 'yyyy/MM/dd'; // ES format
@@ -54,7 +55,7 @@ class Mapping
$field = array();
if ($type instanceof self) {
$field['type'] = self::TYPE_OBJECT;
- $field['properties'] = $type;
+ $field['mapping'] = $type;
}
elseif (in_array($type, self::$types)) {
$field['type'] = $type;
@@ -74,20 +75,19 @@ class Mapping
public function export()
{
- return ['properties' => $this->exportProperties()];
- }
-
- public function exportProperties()
- {
- $properties = array();
+ $mapping = array();
foreach ($this->fields as $name => $field) {
- $properties[$name] = $field;
if ($field['type'] === self::TYPE_OBJECT) {
- $properties[$name]['properties'] = $field['properties']->exportProperties();
+ $field = $field['mapping']->export();
}
+ $mapping['properties'][$name] = $field;
}
- return $properties;
+ if (!$this->enabled) {
+ $mapping['enabled'] = false;
+ }
+
+ return $mapping;
}
public function analyzer($analyzer, $type = null)
@@ -134,6 +134,23 @@ class Mapping
return $this;
}
+ public static function disabledMapping()
+ {
+ return (new self())->disable();
+ }
+
+ /**
+ * Allows to disable parsing and indexing a named object completely.
+ * This is handy when a portion of the JSON document contains arbitrary JSON
+ * which should not be indexed, nor added to the mapping.
+ */
+ private function disable()
+ {
+ $this->enabled = false;
+
+ return $this;
+ }
+
public function addRawVersion()
{
$field = &$this->currentField();
@@ -146,22 +163,30 @@ class Mapping
return $this;
}
- public function addAnalyzedVersion(array $langs)
+ /**
+ * Adds a "light" sub-field (analyzer "general_light") to the current field,
+ * then adds the localized sub-fields through addLocalizedSubfields().
+ *
+ * @deprecated
+ *
+ * @param array $locales Locales forwarded to addLocalizedSubfields()
+ * @return $this
+ */
+ public function addAnalyzedVersion(array $locales)
{
$field = &$this->currentField();
-
- foreach ($langs as $lang) {
- $field['fields'][$lang] = [
- 'type' => $field['type'],
- 'analyzer' => sprintf('%s_full', $lang)
- ];
- }
-
$field['fields']['light'] = [
'type' => $field['type'],
'analyzer' => 'general_light'
];
+ return $this->addLocalizedSubfields($locales);
+ }
+
+ /**
+ * Adds one sub-field per locale to the current field.
+ *
+ * Each sub-field reuses the type of the current field and is analyzed
+ * with the "<locale>_full" analyzer.
+ *
+ * @param array $locales
+ * @return $this
+ */
+ public function addLocalizedSubfields(array $locales)
+ {
+     $field = &$this->currentField();
+
+     foreach ($locales as $locale) {
+         $field['fields'][$locale] = array(
+             'type'     => $field['type'],
+             'analyzer' => sprintf('%s_full', $locale),
+         );
+     }
+
return $this;
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php
index 2453a06f8e..da4710ca95 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php
@@ -11,18 +11,21 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use appbox;
use igorw;
class RecordHelper
{
- private $connection;
+ private $appbox;
+ // Computation caches
private $collectionMap;
+ private $fieldStructure;
+ /**
+ * @param appbox $appbox Kept as-is; its connection and databoxes are only
+ *                       fetched when a computation needs them
+ */
public function __construct(appbox $appbox)
{
- $this->connection = $appbox->get_connection();
+ $this->appbox = $appbox;
}
public function getUniqueRecordId($databoxId, $recordId)
@@ -46,12 +49,13 @@ class RecordHelper
private function collectionMap()
{
if (!$this->collectionMap) {
+ $connection = $this->appbox->get_connection();
$sql = 'SELECT
sbas_id as databox_id,
server_coll_id as collection_id,
base_id
FROM bas';
- $statement = $this->connection->query($sql);
+ $statement = $connection->query($sql);
$map = array();
while ($mapping = $statement->fetch()) {
@@ -68,4 +72,97 @@ class RecordHelper
{
return StringUtils::slugify($key, '_');
}
+
+ /**
+ * Lists the names of the fields found in the merged field structure.
+ *
+ * @param bool $includePrivate When false, fields flagged "private" are left out
+ * @param bool $onlySearchable When true, fields not flagged "searchable" are left out
+ * @return array Field names
+ */
+ public function getFields($includePrivate = false, $onlySearchable = true)
+ {
+     $names = array();
+     foreach ($this->getFieldsStructure() as $fieldName => $structure) {
+         $isAllowed = $includePrivate || !$structure['private'];
+         $isWanted = !$onlySearchable || $structure['searchable'];
+         if ($isAllowed && $isWanted) {
+             $names[] = $fieldName;
+         }
+     }
+
+     return $names;
+ }
+
+ /**
+ * Builds the field structure merged across every databox, and caches it.
+ *
+ * The result is keyed by field name. Each entry holds the keys "type"
+ * ("date", "double" or "string"), "private", "searchable", "to_aggregate",
+ * "thesaurus_concept_inference" and "thesaurus_prefix".
+ *
+ * @return array
+ * @throws Exception      When a field has an unsupported type
+ * @throws MergeException When two fields sharing a name differ in type,
+ *                        searchable state or to_aggregate state
+ */
+ public function getFieldsStructure()
+ {
+     // Cache in the declared $fieldStructure property. The null check
+     // (instead of empty()) also caches an empty structure.
+     if (null !== $this->fieldStructure) {
+         return $this->fieldStructure;
+     }
+
+     // NOTE(review): neither Exception nor MergeException appears in the
+     // use statements visible for this file, so both names resolve inside
+     // the current namespace. Confirm these classes exist there, or import them.
+     $fields = array();
+
+     foreach ($this->appbox->get_databoxes() as $databox) {
+         foreach ($databox->get_meta_structure() as $fieldStructure) {
+             $field = array();
+             // Field type
+             switch ($fieldStructure->get_type()) {
+                 case \databox_field::TYPE_DATE:
+                     $field['type'] = 'date';
+                     break;
+                 case \databox_field::TYPE_NUMBER:
+                     $field['type'] = 'double';
+                     break;
+                 case \databox_field::TYPE_STRING:
+                 case \databox_field::TYPE_TEXT:
+                     $field['type'] = 'string';
+                     break;
+                 default:
+                     throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
+             }
+
+             $name = $fieldStructure->get_name();
+
+             // Business rules
+             $field['private'] = $fieldStructure->isBusiness();
+             $field['searchable'] = $fieldStructure->is_indexable();
+             $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
+
+             // Thesaurus concept inference
+             // TODO Not the real option yet
+             $field['thesaurus_concept_inference'] = $field['type'] === 'string';
+             // TODO Find thesaurus path prefixes
+             $field['thesaurus_prefix'] = '/categories';
+
+             // Since mapping is merged between databoxes, two fields may
+             // have conflicting names. Indexing is the same for a given
+             // type so we reject only those with different types.
+             if (isset($fields[$name])) {
+                 if ($fields[$name]['type'] !== $field['type']) {
+                     throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
+                 }
+
+                 if ($fields[$name]['searchable'] !== $field['searchable']) {
+                     throw new MergeException(sprintf("Field %s can't be merged, incompatible searchable state", $name));
+                 }
+
+                 if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
+                     throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
+                 }
+                 // TODO other structure incompatibilities
+             }
+
+             $fields[$name] = $field;
+         }
+     }
+
+     $this->fieldStructure = $fields;
+
+     return $this->fieldStructure;
+ }
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php
index db9715c2f8..4e05e46ed9 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php
@@ -18,19 +18,14 @@ class Query
$this->root = $root;
}
- /*
- * This method seems weird to me, the implementation returns true when the
- * query doesn't contain IN statements, but that doesn't define a full text
- * search.
- */
- public function isFullTextOnly()
+ /**
+ * Returns the text nodes of this query, as reported by the root node.
+ */
+ public function getTextNodes()
{
- return $this->root->isFullTextOnly();
+ return $this->root->getTextNodes();
}
- public function getElasticsearchQuery($fields = array())
+ /**
+ * Builds the query by delegating to the root node.
+ *
+ * @param QueryContext $context Context handed to the root node's buildQuery()
+ * @return mixed Whatever the root node's buildQuery() returns
+ */
+ public function build(QueryContext $context)
{
- return $this->root->getQuery($fields);
+ return $this->root->buildQuery($context);
}
public function dump()
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php
new file mode 100644
index 0000000000..8fc5176da9
--- /dev/null
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php
@@ -0,0 +1,40 @@
+fields = $fields;
+ $this->locales = $locales;
+ $this->queryLocale = $queryLocale;
+ }
+
+ /**
+ * Creates a new context restricted to the given fields.
+ *
+ * Only fields already present in this context are kept, so narrowing
+ * can never widen the original field restriction. Locales and query
+ * locale are carried over unchanged.
+ *
+ * @param array $fields
+ * @return static
+ */
+ public function narrowToFields(array $fields)
+ {
+     return new static(
+         array_intersect($this->fields, $fields),
+         $this->locales,
+         $this->queryLocale
+     );
+ }
+
+ /**
+ * Expands the context fields into boosted "caption.<field>.<sub-field>" names.
+ *
+ * For every field, one entry is produced per locale (boosted with "^5"
+ * when the locale is the query locale), followed by the "light"
+ * sub-field boosted with "^10".
+ *
+ * @return array
+ */
+ public function getLocalizedFields()
+ {
+     $localized = array();
+     foreach ($this->fields as $name) {
+         foreach ($this->locales as $locale) {
+             if ($locale === $this->queryLocale) {
+                 $localized[] = sprintf('caption.%s.%s^5', $name, $locale);
+             } else {
+                 $localized[] = sprintf('caption.%s.%s', $name, $locale);
+             }
+         }
+         // TODO Put generic analyzers on main field instead of "light" sub-field
+         $localized[] = sprintf('caption.%s.light^10', $name);
+     }
+
+     return $localized;
+ }
+}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php
index 75318df71d..efb62efb79 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php
@@ -119,7 +119,7 @@ class QueryVisitor implements Visit
if ($root instanceof AST\TextNode &&
!$root instanceof AST\QuotedTextNode &&
!$node instanceof AST\QuotedTextNode) {
- $root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText()));
+ $root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue()));
} else {
$root = new AST\AndExpression($root, $node);
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php
index 7201c50f9a..8b1aef3dbc 100644
--- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php
@@ -12,6 +12,9 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
+use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
use Elasticsearch\Client;
class Thesaurus
@@ -19,34 +22,78 @@ class Thesaurus
private $client;
private $index;
+ const MIN_SCORE = 6;
+
+ /**
+ * @param Client $client Elasticsearch client used to run the searches
+ * @param mixed  $index  Value sent as the "index" parameter of the searches
+ */
public function __construct(Client $client, $index)
{
$this->client = $client;
$this->index = $index;
}
- public function findConcepts($term, $context = null, $lang = null)
+ /**
+ * Looks up the concepts of several terms, one findConcepts() call per term.
+ *
+ * @param array $terms Terms handed one by one to findConcepts()
+ * @param mixed $lang  Forwarded unchanged to findConcepts()
+ * @return array One findConcepts() result per term, indexed from 0 in input order
+ */
+ public function findConceptsBulk(array $terms, $lang = null)
{
+     // TODO Use bulk queries for performance
+     $results = array();
+     foreach (array_values($terms) as $position => $term) {
+         $results[$position] = $this->findConcepts($term, $lang);
+     }
+
+     return $results;
+ }
+
+ public function findConcepts($term, $lang = null)
+ {
+ if (!($term instanceof TermInterface)) {
+ $term = new Term($term);
+ }
+
// TODO Check that term queries are ok with multiple words
$query = array();
- $query['term']['value'] = $term;
+ $field = $lang ? sprintf('value.%s', $lang) : 'value.light';
+ $query['match'][$field]['query'] = $term->getValue();
+ $query['match'][$field]['operator'] = 'and';
+ // Allow 25% of non-matching tokens
+ // (not exactly the same as 75% of matching tokens)
+ // $query['match'][$field]['minimum_should_match'] = '-25%';
- if ($context) {
+ if ($term->hasContext()) {
$term_query = $query;
$query = array();
$query['bool']['must'][0] = $term_query;
- $query['bool']['must'][1]['term']['context'] = $context;
+ $query['bool']['must'][1]['term']['context'] = $term->getContext();
}
+ if ($lang) {
+ $term_query = $query;
+ $query = array();
+ $query['filtered']['query'] = $term_query;
+ $query['filtered']['filter']['term']['lang'] = $lang;
+ }
+
+ // TODO Only search in a specific databox
+ // $term_query = $query;
+ // $query = array();
+ // $query['filtered']['query'] = $term_query;
+ // $query['filtered']['filter']['term']['databox_id'] = $databox_id;
+
// Path deduplication
$aggs = array();
$aggs['dedup']['terms']['field'] = 'path';
// Search request
$params = array();
+ $params['index'] = $this->index;
$params['type'] = TermIndexer::TYPE_NAME;
$params['body']['query'] = $query;
$params['body']['aggs'] = $aggs;
+ // Arbitrary low score limit; we need to find a more granular way to remove
+ // inexact concepts.
+ // We also need to disable TF/IDF on terms, and try to boost score only
+ // when the search match nearly all tokens of term's value field.
+ $params['body']['min_score'] = self::MIN_SCORE;
+ // No need to get any hits since we extract data from aggs
+ $params['body']['size'] = 0;
+
$response = $this->client->search($params);
// Extract concept paths from response
@@ -54,7 +101,7 @@ class Thesaurus
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []);
foreach ($buckets as $bucket) {
if (isset($bucket['key'])) {
- $concepts[] = $bucket['key'];
+ $concepts[] = new Concept($bucket['key']);
}
}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php
new file mode 100644
index 0000000000..0512abff0d
--- /dev/null
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php
@@ -0,0 +1,69 @@
+path = (string) $path;
+ }
+
+ /**
+ * @return string The concept path
+ */
+ public function getPath()
+ {
+ return $this->path;
+ }
+
+ /**
+ * Tells whether this concept is located below another one.
+ *
+ * @param Concept $other
+ * @return bool True when this path starts with the other path followed by "/"
+ */
+ public function isNarrowerThan(Concept $other)
+ {
+     // The trailing slash keeps "/ab" from being seen as a child of "/a"
+     $parentPrefix = $other->getPath() . '/';
+
+     return strncmp($this->getPath(), $parentPrefix, strlen($parentPrefix)) === 0;
+ }
+
+ /**
+ * @return string The concept path
+ */
+ public function __toString()
+ {
+ return $this->path;
+ }
+
+ /**
+ * Converts a list of concepts into the list of their paths.
+ *
+ * @param array $concepts Objects exposing getPath()
+ * @return array Paths, under the same keys as the given concepts
+ */
+ public static function toPathArray(array $concepts)
+ {
+     // With a single input array, array_map() keeps the original keys
+     return array_map(function ($concept) {
+         return $concept->getPath();
+     }, $concepts);
+ }
+
+ /**
+ * Removes every concept that is narrower than another concept of the list.
+ *
+ * @param Concept[] $concepts
+ * @return Concept[] Remaining broad concepts, keyed by path and sorted by key
+ */
+ public static function pruneNarrowConcepts($concepts)
+ {
+     // Nothing to prune; also avoids handing empty arrays to array_combine()
+     if (!$concepts) {
+         return array();
+     }
+
+     // Build a map with paths as keys
+     $byPath = array_combine(self::toPathArray($concepts), $concepts);
+
+     // Sort with "/" ranked below every other byte, so that all the
+     // descendants of a path directly follow it. A plain key sort does not
+     // guarantee this: "/a b" sorts between "/a" and "/a/b" (" " < "/"),
+     // which would leave "/a/b" unpruned.
+     uksort($byPath, function ($left, $right) {
+         return strcmp(
+             str_replace('/', "\0", $left),
+             str_replace('/', "\0", $right)
+         );
+     });
+
+     // Single pass: a concept is either below the current broad concept
+     // (dropped) or the beginning of a new subtree (kept)
+     $kept = array();
+     $broad = null;
+     foreach ($byPath as $path => $concept) {
+         if (null !== $broad && $concept->isNarrowerThan($broad)) {
+             continue;
+         }
+         $broad = $concept;
+         $kept[$path] = $concept;
+     }
+
+     // Return the map in plain key order
+     ksort($kept);
+
+     return $kept;
+ }
+}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php
new file mode 100644
index 0000000000..395b5bd765
--- /dev/null
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php
@@ -0,0 +1,57 @@
+value = (string) $value;
+ if ($context) {
+ $this->context = (string) $context;
+ }
+ }
+
+ /**
+ * @return string The term value
+ */
+ public function getValue()
+ {
+ return $this->value;
+ }
+
+ /**
+ * Tells whether a context is set on this term.
+ *
+ * The constructor only stores a context when it is given a truthy value.
+ *
+ * @return bool
+ */
+ public function hasContext()
+ {
+ return $this->context !== null;
+ }
+
+ /**
+ * @return string|null The term context, null when none was stored
+ */
+ public function getContext()
+ {
+ return $this->context;
+ }
+
+ /**
+ * @return string The representation produced by dump() for this term
+ */
+ public function __toString()
+ {
+ return self::dump($this);
+ }
+
+ /**
+ * Renders a term as "value", or "value (context)" when it has a context.
+ *
+ * @param TermInterface $term
+ * @return mixed The formatted string, or the raw term value when there is no context
+ */
+ public static function dump(TermInterface $term)
+ {
+     if (!$term->hasContext()) {
+         return $term->getValue();
+     }
+
+     return sprintf('%s (%s)', $term->getValue(), $term->getContext());
+ }
+}
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php
new file mode 100644
index 0000000000..72a9f05492
--- /dev/null
+++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php
@@ -0,0 +1,19 @@
+