From d0b3995a33bae2c15ef530f425705df81cc72527 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Wed, 17 Dec 2014 23:01:09 +0100 Subject: [PATCH 01/14] Text nodes extraction from queries --- .../Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php | 8 ++++++++ .../Phrasea/SearchEngine/Elastic/AST/FieldNode.php | 5 +++++ .../Phrasea/SearchEngine/Elastic/AST/InExpression.php | 5 +++++ lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php | 2 ++ lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php | 5 +++++ lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php | 5 +++++ 6 files changed, 30 insertions(+) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php index 54c41a4514..714f40cb7c 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php @@ -19,6 +19,14 @@ abstract class BinaryOperator extends Node return sprintf('(%s %s %s)', $this->left, $this->operator, $this->right); } + public function getTextNodes() + { + return array_merge( + $this->left->getTextNodes(), + $this->right->getTextNodes() + ); + } + public function isFullTextOnly() { return $this->left->isFullTextOnly() diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php index f15de1b18e..3a4e64b75a 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php @@ -21,6 +21,11 @@ class FieldNode extends Node throw new \LogicException("A keyword can't be converted to a query."); } + public function getTextNodes() + { + throw new \LogicException("A keyword can't contain text nodes."); + } + public function __toString() { return sprintf('<%s>', $this->keyword); diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php 
b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php index fb8f8fad54..0552d4dbad 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php @@ -18,6 +18,11 @@ class InExpression extends Node return $this->expression->getQuery($this->field->getValue()); } + public function getTextNodes() + { + return $this->expression->getTextNodes(); + } + public function __toString() { return sprintf('(%s IN %s)', $this->expression, $this->field); diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php index b601f4188d..7a13e47478 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php @@ -13,4 +13,6 @@ abstract class Node * @return bool Tell if the node and it's child are full-text queries only */ abstract public function isFullTextOnly(); + + abstract public function getTextNodes(); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php index c96aad7d4f..cbb0ab4b4d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php @@ -21,6 +21,11 @@ class TextNode extends Node ); } + public function getTextNodes() + { + return array($this); + } + public function __toString() { return sprintf('"%s"', $this->text); diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php index db9715c2f8..3af8d73fdb 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php @@ -18,6 +18,11 @@ class Query $this->root = $root; } + public function getTextNodes() + { + return $this->root->getTextNodes(); + } + /* * This method seems weird to me, the implementation returns true when 
the * query doesn't contain IN statements, but that doesn't define a full text From 2c0119b010fd634133e9a0527fb60db7a638ed82 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Wed, 17 Dec 2014 23:05:17 +0100 Subject: [PATCH 02/14] Change field representation in dumps --- lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php index 3a4e64b75a..6a634c7ebb 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php @@ -28,7 +28,7 @@ class FieldNode extends Node public function __toString() { - return sprintf('<%s>', $this->keyword); + return sprintf('', $this->keyword); } public function isFullTextOnly() From f283bf01d1233ed2f2fe6ef3c6ce20934bad2cef Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Wed, 17 Dec 2014 23:15:08 +0100 Subject: [PATCH 03/14] WIP Improve thesaurus relevency by filtering low score --- .../Elastic/ElasticSearchEngine.php | 12 +++++++ .../Elastic/Indexer/TermIndexer.php | 4 ++- .../SearchEngine/Elastic/Thesaurus.php | 34 ++++++++++++++++++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php index 4a43151f5a..46b51e849f 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php @@ -278,6 +278,18 @@ class ElasticSearchEngine implements SearchEngineInterface $searchQuery = $this->app['query_parser']->parse($string); + $query['_ast'] = $searchQuery->dump(); + + $thesaurus = $this->app['thesaurus']; + foreach ($searchQuery->getTextNodes() as $textNode) { + $text = $textNode->getText(); + $concepts = $thesaurus->findConcepts($text); + 
$query['_thesaurus_concepts'][$text] = $concepts; + } + + // $concepts = $thesaurus->findConceptsBulk($terms); + + // Contains the full thesaurus paths to search on $pathsToFilter = []; // Contains the thesaurus values by fields (synonyms, translations, etc) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php index 99127db7b9..a8c83003ff 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php @@ -113,7 +113,9 @@ class TermIndexer $mapping = new Mapping(); $mapping ->add('raw_value', 'string')->notAnalyzed() - ->add('value', 'string')->addAnalyzedVersion($this->locales) + ->add('value', 'string') + ->analyzer('general_light') + ->addAnalyzedVersion($this->locales) ->add('context', 'string')->addAnalyzedVersion($this->locales) ->add('path', 'string')->notAnalyzed() ->add('lang', 'string')->notAnalyzed() diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php index 7201c50f9a..6fc1244ab8 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php @@ -19,17 +19,29 @@ class Thesaurus private $client; private $index; + const MIN_SCORE = 6; + public function __construct(Client $client, $index) { $this->client = $client; $this->index = $index; } + public function findConceptsBulk(array $terms, $lang = null) + { + // TODO Use bulk queries for performance + } + public function findConcepts($term, $context = null, $lang = null) { // TODO Check that term queries are ok with multiple words $query = array(); - $query['term']['value'] = $term; + $field = $lang ? 
sprintf('value.%s', $lang) : 'value.light'; + $query['match'][$field]['query'] = $term; + $query['match'][$field]['operator'] = 'and'; + // Allow 25% of non-matching tokens + // (not exactly the same that 75% of matching tokens) + // $query['match'][$field]['minimum_should_match'] = '-25%'; if ($context) { $term_query = $query; @@ -38,15 +50,35 @@ class Thesaurus $query['bool']['must'][1]['term']['context'] = $context; } + if ($lang) { + $term_query = $query; + $query = array(); + $query['filtered']['query'] = $term_query; + $query['filtered']['filter']['term']['lang'] = $lang; + } + + // TODO Only search in a specific databox + // $term_query = $query; + // $query = array(); + // $query['filtered']['query'] = $term_query; + // $query['filtered']['filter']['term']['databox_id'] = $databox_id; + // Path deduplication $aggs = array(); $aggs['dedup']['terms']['field'] = 'path'; // Search request $params = array(); + $params['index'] = $this->index; $params['type'] = TermIndexer::TYPE_NAME; $params['body']['query'] = $query; $params['body']['aggs'] = $aggs; + // Arbitrary score low limit, we need find a more granular way to remove + // inexact concepts. + // We also need to disable TF/IDF on terms, and try to boost score only + // when the search match nearly all tokens of term's value field. 
+ $params['body']['min_score'] = self::MIN_SCORE; + $response = $this->client->search($params); // Extract concept paths from response From dc2c9f8c7f53e4d49bbdde87b9dc2bf6440687a5 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Thu, 15 Jan 2015 20:04:46 +0100 Subject: [PATCH 04/14] Refactor thesaurus query build - Look for text nodes and infer the concepts behind term using thesaurus - Use value objects for thesaurus terms and concepts - Pass a QueryContext holding allowed fields and locales informations when building the Elasticsearch query - Change type hinting and name of query building method on nodes - Remove unused method Node#isFullTextOnly() - Move getFieldsStructure from RecordIndexer to RecordHelper for reusing field structure in SearchEngine --- .../Provider/SearchEngineServiceProvider.php | 6 + .../Elastic/AST/AndExpression.php | 8 +- .../Elastic/AST/BinaryOperator.php | 6 - .../Elastic/AST/ExceptExpression.php | 8 +- .../SearchEngine/Elastic/AST/FieldNode.php | 9 +- .../SearchEngine/Elastic/AST/InExpression.php | 14 +- .../Phrasea/SearchEngine/Elastic/AST/Node.php | 9 +- .../SearchEngine/Elastic/AST/OrExpression.php | 8 +- .../SearchEngine/Elastic/AST/PrefixNode.php | 11 +- .../Elastic/AST/QuotedTextNode.php | 11 +- .../SearchEngine/Elastic/AST/TextNode.php | 47 +++++-- .../Elastic/ElasticSearchEngine.php | 132 ++++-------------- .../Elastic/Indexer/RecordIndexer.php | 91 ++---------- .../SearchEngine/Elastic/RecordHelper.php | 103 +++++++++++++- .../SearchEngine/Elastic/Search/Query.php | 14 +- .../Elastic/Search/QueryContext.php | 40 ++++++ .../Elastic/Search/QueryVisitor.php | 2 +- .../SearchEngine/Elastic/Thesaurus.php | 23 ++- .../Elastic/Thesaurus/Concept.php | 40 ++++++ .../SearchEngine/Elastic/Thesaurus/Term.php | 57 ++++++++ .../Elastic/Thesaurus/TermInterface.php | 19 +++ 21 files changed, 391 insertions(+), 267 deletions(-) create mode 100644 lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php create mode 100644 
lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php diff --git a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php index a3b428cc36..b0b34fcaf7 100644 --- a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php +++ b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php @@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; +use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine; @@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface $app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) { return new RecordIndexer( + $app['elasticsearch.record_helper'], $app['thesaurus'], $app['elasticsearch.engine'], $app['phraseanet.appbox'], @@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface ); }); + $app['elasticsearch.record_helper'] = $app->share(function ($app) { + return new RecordHelper($app['phraseanet.appbox']); + }); + $app['elasticsearch.client'] = $app->share(function($app) { $options = $app['elasticsearch.options']; $clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]]; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php index 99e5579c64..3d890e1fa7 100644 --- 
a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class AndExpression extends BinaryOperator { protected $operator = 'AND'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = $this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php index 714f40cb7c..6876e16f8f 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php @@ -26,10 +26,4 @@ abstract class BinaryOperator extends Node $this->right->getTextNodes() ); } - - public function isFullTextOnly() - { - return $this->left->isFullTextOnly() - && $this->right->isFullTextOnly(); - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php index a1c812b444..2959ee4eec 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class ExceptExpression extends BinaryOperator { protected $operator = 'EXCEPT'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = 
$this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php index 6a634c7ebb..5bd7aec253 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php @@ -2,6 +2,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class FieldNode extends Node { protected $keyword; @@ -16,7 +18,7 @@ class FieldNode extends Node return $this->keyword; } - public function getQuery() + public function buildQuery(QueryContext $context) { throw new \LogicException("A keyword can't be converted to a query."); } @@ -30,9 +32,4 @@ class FieldNode extends Node { return sprintf('', $this->keyword); } - - public function isFullTextOnly() - { - return false; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php index 0552d4dbad..00f95afdc3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php @@ -2,6 +2,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class InExpression extends Node { protected $field; @@ -13,9 +15,11 @@ class InExpression extends Node $this->expression = $expression; } - public function getQuery() + public function buildQuery(QueryContext $context) { - return $this->expression->getQuery($this->field->getValue()); + $fields = array($this->field->getValue()); + + return $this->expression->buildQuery($context->narrowToFields($fields)); } public function getTextNodes() @@ -27,10 +31,4 @@ class InExpression extends Node { return sprintf('(%s IN %s)', $this->expression, $this->field); } - - public function isFullTextOnly() - { - // In expressions are 
never full-text - return false; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php index 7a13e47478..36467b9fb3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php @@ -2,17 +2,14 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + abstract class Node { /** * @return array The Elasticsearch formatted query */ - abstract public function getQuery(); - - /** - * @return bool Tell if the node and it's child are full-text queries only - */ - abstract public function isFullTextOnly(); + abstract public function buildQuery(QueryContext $context); abstract public function getTextNodes(); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php index 6a345688e5..61c58e2109 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class OrExpression extends BinaryOperator { protected $operator = 'OR'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = $this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php index 8e63f9e586..c18754e58e 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php @@ -2,6 +2,8 @@ namespace 
Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class PrefixNode extends Node { protected $prefix; @@ -11,11 +13,11 @@ class PrefixNode extends Node $this->prefix = $prefix; } - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { return array( 'multi_match' => array( - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->prefix, 'type' => 'phrase_prefix' ) @@ -26,9 +28,4 @@ class PrefixNode extends Node { return sprintf('prefix("%s")', $this->prefix); } - - public function isFullTextOnly() - { - return true; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php index 35df144d6c..48a5f913d3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php @@ -2,22 +2,19 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class QuotedTextNode extends TextNode { - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { return array( 'multi_match' => array( 'type' => 'phrase', - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->text, // 'operator' => 'and' ) ); } - - public function isFullTextOnly() - { - return true; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php index cbb0ab4b4d..e1b5a6e498 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php @@ -2,23 +2,43 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; -class TextNode extends Node +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; +use 
Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; + +class TextNode extends Node implements TermInterface { protected $text; + protected $concepts = array(); public function __construct($text) { $this->text = $text; } - public function getQuery($fields = ['_all']) + public function setConcepts(array $concepts) { - return array( + $this->concepts = $concepts; + } + + public function buildQuery(QueryContext $context) + { + $query = array( 'multi_match' => array( - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->text, ) ); + + if ($this->concepts) { + $shoulds = array($query); + foreach ($this->concepts as $concept) { + $shoulds[]['term']['concept_paths'] = $concept->getPath(); + } + $query = array(); + $query['bool']['should'] = $shoulds; + } + + return $query; } public function getTextNodes() @@ -31,13 +51,22 @@ class TextNode extends Node return sprintf('"%s"', $this->text); } - public function isFullTextOnly() - { - return true; - } - public function getText() + // Implementation of TermInterface + + public function getValue() { return $this->text; } + + public function hasContext() + { + return false; + } + + public function getContext() + { + // TODO Insert context during parsing + return null; + } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php index 46b51e849f..cfd2930649 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php @@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; -use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; +use 
Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Alchemy\Phrasea\SearchEngine\SearchEngineInterface; use Alchemy\Phrasea\SearchEngine\SearchEngineOptions; use Alchemy\Phrasea\SearchEngine\SearchEngineResult; @@ -280,83 +282,26 @@ class ElasticSearchEngine implements SearchEngineInterface $query['_ast'] = $searchQuery->dump(); + + $thesaurus = $this->app['thesaurus']; - foreach ($searchQuery->getTextNodes() as $textNode) { - $text = $textNode->getText(); - $concepts = $thesaurus->findConcepts($text); - $query['_thesaurus_concepts'][$text] = $concepts; + $textNodes = $searchQuery->getTextNodes(); + $concepts = $thesaurus->findConceptsBulk($textNodes); + + foreach ($concepts as $index => $termConcepts) { + $node = $textNodes[$index]; + $node->setConcepts($termConcepts); + $term = Term::dump($node); + $query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts); } - // $concepts = $thesaurus->findConceptsBulk($terms); + $recordHelper = $this->app['elasticsearch.record_helper']; + // TODO Pass options to getFields to include/exclude private fields + $searchableFields = $recordHelper->getFields(); + $queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']); + $recordQuery = $searchQuery->build($queryContext); - // Contains the full thesaurus paths to search on - $pathsToFilter = []; - // Contains the thesaurus values by fields (synonyms, translations, etc) - $collectFields = []; - - // Only search in thesaurus for full text search - if ($searchQuery->isFullTextOnly()) { - $termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']); - $termsQuery = $searchQuery->getElasticsearchQuery($termFields); - - $params = $this->createTermQueryParams($termsQuery, $options); - $terms = $this->doExecute('search', $params); - - foreach ($terms['hits']['hits'] as $term) { - // Skip paths with very low score - if ($term['_score'] < 1) { - continue; - } - - $pathsToFilter[$term['_source']['path']] = 
$term['_score']; - - foreach ($term['_source']['fields'] as $field) { - $collectFields['caption.'.$field][] = $term['_source']['value']; - } - } - $pathsToFilter = array_unique($pathsToFilter); - } - - if (empty($collectFields)) { - // @todo a list of field by default? all fields? - $searchFieldNames = ['caption.*']; - } else { - $searchFieldNames = array_keys($collectFields); - } - - $recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']); - - $recordQuery = [ - 'bool' => [ - 'should' => [ - $searchQuery->getElasticsearchQuery($recordFields) - ] - ] - ]; - - foreach ($pathsToFilter as $path => $score) { - // Also match incomplete path. /a/b/c will return /a/b/c/d records - $recordQuery['bool']['should'][] = [ - 'match' => [ - 'concept_paths' => array( - 'query' => $path, - 'boost' => $score, - ) - ] - ]; - - // Add signal for exact path only - $recordQuery['bool']['should'][] = [ - 'term' => [ - 'concept_paths.raw' => array( - 'value' => $path, - 'boost' => $score, - ) - ] - ]; - } - $params = $this->createRecordQueryParams($recordQuery, $options, null); $params['body']['from'] = $offset; $params['body']['size'] = $perPage; @@ -390,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface $results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++); } + $query['_searchable_fields'] = $searchableFields; $query['_ast'] = $searchQuery->dump(); - $query['_paths'] = $pathsToFilter; - $query['_richFields'] = $collectFields; - $query['query'] = json_encode($params); + // $query['_paths'] = $pathsToFilter; + // $query['_richFields'] = $collectFields; + + $queryyy = $recordQuery; + // $queryyy = $params['body']; + $query['query'] = $queryyy; + $query['query_as_string'] = json_encode($queryyy); return new SearchEngineResult($results, json_encode($query), $res['took'], $offset, $res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [], @@ -584,36 +534,6 @@ class ElasticSearchEngine 
implements SearchEngineInterface return $res; } - /** - * @param array|string $fields - * @param array|null $locales - * @param null $currentLocale - * @return array - */ - public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null) - { - $fieldsExpended = []; - - if (!$locales) { - $locales = $this->locales; - } - - foreach ((array) $fields as $field) { - foreach ($locales as $locale) { - $boost = ""; - - if ($locale === $currentLocale) { - $boost = "^5"; - } - - $fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost); - } - $fieldsExpended[] = sprintf('%s.%s', $field, 'light^10'); - } - - return $fieldsExpended; - } - private function getFlagsKey(\appbox $appbox) { $flags = []; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index 36006f325f..246548f18d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher; use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; -use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper; use media_subdef; class RecordIndexer { const TYPE_NAME = 'record'; + private $helper; + + private $thesaurus; + /** * @var \appbox */ @@ -42,10 +45,9 @@ class RecordIndexer */ private $locales; - private $dataStructure; - - public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales) + public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales) { + $this->helper = $helper; $this->thesaurus = $thesaurus; $this->appbox = $appbox; $this->elasticSearchEngine = 
$elasticSearchEngine; @@ -54,11 +56,8 @@ class RecordIndexer public function populateIndex(BulkOperation $bulk) { - // Helper to fetch record related data - $recordHelper = new RecordHelper($this->appbox); - foreach ($this->appbox->get_databoxes() as $databox) { - $fetcher = new RecordFetcher($databox, $recordHelper); + $fetcher = new RecordFetcher($databox, $this->helper); $fetcher->setBatchSize(200); while ($records = $fetcher->fetch()) { foreach ($records as $record) { @@ -74,9 +73,7 @@ class RecordIndexer public function indexSingleRecord(\record_adapter $record_adapter, $indexName) { - // Helper to fetch record related data - $recordHelper = new RecordHelper($this->appbox); - $fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper); + $fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper); $record = $fetcher->fetchOne($record_adapter); $params = array(); @@ -218,77 +215,7 @@ class RecordIndexer private function getFieldsStructure() { - if (!empty($this->dataStructure)) { - return $this->dataStructure; - } - - $fields = array(); - - foreach ($this->appbox->get_databoxes() as $databox) { - //printf("Databox %d\n", $databox->get_sbas_id()); - foreach ($databox->get_meta_structure() as $fieldStructure) { - $field = array(); - // Field type - switch ($fieldStructure->get_type()) { - case \databox_field::TYPE_DATE: - $field['type'] = 'date'; - break; - case \databox_field::TYPE_NUMBER: - $field['type'] = 'double'; - break; - case \databox_field::TYPE_STRING: - case \databox_field::TYPE_TEXT: - $field['type'] = 'string'; - break; - default: - throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type())); - break; - } - - $name = $fieldStructure->get_name(); - - // Business rules - $field['private'] = $fieldStructure->isBusiness(); - $field['indexable'] = $fieldStructure->is_indexable(); - $field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); - - // 
Thesaurus concept inference - // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; - $helper = new ThesaurusHelper(); - - // TODO Not the real option yet - $field['thesaurus_concept_inference'] = $field['type'] === 'string'; - // TODO Find thesaurus path prefixes - $field['thesaurus_prefix'] = '/categories'; - - //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']); - - // Since mapping is merged between databoxes, two fields may - // have conflicting names. Indexing is the same for a given - // type so we reject only those with different types. - if (isset($fields[$name])) { - if ($fields[$name]['type'] !== $field['type']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type'])); - } - - if ($fields[$name]['indexable'] !== $field['indexable']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name)); - } - - if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name)); - } - // TODO other structure incompatibilities - - //printf("Merged with previous \"%s\" field\n", $name); - } - - $fields[$name] = $field; - } - } - - $this->dataStructure = $fields; - return $this->dataStructure; + return $this->helper->getFieldsStructure(); } // @todo Add call to addAnalyzedVersion ? 
diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php index 2453a06f8e..7bfb7c043b 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php @@ -11,18 +11,21 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper; use appbox; use igorw; class RecordHelper { - private $connection; + private $appbox; + // Computation caches private $collectionMap; + private $fieldStructure; public function __construct(appbox $appbox) { - $this->connection = $appbox->get_connection(); + $this->appbox = $appbox; } public function getUniqueRecordId($databoxId, $recordId) @@ -46,12 +49,13 @@ class RecordHelper private function collectionMap() { if (!$this->collectionMap) { + $connection = $this->appbox->get_connection(); $sql = 'SELECT sbas_id as databox_id, server_coll_id as collection_id, base_id FROM bas'; - $statement = $this->connection->query($sql); + $statement = $connection->query($sql); $map = array(); while ($mapping = $statement->fetch()) { @@ -68,4 +72,97 @@ class RecordHelper { return StringUtils::slugify($key, '_'); } + + public function getFields($includePrivate = false, $onlySearchable = true) + { + $fields = array(); + foreach ($this->getFieldsStructure() as $name => $options) { + // Skip private fields + if ($options['private'] && !$includePrivate) { + continue; + } + // Skip not searchable fields + if ($onlySearchable && !$options['indexable']) { + continue; + } + $fields[] = $name; + } + + return $fields; + } + + public function getFieldsStructure() + { + if (!empty($this->fieldsStructure)) { + return $this->fieldsStructure; + } + + $fields = array(); + + foreach ($this->appbox->get_databoxes() as $databox) { + //printf("Databox %d\n", $databox->get_sbas_id()); + foreach ($databox->get_meta_structure() as $fieldStructure) { + $field = array(); 
+ // Field type + switch ($fieldStructure->get_type()) { + case \databox_field::TYPE_DATE: + $field['type'] = 'date'; + break; + case \databox_field::TYPE_NUMBER: + $field['type'] = 'double'; + break; + case \databox_field::TYPE_STRING: + case \databox_field::TYPE_TEXT: + $field['type'] = 'string'; + break; + default: + throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type())); + break; + } + + $name = $fieldStructure->get_name(); + + // Business rules + $field['private'] = $fieldStructure->isBusiness(); + $field['indexable'] = $fieldStructure->is_indexable(); + $field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); + + // Thesaurus concept inference + // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; + $helper = new ThesaurusHelper(); + + // TODO Not the real option yet + $field['thesaurus_concept_inference'] = $field['type'] === 'string'; + // TODO Find thesaurus path prefixes + $field['thesaurus_prefix'] = '/categories'; + + //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']); + + // Since mapping is merged between databoxes, two fields may + // have conflicting names. Indexing is the same for a given + // type so we reject only those with different types. 
+ if (isset($fields[$name])) { + if ($fields[$name]['type'] !== $field['type']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type'])); + } + + if ($fields[$name]['indexable'] !== $field['indexable']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name)); + } + + if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name)); + } + // TODO other structure incompatibilities + + //printf("Merged with previous \"%s\" field\n", $name); + } + + $fields[$name] = $field; + } + } + + $this->fieldsStructure = $fields; + return $this->fieldsStructure; + } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php index 3af8d73fdb..4e05e46ed9 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php @@ -23,19 +23,9 @@ class Query return $this->root->getTextNodes(); } - /* - * This method seems weird to me, the implementation returns true when the - * query doesn't contain IN statements, but that doesn't define a full text - * search. 
- */ - public function isFullTextOnly() + public function build(QueryContext $context) { - return $this->root->isFullTextOnly(); - } - - public function getElasticsearchQuery($fields = array()) - { - return $this->root->getQuery($fields); + return $this->root->buildQuery($context); } public function dump() diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php new file mode 100644 index 0000000000..8fc5176da9 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php @@ -0,0 +1,40 @@ +fields = $fields; + $this->locales = $locales; + $this->queryLocale = $queryLocale; + } + + public function narrowToFields(array $fields) + { + // Ensure we are not escaping from original fields restrictions + $fields = array_intersect($this->fields, $fields); + + return new static($fields, $this->locales, $this->queryLocale); + } + + public function getLocalizedFields() + { + $fields = array(); + foreach ($this->fields as $field) { + foreach ($this->locales as $locale) { + $boost = ($locale === $this->queryLocale) ? 
'^5' : ''; + $fields[] = sprintf('caption.%s.%s%s', $field, $locale, $boost); + } + // TODO Put generic analyzers on main field instead of "light" sub-field + $fields[] = sprintf('caption.%s.%s', $field, 'light^10'); + } + + return $fields; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php index 75318df71d..efb62efb79 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php @@ -119,7 +119,7 @@ class QueryVisitor implements Visit if ($root instanceof AST\TextNode && !$root instanceof AST\QuotedTextNode && !$node instanceof AST\QuotedTextNode) { - $root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText())); + $root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue())); } else { $root = new AST\AndExpression($root, $node); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php index 6fc1244ab8..66d81aa5a1 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php @@ -12,6 +12,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; use Elasticsearch\Client; class Thesaurus @@ -30,24 +33,34 @@ class Thesaurus public function findConceptsBulk(array $terms, $lang = null) { // TODO Use bulk queries for performance + $concepts = array(); + foreach ($terms as $term) { + $concepts[] = $this->findConcepts($term, $lang); + } + + return $concepts; } - public function findConcepts($term, $context = null, $lang = null) + public function findConcepts($term, $lang = null) { 
+ if (!($term instanceof TermInterface)) { + $term = new Term($term); + } + // TODO Check that term queries are ok with multiple words $query = array(); $field = $lang ? sprintf('value.%s', $lang) : 'value.light'; - $query['match'][$field]['query'] = $term; + $query['match'][$field]['query'] = $term->getValue(); $query['match'][$field]['operator'] = 'and'; // Allow 25% of non-matching tokens // (not exactly the same that 75% of matching tokens) // $query['match'][$field]['minimum_should_match'] = '-25%'; - if ($context) { + if ($term->hasContext()) { $term_query = $query; $query = array(); $query['bool']['must'][0] = $term_query; - $query['bool']['must'][1]['term']['context'] = $context; + $query['bool']['must'][1]['term']['context'] = $term->getContext(); } if ($lang) { @@ -86,7 +99,7 @@ class Thesaurus $buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); foreach ($buckets as $bucket) { if (isset($bucket['key'])) { - $concepts[] = $bucket['key']; + $concepts[] = new Concept($bucket['key']); } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php new file mode 100644 index 0000000000..b757ce92c0 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php @@ -0,0 +1,40 @@ +path = (string) $path; + } + + public function getPath() + { + return $this->path; + } + + public function __toString() + { + return $this->path; + } + + public static function toPathArray(array $concepts) + { + foreach ($concepts as $index => $concept) { + $concepts[$index] = $concept->getPath(); + } + return $concepts; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php new file mode 100644 index 0000000000..395b5bd765 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php @@ -0,0 +1,57 @@ +value = (string) $value; + if ($context) { + 
$this->context = (string) $context; + } + } + + public function getValue() + { + return $this->value; + } + + public function hasContext() + { + return $this->context !== null; + } + + public function getContext() + { + return $this->context; + } + + public function __toString() + { + return self::dump($this); + } + + public static function dump(TermInterface $term) + { + if ($term->hasContext()) { + return sprintf('%s (%s)', $term->getValue(), $term->getContext()); + } + + return $term->getValue(); + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php new file mode 100644 index 0000000000..72a9f05492 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php @@ -0,0 +1,19 @@ + Date: Fri, 16 Jan 2015 12:22:42 +0100 Subject: [PATCH 05/14] Context support in concept finding command --- .../Command/Thesaurus/FindConceptsCommand.php | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php index 269f6700dd..45fa006ccf 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php @@ -13,6 +13,7 @@ namespace Alchemy\Phrasea\Command\Thesaurus; use Alchemy\Phrasea\Command\Command; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Symfony\Component\Console\Input\InputArgument; use Symfony\Component\Console\Input\InputInterface; use Symfony\Component\Console\Input\InputOption; @@ -30,6 +31,11 @@ class FindConceptsCommand extends Command InputArgument::REQUIRED, 'Reverse search a term to infer concepts' ) + ->addArgument( + 'context', + InputArgument::OPTIONAL, + 'Restrict search to a specific term context' + ) ->addOption( 'locale', null, @@ -48,16 
+54,22 @@ class FindConceptsCommand extends Command protected function doExecute(InputInterface $input, OutputInterface $output) { $term = $input->getArgument('term'); + $context = $input->getArgument('context'); $raw = $input->getOption('raw'); if (!$raw) { - $output->writeln(sprintf('Finding linked concepts: %s', $term)); + $message = sprintf('Finding linked concepts: %s', $term); + if ($context) { + $message .= sprintf(' (with context %s)', $context); + } + $output->writeln($message); $output->writeln(str_repeat('-', 20)); } $thesaurus = $this->container['thesaurus']; + $term = new Term($term, $context); $locale = $input->getOption('locale'); - $concepts = $thesaurus->findConcepts($term, null, $locale); + $concepts = $thesaurus->findConcepts($term, $locale); if (count($concepts)) { $output->writeln($concepts); From 95606280376601ca3582c2994c61178d509e63cd Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Mon, 19 Jan 2015 19:59:53 +0100 Subject: [PATCH 06/14] Prune narrow concepts while building query --- .../Command/Thesaurus/FindConceptsCommand.php | 11 +++++++ .../SearchEngine/Elastic/AST/TextNode.php | 3 +- .../Elastic/Indexer/RecordIndexer.php | 2 +- .../Elastic/Thesaurus/Concept.php | 29 +++++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php index 45fa006ccf..ac5da01b6b 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/FindConceptsCommand.php @@ -13,6 +13,7 @@ namespace Alchemy\Phrasea\Command\Thesaurus; use Alchemy\Phrasea\Command\Command; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Symfony\Component\Console\Input\InputArgument; use Symfony\Component\Console\Input\InputInterface; @@ -42,6 +43,12 @@ class 
FindConceptsCommand extends Command InputOption::VALUE_REQUIRED, 'Specify input locale' ) + ->addOption( + 'broad', + null, + InputOption::VALUE_NONE, + 'Keep broad concepts (discards narrower concepts)' + ) ->addOption( 'raw', null, @@ -71,6 +78,10 @@ class FindConceptsCommand extends Command $locale = $input->getOption('locale'); $concepts = $thesaurus->findConcepts($term, $locale); + if ($input->getOption('broad')) { + $concepts = Concept::pruneNarrowConcepts($concepts); + } + if (count($concepts)) { $output->writeln($concepts); } elseif (!$raw) { diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php index e1b5a6e498..da7315385b 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php @@ -3,6 +3,7 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; class TextNode extends Node implements TermInterface @@ -31,7 +32,7 @@ class TextNode extends Node implements TermInterface if ($this->concepts) { $shoulds = array($query); - foreach ($this->concepts as $concept) { + foreach (Concept::pruneNarrowConcepts($this->concepts) as $concept) { $shoulds[]['term']['concept_paths'] = $concept->getPath(); } $query = array(); diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index 246548f18d..faa54ab42a 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -301,7 +301,7 @@ class RecordIndexer } } - $record['concept_paths'] = $this->findLinkedConcepts($structure, $record); + // $record['concept_paths'] = $this->findLinkedConcepts($structure, 
$record); return $record; } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php index b757ce92c0..0512abff0d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php @@ -25,6 +25,12 @@ class Concept return $this->path; } + public function isNarrowerThan(Concept $other) + { + // A concept is the child of another if it begins with the other + return 0 === strpos($this->getPath(), $other->getPath() . '/'); + } + public function __toString() { return $this->path; @@ -37,4 +43,27 @@ class Concept } return $concepts; } + + public static function pruneNarrowConcepts($concepts) + { + // Build a map with paths as keys + $concepts = array_combine(Concept::toPathArray($concepts), $concepts); + // Paths are sorted in advance to keep search O(n) + ksort($concepts); + // With sorting, the first element can't be a child + $broad = current($concepts); + next($concepts); + // Start pruning concepts narrower than current broad one + while ($concept = current($concepts)) { + if ($concept->isNarrowerThan($broad)) { + unset($concepts[key($concepts)]); + } else { + // End of prunable children, beginning of a new concept + $broad = $concept; + next($concepts); + } + } + + return $concepts; + } } From cd8c3cd85a0e2ff0962ec52f4b2b212745099951 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 11:34:07 +0100 Subject: [PATCH 07/14] Set thesaurus requested size to zero (only aggs are needed) --- lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php index 66d81aa5a1..8b1aef3dbc 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php @@ -91,6 +91,8 @@ class Thesaurus //
We also need to disable TF/IDF on terms, and try to boost score only // when the search match nearly all tokens of term's value field. $params['body']['min_score'] = self::MIN_SCORE; + // No need to get any hits since we extract data from aggs + $params['body']['size'] = 0; $response = $this->client->search($params); From 2f0863000ff77db35ce2e5c22774bd39dccb5500 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 15:21:51 +0100 Subject: [PATCH 08/14] Update null query node with new API --- .../Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php index 54e294de86..00890f8c03 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/NullQueryNode.php @@ -2,9 +2,11 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class NullQueryNode extends Node { - public function getQuery() + public function buildQuery(QueryContext $context) { return array('match_all' => array()); } @@ -18,9 +20,4 @@ class NullQueryNode extends Node { return ''; } - - public function isFullTextOnly() - { - return false; - } } From f9bb8c579c0b599629b4cb4866b0f88ac430fa2c Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 16:08:46 +0100 Subject: [PATCH 09/14] Remove "fields" field in thesaurus terms index --- .../Elastic/Indexer/TermIndexer.php | 49 +++---------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php index a8c83003ff..ec384e9ef9 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php 
@@ -41,39 +41,28 @@ class TermIndexer { foreach ($this->appbox->get_databoxes() as $databox) { /** @var databox $databox */ - $databoxId = $databox->get_sbas_id(); + $databoxId = $databox->get_sbas_id(); - $document = self::thesaurusFromDatabox($databox); - $dedicatedFieldTerms = $this->getDedicatedFieldTerms($databox, $document); + $visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) { + // Path and id are prefixed with a databox identifier to not + // collide with other databoxes terms - $visitor = new TermVisitor(function ($term) use ($bulk, $databoxId, $dedicatedFieldTerms) { - //printf("- %s (%s)\n", $term['path'], $term['value']); // Term structure - $id = $term['id']; + $id = sprintf('%s_%s', $databoxId, $term['id']); unset($term['id']); - + $term['path'] = sprintf('/%s%s', $databoxId, $term['path']); $term['databox_id'] = $databoxId; - $term['branch_id'] = $id; - - // @todo move to the TermVisitor? dunno. - $term['fields'] = null; - foreach ($dedicatedFieldTerms as $partialId => $fields) { - if (strpos($id, $partialId) === 0) { - foreach ($fields as $field) { - $term['fields'][] = $field; - } - } - } // Index request $params = array(); - $params['id'] = sprintf('%s_%s', $databoxId, $id); + $params['id'] = $id; $params['type'] = self::TYPE_NAME; $params['body'] = $term; $bulk->index($params); }); + $document = self::thesaurusFromDatabox($databox); $this->navigator->walk($document, $visitor); } } @@ -88,26 +77,6 @@ class TermIndexer return $dom; } - private function getDedicatedFieldTerms(databox $databox, DOMDocument $document) - { - $xpath = new \DOMXpath($document); - $dedicatedFieldTerms = []; - - foreach ($databox->get_meta_structure() as $f) { - if ($f->get_tbranch()) { - $elements = $xpath->query($f->get_tbranch()); - - if ($elements) { - foreach ($elements as $element) { - $dedicatedFieldTerms[$element->getAttribute('id')][] = $f->get_name(); - } - } - } - } - - return $dedicatedFieldTerms; - } - public function getMapping() { $mapping 
= new Mapping(); @@ -119,9 +88,7 @@ class TermIndexer ->add('context', 'string')->addAnalyzedVersion($this->locales) ->add('path', 'string')->notAnalyzed() ->add('lang', 'string')->notAnalyzed() - ->add('branch_id', 'string')->notAnalyzed() ->add('databox_id', 'integer') - ->add('fields', 'string')->notAnalyzed() ; return $mapping->export(); From 1192d11a3af0adb9f964796829de171c82409221 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 16:32:07 +0100 Subject: [PATCH 10/14] Use general_light analyzer for value and context, remove light subfield --- .../Elastic/Indexer/TermIndexer.php | 6 +++-- .../Phrasea/SearchEngine/Elastic/Mapping.php | 26 ++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php index ec384e9ef9..54f4e43b08 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/TermIndexer.php @@ -84,8 +84,10 @@ class TermIndexer ->add('raw_value', 'string')->notAnalyzed() ->add('value', 'string') ->analyzer('general_light') - ->addAnalyzedVersion($this->locales) - ->add('context', 'string')->addAnalyzedVersion($this->locales) + ->addLocalizedSubfields($this->locales) + ->add('context', 'string') + ->analyzer('general_light') + ->addLocalizedSubfields($this->locales) ->add('path', 'string')->notAnalyzed() ->add('lang', 'string')->notAnalyzed() ->add('databox_id', 'integer') diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php index 2232d65d19..8552f2d24c 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php @@ -146,22 +146,30 @@ class Mapping return $this; } - public function addAnalyzedVersion(array $langs) + /** + * @deprecated + */ + public function addAnalyzedVersion(array 
$locales) { $field = &$this->currentField(); - - foreach ($langs as $lang) { - $field['fields'][$lang] = [ - 'type' => $field['type'], - 'analyzer' => sprintf('%s_full', $lang) - ]; - } - $field['fields']['light'] = [ 'type' => $field['type'], 'analyzer' => 'general_light' ]; + return $this->addLocalizedSubfields($locales); + } + + public function addLocalizedSubfields(array $locales) + { + $field = &$this->currentField(); + + foreach ($locales as $locale) { + $field['fields'][$locale] = array(); + $field['fields'][$locale]['type'] = $field['type']; + $field['fields'][$locale]['analyzer'] = sprintf('%s_full', $locale); + } + return $this; } From 55464f075b8b7b9305b63d53578e0521285649c6 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 16:51:33 +0100 Subject: [PATCH 11/14] Indexable -> Searchable --- .../SearchEngine/Elastic/Indexer/RecordIndexer.php | 4 ++-- lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index faa54ab42a..c095c687bd 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -191,9 +191,9 @@ class RecordIndexer } if ($params['type'] === Mapping::TYPE_STRING) { - if (!$params['indexable'] && !$params['to_aggregate']) { + if (!$params['searchable'] && !$params['to_aggregate']) { $m->notIndexed(); - } elseif (!$params['indexable'] && $params['to_aggregate']) { + } elseif (!$params['searchable'] && $params['to_aggregate']) { $m->notAnalyzed(); $m->addRawVersion(); } else { diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php index 7bfb7c043b..da4710ca95 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php +++ 
b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php @@ -82,7 +82,7 @@ class RecordHelper continue; } // Skip not searchable fields - if ($onlySearchable && !$options['indexable']) { + if ($onlySearchable && !$options['searchable']) { continue; } $fields[] = $name; @@ -124,7 +124,7 @@ class RecordHelper // Business rules $field['private'] = $fieldStructure->isBusiness(); - $field['indexable'] = $fieldStructure->is_indexable(); + $field['searchable'] = $fieldStructure->is_indexable(); $field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); // Thesaurus concept inference @@ -146,8 +146,8 @@ class RecordHelper throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type'])); } - if ($fields[$name]['indexable'] !== $field['indexable']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name)); + if ($fields[$name]['searchable'] !== $field['searchable']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible searchable state", $name)); } if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) { From a4df8d02e00260fa5e2c062faeb05d183c87d882 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 17:29:15 +0100 Subject: [PATCH 12/14] Remove subdefs from mapping --- .../Elastic/Indexer/RecordIndexer.php | 15 ++---------- .../Phrasea/SearchEngine/Elastic/Mapping.php | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index c095c687bd..31493be0b6 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -154,6 +154,8 @@ class RecordIndexer ->analyzer('thesaurus_path', 'indexing') ->analyzer('keyword', 'searching') ->addRawVersion() + 
// Keep subdefs arround for display purpose + ->addDisabled('subdefs'); ; // Index title @@ -164,19 +166,6 @@ class RecordIndexer } $mapping->add('title', $titleMapping); - // Minimal subdefs mapping info for display purpose - $subdefMapping = new Mapping(); - $subdefMapping->add('path', 'string')->notAnalyzed()->notIndexed(); - $subdefMapping->add('height', 'integer')->notIndexed(); - $subdefMapping->add('width', 'integer')->notIndexed(); - - $subdefsMapping = new Mapping(); - $subdefsMapping->add('thumbnail', $subdefMapping); - $subdefsMapping->add('thumbnailgif', $subdefMapping); - $subdefsMapping->add('preview', $subdefMapping); - - $mapping->add('subdefs', $subdefsMapping); - // Caption mapping $captionMapping = new Mapping(); $mapping->add('caption', $captionMapping); diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php index 8552f2d24c..b748360e2e 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php @@ -72,6 +72,13 @@ class Mapping return $this; } + public function addDisabled($name) + { + $this->add($name, new self())->disable(); + + return $this; + } + public function export() { return ['properties' => $this->exportProperties()]; @@ -134,6 +141,22 @@ class Mapping return $this; } + /** + * Allows to disable parsing and indexing a named object completely. + * This is handy when a portion of the JSON document contains arbitrary JSON + * which should not be indexed, nor added to the mapping. 
+ */ + public function disable() + { + $field = &$this->currentField(); + if ($field['type'] !== self::TYPE_OBJECT) { + throw new LogicException('Only object fields can be disabled'); + } + $field['enabled'] = false; + + return $this; + } + public function addRawVersion() { $field = &$this->currentField(); From e72c8b9674b13b0458c13a0df045eea6d95aa61e Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 18:21:57 +0100 Subject: [PATCH 13/14] Reword disabled mapping --- .../Elastic/Indexer/RecordIndexer.php | 2 +- .../Phrasea/SearchEngine/Elastic/Mapping.php | 40 ++++++++----------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index 31493be0b6..579e155a70 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -155,7 +155,7 @@ class RecordIndexer ->analyzer('keyword', 'searching') ->addRawVersion() // Keep subdefs arround for display purpose - ->addDisabled('subdefs'); + ->add('subdefs', Mapping::disabledMapping()) ; // Index title diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php index b748360e2e..3093c96e9a 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Mapping.php @@ -18,6 +18,7 @@ class Mapping { private $fields = array(); private $current; + private $enabled = true; const DATE_FORMAT_MYSQL = 'yyyy-MM-dd HH:mm:ss'; const DATE_FORMAT_CAPTION = 'yyyy/MM/dd'; // ES format @@ -54,7 +55,7 @@ class Mapping $field = array(); if ($type instanceof self) { $field['type'] = self::TYPE_OBJECT; - $field['properties'] = $type; + $field['mapping'] = $type; } elseif (in_array($type, self::$types)) { $field['type'] = $type; @@ -72,29 +73,21 @@ class Mapping return $this; } - 
public function addDisabled($name) - { - $this->add($name, new self())->disable(); - - return $this; - } - public function export() { - return ['properties' => $this->exportProperties()]; - } - - public function exportProperties() - { - $properties = array(); + $mapping = array(); foreach ($this->fields as $name => $field) { - $properties[$name] = $field; if ($field['type'] === self::TYPE_OBJECT) { - $properties[$name]['properties'] = $field['properties']->exportProperties(); + $field = $field['mapping']->export(); } + $mapping['properties'][$name] = $field; } - return $properties; + if (!$this->enabled) { + $mapping['enabled'] = false; + } + + return $mapping; } public function analyzer($analyzer, $type = null) @@ -141,18 +134,19 @@ class Mapping return $this; } + public static function disabledMapping() + { + return (new self())->disable(); + } + /** * Allows to disable parsing and indexing a named object completely. * This is handy when a portion of the JSON document contains arbitrary JSON * which should not be indexed, nor added to the mapping. 
*/ - public function disable() + private function disable() { - $field = &$this->currentField(); - if ($field['type'] !== self::TYPE_OBJECT) { - throw new LogicException('Only object fields can be disabled'); - } - $field['enabled'] = false; + $this->enabled = false; return $this; } From c1d20c77eeec3948ef677a07f408f459e840f7c4 Mon Sep 17 00:00:00 2001 From: Mathieu Darse Date: Tue, 20 Jan 2015 18:23:25 +0100 Subject: [PATCH 14/14] Remove from index some unused fields & clean up --- .../Elastic/Indexer/RecordIndexer.php | 33 +++++++------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index 579e155a70..a2d8d05d61 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -137,13 +137,13 @@ class RecordIndexer ->add('record_id', 'integer') // Compound primary key ->add('databox_id', 'integer') // Compound primary key ->add('base_id', 'integer') // Unique collection ID - ->add('collection_id', 'integer') // Useless collection ID (local to databox) - ->add('collection_name', 'string')->notAnalyzed() // Collection name - ->add('uuid', 'string')->notAnalyzed() - ->add('sha256', 'string')->notAnalyzed() + ->add('collection_id', 'integer')->notIndexed() // Useless collection ID (local to databox) + ->add('collection_name', 'string')->notIndexed() // Collection name + ->add('uuid', 'string')->notIndexed() + ->add('sha256', 'string')->notIndexed() // Mandatory metadata - ->add('original_name', 'string')->notAnalyzed() - ->add('mime', 'string')->notAnalyzed() + ->add('original_name', 'string')->notIndexed() + ->add('mime', 'string')->notIndexed() ->add('type', 'string')->notAnalyzed() ->add('record_type', 'string')->notAnalyzed() // record or story // Dates @@ -154,18 +154,15 @@ class RecordIndexer 
->analyzer('thesaurus_path', 'indexing') ->analyzer('keyword', 'searching') ->addRawVersion() - // Keep subdefs arround for display purpose + // EXIF + ->add('exif', $this->getExifMapping()) + // Status + ->add('flags', $this->getFlagsMapping()) + // Keep some fields around for display purposes ->add('subdefs', Mapping::disabledMapping()) + ->add('title', Mapping::disabledMapping()) ; - // Index title - $titleMapping = new Mapping(); - $titleMapping->add('default', 'string')->notAnalyzed()->notIndexed(); - foreach ($this->locales as $locale) { - $titleMapping->add($locale, 'string')->notAnalyzed()->notIndexed(); - } - $mapping->add('title', $titleMapping); - // Caption mapping $captionMapping = new Mapping(); $mapping->add('caption', $captionMapping); @@ -192,12 +189,6 @@ class RecordIndexer } } - // EXIF - $mapping->add('exif', $this->getExifMapping()); - - // Status - $mapping->add('flags', $this->getFlagsMapping()); - return $mapping->export(); }