From 63bee24775a77ff30ca8249f01207a280df2dc1f Mon Sep 17 00:00:00 2001 From: jygaulier Date: Thu, 8 Apr 2021 18:15:27 +0200 Subject: [PATCH] PHRAS-3389_use-only-conceptpaths-from-selected-dbs_MASTER fix : search only concept-paths from the relevant thesaurus fix : search only fields from the relevant databoxes (collections) --- .../Elastic/AST/AbstractTermNode.php | 44 ++++-- .../SearchEngine/Elastic/AST/TextNode.php | 11 +- .../Record/Hydrator/ThesaurusHydrator.php | 15 +- .../Elastic/Search/QueryCompiler.php | 10 +- .../Elastic/Search/QueryContext.php | 12 +- .../Elastic/Search/QueryHelper.php | 19 ++- .../SearchEngine/Elastic/Structure/Field.php | 28 +++- .../Elastic/Structure/GlobalStructure.php | 5 + .../Elastic/Structure/LimitedStructure.php | 31 +++- .../Elastic/Structure/Structure.php | 5 + .../SearchEngine/Elastic/Thesaurus.php | 137 +++++++++++++----- .../Elastic/Thesaurus/Concept.php | 9 +- .../SearchEngine/Elastic/Thesaurus/Helper.php | 51 ++----- .../Phrasea/SearchEngine/AST/TermNodeTest.php | 9 +- .../Phrasea/SearchEngine/AST/TextNodeTest.php | 5 +- .../SearchEngine/Structure/FieldTest.php | 11 +- .../SearchEngine/Structure/StructureTest.php | 5 +- .../SearchEngine/Thesaurus/ConceptTest.php | 24 +-- 18 files changed, 290 insertions(+), 141 deletions(-) diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AbstractTermNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AbstractTermNode.php index a822123c97..368f21806c 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AbstractTermNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AbstractTermNode.php @@ -2,8 +2,7 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; -use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; -use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryHelper; +use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; @@ -12,6 +11,14 @@ abstract class AbstractTermNode extends Node implements TermInterface protected $text; protected $context; private $concepts = []; + /** + * pruned concepts is a reduced list of concepts, keeping only high-level ones + * by removing concepts-included-in-concept, + * e.g. + * /1/animal/mamal + * /1/animal/mamal/dog -- removed because included + * /2/subject/animal + */ private $pruned_concepts; public function __construct($text, Context $context = null) @@ -26,6 +33,9 @@ abstract class AbstractTermNode extends Node implements TermInterface $this->concepts = $concepts; } + /** + * @return Concept[] + */ private function getPrunedConcepts() { if ($this->pruned_concepts === null) { @@ -34,6 +44,10 @@ abstract class AbstractTermNode extends Node implements TermInterface return $this->pruned_concepts; } + /** + * @param Field[] $fields + * @return array + */ protected function buildConceptQueries(array $fields) { $concepts = $this->getPrunedConcepts(); @@ -43,20 +57,26 @@ abstract class AbstractTermNode extends Node implements TermInterface $index_fields = []; foreach ($fields as $field) { - $index_fields[] = $field->getConceptPathIndexField(); - } - if (!$index_fields) { - return []; + // $db = $field->get_databox_id(); + foreach ($field->getDependantDataboxes() as $db) { + if(!array_key_exists($db, $index_fields)) { + $index_fields[$db] = []; + } + $index_fields[$db][] = $field->getConceptPathIndexField(); + } } $queries = []; foreach ($concepts as $concept) { - $queries[] = [ - 'multi_match' => [ - 'fields' => $index_fields, - 'query' => $concept->getPath() - ] - ]; + $db = $concept->getDataboxId(); + if(array_key_exists($db, $index_fields)) { + $queries[] = [ + 'multi_match' => [ + 'fields' => $index_fields[$db], + 'query' => $concept->getPath() + ] + ]; + } } return $queries; } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php index bf239ffbd1..ed4293eaaa 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php @@ -4,7 +4,7 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryHelper; -use Alchemy\Phrasea\SearchEngine\Elastic\Structure\ValueChecker; +use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; class TextNode extends AbstractTermNode implements ContextAbleInterface @@ -39,15 +39,20 @@ class TextNode extends AbstractTermNode implements ContextAbleInterface public function buildQuery(QueryContext $context) { $query_builder = function (array $fields) use ($context) { + /** @var Field[] $fields */ // Full text $index_fields = []; - foreach (ValueChecker::filterByValueCompatibility($fields, $this->text) as $field) { + $th_fields = []; + foreach ($fields as $field) { foreach ($context->localizeField($field) as $f) { $index_fields[] = $f; } foreach ($context->truncationField($field) as $f) { $index_fields[] = $f; } + if($field->hasConceptInference()) { + $th_fields[] = $field; + } } if (!$index_fields) { return null; @@ -62,7 +67,7 @@ class TextNode extends AbstractTermNode implements ContextAbleInterface ] ]; // Thesaurus - $concept_queries = $this->buildConceptQueries($fields); + $concept_queries = $this->buildConceptQueries($th_fields); foreach ($concept_queries as $concept_query) { $query = QueryHelper::applyBooleanClause($query, 'should', $concept_query); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/Record/Hydrator/ThesaurusHydrator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/Record/Hydrator/ThesaurusHydrator.php index d8076b2922..e2992061e0 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/Record/Hydrator/ThesaurusHydrator.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/Record/Hydrator/ThesaurusHydrator.php @@ -12,15 +12,12 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer\Record\Hydrator; use Alchemy\Phrasea\SearchEngine\Elastic\Exception\Exception; -use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; +use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\CandidateTerms; -use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Filter; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; -use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Structure; -use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; class ThesaurusHydrator implements HydratorInterface { @@ -64,12 +61,14 @@ class ThesaurusHydrator implements HydratorInterface throw new Exception('Expected a record with the "databox_id" key set.'); } + $sbid = $record['databox_id']; + $values = array(); $terms = array(); $filters = array(); $field_names = array(); /** @var Field[] $dbFields */ - $dbFields = $this->structure->getAllFieldsByDatabox($record['databox_id']); + $dbFields = $this->structure->getAllFieldsByDatabox($sbid); foreach ($fields as $name => $field) { if(!array_key_exists($name, $dbFields) || !$dbFields[$name]->get_generate_cterms()) { continue; @@ -82,8 +81,8 @@ class ThesaurusHydrator implements HydratorInterface // Concepts are databox's specific, but when no root concepts are // given we need to make sure we only match in the right databox. $filter = $root_concepts - ? Filter::childOfConcepts($record['databox_id'], $root_concepts) - : Filter::byDatabox($record['databox_id']); + ? Filter::childOfConcepts($sbid, $root_concepts) + : Filter::byDatabox($sbid); foreach ($field_values as $value) { $values[] = $value; $terms[] = Term::parse($value); @@ -95,7 +94,7 @@ class ThesaurusHydrator implements HydratorInterface if(empty($terms)) { return; } - $bulk = $this->thesaurus->findConceptsBulk($terms, null, $filters, true); + $bulk = $this->thesaurus->findConceptsBulk($terms, [$sbid], null, $filters, true); foreach ($bulk as $offset => $item_concepts) { $name = $field_names[$offset]; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryCompiler.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryCompiler.php index ac3de7929a..6b782a57be 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryCompiler.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryCompiler.php @@ -32,17 +32,21 @@ class QueryCompiler public function compile($string, QueryContext $context) { $query = $this->parse($string); - $this->injectThesaurusConcepts($query); + $this->injectThesaurusConcepts($query, $context); return $query->build($context); } - private function injectThesaurusConcepts(Query $query) + /** + * @param Query $query + * @param QueryContext $context + */ + private function injectThesaurusConcepts(Query $query, $context) { // TODO We must restrict thesaurus matching for IN queries, and only // search in each field's root concepts. $nodes = $query->getTermNodes(); - $concepts = $this->thesaurus->findConceptsBulk($nodes); + $concepts = $this->thesaurus->findConceptsBulk($nodes, $context->getDataboxes()); foreach ($concepts as $index => $termConcepts) { $node = $nodes[$index]; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php index 69baf9f5bf..5f2a4d95a3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php @@ -2,12 +2,11 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Search; -use Alchemy\Phrasea\SearchEngine\Elastic\Exception\QueryException; -use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Field as ASTField; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Flag; +use Alchemy\Phrasea\SearchEngine\Elastic\Exception\QueryException; +use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; +use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Structure; use Alchemy\Phrasea\SearchEngine\SearchEngineOptions; @@ -43,6 +42,11 @@ class QueryContext $this->options = $options; } + public function getDataboxes() + { + return $this->structure->getDataboxes(); + } + public function narrowToFields(array $fields) { if (is_array($this->fields)) { diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryHelper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryHelper.php index 071ac94b1c..5500b8991d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryHelper.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryHelper.php @@ -2,14 +2,18 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Search; -use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; class QueryHelper { private function __construct() {} + /** + * @param Field[] $private_fields + * @param Field[] $unrestricted_fields + * @param \Closure $query_builder + * @return array + */ public static function wrapPrivateFieldQueries(array $private_fields, array $unrestricted_fields, \Closure $query_builder) { // We make a boolean clause for each collection set to shrink query size @@ -31,7 +35,16 @@ class QueryHelper foreach ($fields_map as $hash => $fields) { // Right to query on a private field is dependant of document collection // Here we make sure we can only match on allowed collections - $query = $query_builder(array_merge($fields, $unrestricted_fields)); + $relevant_fields = []; + foreach($unrestricted_fields as $uf) { + foreach ($uf->getDependantCollections() as $c) { + if(in_array($c, $collections_map[$hash])) { + $relevant_fields[] = $uf; + break; + } + } + } + $query = $query_builder(array_merge($fields, $relevant_fields)); if ($query !== null) { $queries[] = self::restrictQueryToCollections($query, $collections_map[$hash]); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Field.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Field.php index 63ba164e34..1ed21204a0 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Field.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Field.php @@ -49,6 +49,8 @@ class Field implements Typed private $used_by_collections; + private $used_by_databoxes; + public static function createFromLegacyField(databox_field $field) { $type = self::getTypeFromLegacy($field); @@ -75,7 +77,8 @@ class Field implements Typed 'facet' => $facet, 'thesaurus_roots' => $roots, 'generate_cterms' => $field->get_generate_cterms(), - 'used_by_collections' => $databox->get_collection_unique_ids() + 'used_by_collections' => $databox->get_collection_unique_ids(), + 'used_by_databoxes' => [$databox->get_sbas_id()] ]); } @@ -107,6 +110,7 @@ class Field implements Typed $this->thesaurus_roots = \igorw\get_in($options, ['thesaurus_roots'], null); $this->generate_cterms = \igorw\get_in($options, ['generate_cterms'], false); $this->used_by_collections = \igorw\get_in($options, ['used_by_collections'], []); + $this->used_by_databoxes = \igorw\get_in($options, ['used_by_databoxes'], []); } else { // todo: this is faster code, but need to fix unit-tests to pass all options @@ -117,6 +121,7 @@ class Field implements Typed $this->thesaurus_roots = $options['thesaurus_roots']; $this->generate_cterms = $options['generate_cterms']; $this->used_by_collections = $options['used_by_collections']; + $this->used_by_databoxes = $options['used_by_databoxes']; } } @@ -129,7 +134,8 @@ class Field implements Typed 'facet' => $this->facet, 'thesaurus_roots' => $this->thesaurus_roots, 'generate_cterms' => $this->generate_cterms, - 'used_by_collections' => $this->used_by_collections + 'used_by_collections' => $this->used_by_collections, + 'used_by_databoxes' => $this->used_by_databoxes ]); } @@ -168,6 +174,11 @@ class Field implements Typed return $this->used_by_collections; } + public function getDependantDataboxes() + { + return $this->used_by_databoxes; + } + public function isSearchable() { return $this->is_searchable; @@ -255,9 +266,20 @@ class Field implements Typed ) ); + $used_by_databoxes = array_values( + array_unique( + array_merge( + $this->used_by_databoxes, + $other->used_by_databoxes + ), + SORT_REGULAR + ) + ); + return $this->withOptions([ 'thesaurus_roots' => $thesaurus_roots, - 'used_by_collections' => $used_by_collections + 'used_by_collections' => $used_by_collections, + 'used_by_databoxes' => $used_by_databoxes ]); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/GlobalStructure.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/GlobalStructure.php index 6ffb8cc8ed..be6e0ad423 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/GlobalStructure.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/GlobalStructure.php @@ -98,6 +98,11 @@ final class GlobalStructure implements Structure } } + public function getDataboxes() + { + return array_keys($this->fieldsByDatabox); + } + /** * @return Field[] */ diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/LimitedStructure.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/LimitedStructure.php index 053ca6b0e0..4f52f45298 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/LimitedStructure.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/LimitedStructure.php @@ -32,6 +32,11 @@ final class LimitedStructure implements Structure $this->search_options = $search_options; } + public function getDataboxes() + { + return array_keys($this->search_options->getCollectionsReferencesByDatabox()); + } + public function getAllFields() { return $this->limit($this->structure->getAllFields()); @@ -39,7 +44,8 @@ final class LimitedStructure implements Structure public function getUnrestrictedFields() { - return $this->structure->getUnrestrictedFields(); + // return $this->structure->getUnrestrictedFields(); + return $this->limit($this->structure->getUnrestrictedFields()); } public function getPrivateFields() @@ -93,7 +99,7 @@ final class LimitedStructure implements Structure return $this->structure->getMetadataTagByName($name); } - private function limit(array $fields) + private function old_limit(array $fields) { $allowed_collections = $this->allowedCollections(); // Filter private field collections (base_id) on which access is restricted. @@ -111,6 +117,27 @@ final class LimitedStructure implements Structure return $limited_fields; } + private function limit(array $fields) + { + $allowed_collections = $this->allowedCollections(); + // Filter private field collections (base_id) on which access is restricted. + $limited_fields = []; + foreach ($fields as $name => $field) { + $field = $this->limitField($field, $allowed_collections); + if(empty($field->getDependantCollections())) { + continue; + } + if ($field->isPrivate()) { + // Private fields without collections can't be ever visible, we skip them + if (!$field->getDependantCollections()) { + continue; + } + } + $limited_fields[$name] = $field; + } + return $limited_fields; + } + private function limitField(Field $field, array $allowed_collections = null) { if ($allowed_collections === null) { diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Structure.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Structure.php index 72462488e2..cef7771b59 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Structure.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Structure/Structure.php @@ -12,6 +12,11 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Structure; interface Structure { + /** + * @return mixed + */ + public function getDataboxes(); + /** * @return Field[] */ diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php index 1acc2c740f..ed950c8b18 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php @@ -45,7 +45,7 @@ class Thesaurus * @param boolean $strict Strict mode matching * @return Concept[][] List of matching concepts for each term */ - public function findConceptsBulk(array $terms, $lang = null, $filter = null, $strict = false) + public function findConceptsBulk(array $terms, array $databoxIds, $lang = null, $filter = null, $strict = false) { $this->logger->debug(sprintf('Finding linked concepts in bulk for %d terms', count($terms))); @@ -61,7 +61,7 @@ class Thesaurus $concepts = array(); foreach ($terms as $index => $term) { $strict |= ($term instanceof AST\TermNode); // a "term" node is [strict group of words] - $concepts[] = $this->findConcepts($term, $lang, $filters[$index], $strict); + $concepts[] = $this->findConcepts($term, $databoxIds, $lang, $filters[$index], $strict); } return $concepts; @@ -79,16 +79,16 @@ class Thesaurus * @param boolean $strict Whether to enable strict search or not * @return Concept[] Matching concepts */ - public function findConcepts($term, $lang = null, Filter $filter = null, $strict = false) + public function findConcepts($term, array $databoxIds, $lang = null, Filter $filter = null, $strict = false) { return $strict ? - $this->findConceptsStrict($term, $lang, $filter) + $this->findConceptsStrict($term, $databoxIds, $lang, $filter) : - $this->findConceptsFuzzy($term, $lang, $filter) + $this->findConceptsFuzzy($term, $databoxIds, $lang, $filter) ; } - private function findConceptsStrict($term, $lang = null, Filter $filter = null) + private function findConceptsStrict($term, array $databoxIds, $lang = null, Filter $filter = null) { if (!($term instanceof TermInterface)) { $term = new Term($term); @@ -126,6 +126,24 @@ class Thesaurus ] ]; } + + if(count($databoxIds) > 0) { + if(count($databoxIds) == 1) { + $filters[] = [ + 'term' => [ + 'databox_id' => $databoxIds[0] + ] + ]; + } + else { + $filters[] = [ + 'terms' => [ + 'databox_id' => $databoxIds + ] + ]; + } + } + if ($lang) { $filters[] = [ 'term' => [ @@ -133,6 +151,7 @@ class Thesaurus ] ]; } + if ($filter) { $filters = array_merge($filters, $filter->getQueryFilters()); } @@ -184,12 +203,18 @@ class Thesaurus // Extract concept paths from response $concepts = array(); - $buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); + $db_buckets = \igorw\get_in($response, ['aggregations', 'db', 'buckets'], []); $keys = array(); - foreach ($buckets as $bucket) { - if (isset($bucket['key'])) { - $keys[] = $bucket['key']; - $concepts[] = new Concept($bucket['key']); + foreach ($db_buckets as $db_bucket) { + if (isset($db_bucket['key'])) { + $db = $db_bucket['key']; + $cp_buckets = \igorw\get_in($db_bucket, ['cp', 'buckets'], []); + foreach ($cp_buckets as $cp_bucket) { + if (isset($cp_bucket['key'])) { + $keys[] = $cp_bucket['key']; + $concepts[] = new Concept($db, $cp_bucket['key']); + } + } } } @@ -200,7 +225,7 @@ class Thesaurus return $concepts; } - private function findConceptsFuzzy($term, $lang = null, Filter $filter = null) + private function findConceptsFuzzy($term, array $databoxIds, $lang = null, Filter $filter = null) { if (!($term instanceof TermInterface)) { $term = new Term($term); @@ -236,6 +261,29 @@ class Thesaurus $query['bool']['must'][1] = $context_query; } + if(count($databoxIds) > 0) { + if(count($databoxIds) == 1) { + $query = self::applyQueryFilter( + $query, + [ + 'term' => [ + 'databox_id' => $databoxIds[0] + ] + ] + ); + } + else { + $query = self::applyQueryFilter( + $query, + [ + 'terms' => [ + 'databox_id' => $databoxIds + ] + ] + ); + } + } + if ($lang) { $lang_filter = array(); $lang_filter['term']['lang'] = $lang; @@ -246,36 +294,55 @@ class Thesaurus $this->logger->debug('Using filter', array('filter' => Filter::dump($filter))); $query = self::applyQueryFilter($query, $filter->getQueryFilter()); } + $params = [ + 'index' => $this->options->getIndexName(), + 'type' => TermIndexer::TYPE_NAME, + 'body' => [ + 'query' => $query, + 'aggs' => [ + // Path deduplication + 'db' => [ // databox_id + 'terms' => [ + 'field' => 'databox_id' + ], + 'aggs' => [ + // Path deduplication + 'cp' => [ // concept_path + 'terms' => [ + 'field' => 'path.raw' + ] + ] + ], - // Path deduplication - $aggs = array(); - $aggs['dedup']['terms']['field'] = 'path.raw'; - - // Search request - $params = array(); - $params['index'] = $this->options->getIndexName(); - $params['type'] = TermIndexer::TYPE_NAME; - $params['body']['query'] = $query; - $params['body']['aggs'] = $aggs; - // Arbitrary score low limit, we need find a more granular way to remove - // inexact concepts. - // We also need to disable TF/IDF on terms, and try to boost score only - // when the search match nearly all tokens of term's value field. - $params['body']['min_score'] = $this->options->getMinScore(); - // No need to get any hits since we extract data from aggs - $params['body']['size'] = 0; + ] + ], + // Arbitrary score low limit, we need find a more granular way to remove + // inexact concepts. + // We also need to disable TF/IDF on terms, and try to boost score only + // when the search match nearly all tokens of term's value field. + 'min_score' => $this->options->getMinScore(), + // No need to get any hits since we extract data from aggs + 'size' => 0 + ] + ]; $this->logger->debug('Sending search', $params['body']); $response = $this->client->search($params); // Extract concept paths from response - $concepts = array(); - $buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); + $concepts = []; + $db_buckets = \igorw\get_in($response, ['aggregations', 'db', 'buckets'], []); $keys = array(); - foreach ($buckets as $bucket) { - if (isset($bucket['key'])) { - $keys[] = $bucket['key']; - $concepts[] = new Concept($bucket['key']); + foreach ($db_buckets as $db_bucket) { + if (isset($db_bucket['key'])) { + $db = $db_bucket['key']; + $cp_buckets = \igorw\get_in($db_bucket, ['cp', 'buckets'], []); + foreach ($cp_buckets as $cp_bucket) { + if (isset($cp_bucket['key'])) { + $keys[] = $cp_bucket['key']; + $concepts[] = new Concept($db, $cp_bucket['key']); + } + } } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php index f8d44b0499..bd1f884b3d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php @@ -13,13 +13,20 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; class Concept { + private $databox_id; private $path; - public function __construct($path) + public function __construct($databox_id, $path) { + $this->databox_id = $databox_id; $this->path = (string) $path; } + public function getDataboxId() + { + return $this->databox_id; + } + public function getPath() { return $this->path; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Helper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Helper.php index ed2d468e60..526d2b1ff0 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Helper.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Helper.php @@ -33,52 +33,27 @@ class Helper /** @var DOMElement $node */ foreach ($nodes as $node) { - if(1) { - $me_and_parents = array_merge([$node], self::getElementAncestors($node)); + $me_and_parents = array_merge([$node], self::getElementAncestors($node)); - $path_segments = []; + $path_segments = []; - foreach ($me_and_parents as $me_or_parent) { - if (!Navigator::isConcept($me_or_parent)) { - // Silently skips invalid targeted nodes - break; - } - - $path_segments[] = self::conceptPathSegment($me_or_parent); + foreach ($me_and_parents as $me_or_parent) { + if (!Navigator::isConcept($me_or_parent)) { + // Silently skips invalid targeted nodes + break; } - // Concept paths are have databox identifier at root level - $concepts[] = new Concept(sprintf( + $path_segments[] = self::conceptPathSegment($me_or_parent); + } + + // Concept paths are have databox identifier at root level + $concepts[] = new Concept( + $databox->get_sbas_id(), + sprintf( '/%d/%s', $databox->get_sbas_id(), implode('/', array_reverse($path_segments)) )); - } - else { - $path = ''; - // go up thru parents - while ($node) { - $v = null; - for ($n = $node->firstChild; $n; $n = $n->nextSibling) { - if ($n->nodeType === XML_ELEMENT_NODE && $n->nodeName === 'sy') { - if ($v === null) { - $v = $n->getAttribute('v'); - continue; - } - if ($n->getAttribute('lng') === 'en') { - $v = $n->getAttribute('v'); - break; - } - } - } - if ($v !== null) { - $path = '/' . $v . $path; - } - $node = $node->parentNode; - } - $path = '/' . $databox->get_sbas_id() . $path; - $concepts[] = new Concept($path); - } } return $concepts; diff --git a/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TermNodeTest.php b/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TermNodeTest.php index aa68edbfbd..d07f57e357 100644 --- a/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TermNodeTest.php +++ b/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TermNodeTest.php @@ -5,7 +5,6 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\AST; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context; use Alchemy\Phrasea\SearchEngine\Elastic\AST\TermNode; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; @@ -39,8 +38,8 @@ class TermNodeTest extends \PHPUnit_Framework_TestCase $node = new TermNode('bar'); $node->setConcepts([ - new Concept('/baz'), - new Concept('/qux'), + new Concept(1, '/baz'), + new Concept(2, '/qux'), ]); $query = $node->buildQuery($query_context->reveal()); @@ -98,8 +97,8 @@ class TermNodeTest extends \PHPUnit_Framework_TestCase $node = new TermNode('baz'); $node->setConcepts([ - new Concept('/baz'), - new Concept('/qux'), + new Concept(1, '/baz'), + new Concept(2, '/qux'), ]); $query = $node->buildQuery($query_context->reveal()); diff --git a/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TextNodeTest.php b/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TextNodeTest.php index 571cbd93de..a4489634ec 100644 --- a/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TextNodeTest.php +++ b/tests/Alchemy/Tests/Phrasea/SearchEngine/AST/TextNodeTest.php @@ -5,7 +5,6 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\AST; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context; use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; @@ -147,7 +146,7 @@ class TextNodeTest extends \PHPUnit_Framework_TestCase $node = new TextNode('bar'); $node->setConcepts([ - new Concept('/qux'), + new Concept(2, '/qux'), ]); $query = $node->buildQuery($query_context->reveal()); @@ -203,7 +202,7 @@ class TextNodeTest extends \PHPUnit_Framework_TestCase $node = new TextNode('baz'); $node->setConcepts([ - new Concept('/qux'), + new Concept(2, '/qux'), ]); $query = $node->buildQuery($query_context->reveal()); diff --git a/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/FieldTest.php b/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/FieldTest.php index 8a4e670165..25913e82c5 100644 --- a/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/FieldTest.php +++ b/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/FieldTest.php @@ -3,9 +3,8 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\Structure; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; /** * @group unit @@ -87,8 +86,8 @@ class FieldTest extends \PHPUnit_Framework_TestCase public function testMergeWithThesaurusRoots() { - $foo = new Concept('/foo'); - $bar = new Concept('/bar'); + $foo = new Concept(1, '/foo'); + $bar = new Concept(2, '/bar'); $field = new Field('foo', FieldMapping::TYPE_STRING); $other = new Field('foo', FieldMapping::TYPE_STRING, [ 'thesaurus_roots' => [$foo, $bar] @@ -96,8 +95,8 @@ class FieldTest extends \PHPUnit_Framework_TestCase $merged = $field->mergeWith($other); $this->assertEquals([$foo, $bar], $merged->getThesaurusRoots()); - $foo = new Concept('/foo'); - $bar = new Concept('/bar'); + $foo = new Concept(1, '/foo'); + $bar = new Concept(2, '/bar'); $field = new Field('foo', FieldMapping::TYPE_STRING, [ 'thesaurus_roots' => [$foo] ]); diff --git a/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/StructureTest.php b/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/StructureTest.php index b3bde05cd9..ab83753128 100644 --- a/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/StructureTest.php +++ b/tests/Alchemy/Tests/Phrasea/SearchEngine/Structure/StructureTest.php @@ -3,10 +3,9 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\Structure; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; -use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure as Structure; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; /** * @group unit @@ -113,7 +112,7 @@ class StructureTest extends \PHPUnit_Framework_TestCase 'thesaurus_roots' => null ]); $enabled = new Field('bar', FieldMapping::TYPE_STRING, [ - 'thesaurus_roots' => [new Concept('/foo')] + 'thesaurus_roots' => [new Concept(1, '/foo')] ]); $structure = new Structure(); $structure->add($not_enabled); diff --git a/tests/Alchemy/Tests/Phrasea/SearchEngine/Thesaurus/ConceptTest.php b/tests/Alchemy/Tests/Phrasea/SearchEngine/Thesaurus/ConceptTest.php index 5b44b54e73..9ab425ba40 100644 --- a/tests/Alchemy/Tests/Phrasea/SearchEngine/Thesaurus/ConceptTest.php +++ b/tests/Alchemy/Tests/Phrasea/SearchEngine/Thesaurus/ConceptTest.php @@ -12,34 +12,34 @@ class ConceptTest extends \PHPUnit_Framework_TestCase { public function testGetPath() { - $concept = new Concept('/foo/bar'); + $concept = new Concept(1, '/foo/bar'); $this->assertEquals('/foo/bar', $concept->getPath()); } public function testNarrowCheck() { - $parent = new Concept('/foo'); - $child = new Concept('/foo/bar'); + $parent = new Concept(1, '/foo'); + $child = new Concept(1, '/foo/bar'); $this->assertFalse($parent->isNarrowerThan($child)); $this->assertTrue($child->isNarrowerThan($parent)); - $other = new Concept('/other/bar'); + $other = new Concept(1, '/other/bar'); $this->assertFalse($other->isNarrowerThan($child)); } public function testNarrowConceptPruning() { $concepts = [ - new Concept('/foo'), - new Concept('/fooo'), - new Concept('/foo/baz'), - new Concept('/bar/baz'), - new Concept('/bar'), + new Concept(1, '/foo'), + new Concept(1, '/fooo'), + new Concept(1, '/foo/baz'), + new Concept(1, '/bar/baz'), + new Concept(1, '/bar'), ]; $pruned = Concept::pruneNarrowConcepts($concepts); $expected = [ - new Concept('/bar'), - new Concept('/foo'), - new Concept('/fooo'), + new Concept(1, '/bar'), + new Concept(1, '/foo'), + new Concept(1, '/fooo'), ]; $this->assertEquals($expected, $pruned); }