PHRAS-3389_use-only-conceptpaths-from-selected-dbs_MASTER

fix : search only concept-paths from the relevant thesaurus
fix : search only fields from the relevant databoxes (collections)
This commit is contained in:
jygaulier
2021-04-08 18:15:27 +02:00
parent e29926bcd4
commit 63bee24775
18 changed files with 290 additions and 141 deletions

View File

@@ -2,8 +2,7 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
@@ -12,6 +11,14 @@ abstract class AbstractTermNode extends Node implements TermInterface
protected $text; protected $text;
protected $context; protected $context;
private $concepts = []; private $concepts = [];
/**
 * Pruned concepts are a reduced list of concepts that keeps only the high-level ones,
 * dropping every concept whose path is already included in another concept's path,
 * e.g.
 * /1/animal/mammal
 * /1/animal/mammal/dog -- removed because included in /1/animal/mammal
 * /2/subject/animal
 */
private $pruned_concepts; private $pruned_concepts;
public function __construct($text, Context $context = null) public function __construct($text, Context $context = null)
@@ -26,6 +33,9 @@ abstract class AbstractTermNode extends Node implements TermInterface
$this->concepts = $concepts; $this->concepts = $concepts;
} }
/**
* @return Concept[]
*/
private function getPrunedConcepts() private function getPrunedConcepts()
{ {
if ($this->pruned_concepts === null) { if ($this->pruned_concepts === null) {
@@ -34,6 +44,10 @@ abstract class AbstractTermNode extends Node implements TermInterface
return $this->pruned_concepts; return $this->pruned_concepts;
} }
/**
* @param Field[] $fields
* @return array
*/
protected function buildConceptQueries(array $fields) protected function buildConceptQueries(array $fields)
{ {
$concepts = $this->getPrunedConcepts(); $concepts = $this->getPrunedConcepts();
@@ -43,21 +57,27 @@ abstract class AbstractTermNode extends Node implements TermInterface
$index_fields = []; $index_fields = [];
foreach ($fields as $field) { foreach ($fields as $field) {
$index_fields[] = $field->getConceptPathIndexField(); // $db = $field->get_databox_id();
foreach ($field->getDependantDataboxes() as $db) {
if(!array_key_exists($db, $index_fields)) {
$index_fields[$db] = [];
}
$index_fields[$db][] = $field->getConceptPathIndexField();
} }
if (!$index_fields) {
return [];
} }
$queries = []; $queries = [];
foreach ($concepts as $concept) { foreach ($concepts as $concept) {
$db = $concept->getDataboxId();
if(array_key_exists($db, $index_fields)) {
$queries[] = [ $queries[] = [
'multi_match' => [ 'multi_match' => [
'fields' => $index_fields, 'fields' => $index_fields[$db],
'query' => $concept->getPath() 'query' => $concept->getPath()
] ]
]; ];
} }
}
return $queries; return $queries;
} }

View File

@@ -4,7 +4,7 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryHelper; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\ValueChecker; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
class TextNode extends AbstractTermNode implements ContextAbleInterface class TextNode extends AbstractTermNode implements ContextAbleInterface
@@ -39,15 +39,20 @@ class TextNode extends AbstractTermNode implements ContextAbleInterface
public function buildQuery(QueryContext $context) public function buildQuery(QueryContext $context)
{ {
$query_builder = function (array $fields) use ($context) { $query_builder = function (array $fields) use ($context) {
/** @var Field[] $fields */
// Full text // Full text
$index_fields = []; $index_fields = [];
foreach (ValueChecker::filterByValueCompatibility($fields, $this->text) as $field) { $th_fields = [];
foreach ($fields as $field) {
foreach ($context->localizeField($field) as $f) { foreach ($context->localizeField($field) as $f) {
$index_fields[] = $f; $index_fields[] = $f;
} }
foreach ($context->truncationField($field) as $f) { foreach ($context->truncationField($field) as $f) {
$index_fields[] = $f; $index_fields[] = $f;
} }
if($field->hasConceptInference()) {
$th_fields[] = $field;
}
} }
if (!$index_fields) { if (!$index_fields) {
return null; return null;
@@ -62,7 +67,7 @@ class TextNode extends AbstractTermNode implements ContextAbleInterface
] ]
]; ];
// Thesaurus // Thesaurus
$concept_queries = $this->buildConceptQueries($fields); $concept_queries = $this->buildConceptQueries($th_fields);
foreach ($concept_queries as $concept_query) { foreach ($concept_queries as $concept_query) {
$query = QueryHelper::applyBooleanClause($query, 'should', $concept_query); $query = QueryHelper::applyBooleanClause($query, 'should', $concept_query);
} }

View File

@@ -12,15 +12,12 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer\Record\Hydrator; namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer\Record\Hydrator;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\Exception; use Alchemy\Phrasea\SearchEngine\Elastic\Exception\Exception;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\CandidateTerms; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\CandidateTerms;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Filter; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Filter;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Structure;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
class ThesaurusHydrator implements HydratorInterface class ThesaurusHydrator implements HydratorInterface
{ {
@@ -64,12 +61,14 @@ class ThesaurusHydrator implements HydratorInterface
throw new Exception('Expected a record with the "databox_id" key set.'); throw new Exception('Expected a record with the "databox_id" key set.');
} }
$sbid = $record['databox_id'];
$values = array(); $values = array();
$terms = array(); $terms = array();
$filters = array(); $filters = array();
$field_names = array(); $field_names = array();
/** @var Field[] $dbFields */ /** @var Field[] $dbFields */
$dbFields = $this->structure->getAllFieldsByDatabox($record['databox_id']); $dbFields = $this->structure->getAllFieldsByDatabox($sbid);
foreach ($fields as $name => $field) { foreach ($fields as $name => $field) {
if(!array_key_exists($name, $dbFields) || !$dbFields[$name]->get_generate_cterms()) { if(!array_key_exists($name, $dbFields) || !$dbFields[$name]->get_generate_cterms()) {
continue; continue;
@@ -82,8 +81,8 @@ class ThesaurusHydrator implements HydratorInterface
// Concepts are databox's specific, but when no root concepts are // Concepts are databox's specific, but when no root concepts are
// given we need to make sure we only match in the right databox. // given we need to make sure we only match in the right databox.
$filter = $root_concepts $filter = $root_concepts
? Filter::childOfConcepts($record['databox_id'], $root_concepts) ? Filter::childOfConcepts($sbid, $root_concepts)
: Filter::byDatabox($record['databox_id']); : Filter::byDatabox($sbid);
foreach ($field_values as $value) { foreach ($field_values as $value) {
$values[] = $value; $values[] = $value;
$terms[] = Term::parse($value); $terms[] = Term::parse($value);
@@ -95,7 +94,7 @@ class ThesaurusHydrator implements HydratorInterface
if(empty($terms)) { if(empty($terms)) {
return; return;
} }
$bulk = $this->thesaurus->findConceptsBulk($terms, null, $filters, true); $bulk = $this->thesaurus->findConceptsBulk($terms, [$sbid], null, $filters, true);
foreach ($bulk as $offset => $item_concepts) { foreach ($bulk as $offset => $item_concepts) {
$name = $field_names[$offset]; $name = $field_names[$offset];

View File

@@ -32,17 +32,21 @@ class QueryCompiler
public function compile($string, QueryContext $context) public function compile($string, QueryContext $context)
{ {
$query = $this->parse($string); $query = $this->parse($string);
$this->injectThesaurusConcepts($query); $this->injectThesaurusConcepts($query, $context);
return $query->build($context); return $query->build($context);
} }
private function injectThesaurusConcepts(Query $query) /**
* @param Query $query
* @param QueryContext $context
*/
private function injectThesaurusConcepts(Query $query, $context)
{ {
// TODO We must restrict thesaurus matching for IN queries, and only // TODO We must restrict thesaurus matching for IN queries, and only
// search in each field's root concepts. // search in each field's root concepts.
$nodes = $query->getTermNodes(); $nodes = $query->getTermNodes();
$concepts = $this->thesaurus->findConceptsBulk($nodes); $concepts = $this->thesaurus->findConceptsBulk($nodes, $context->getDataboxes());
foreach ($concepts as $index => $termConcepts) { foreach ($concepts as $index => $termConcepts) {
$node = $nodes[$index]; $node = $nodes[$index];

View File

@@ -2,12 +2,11 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Search; namespace Alchemy\Phrasea\SearchEngine\Elastic\Search;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\QueryException;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\Field as ASTField; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Field as ASTField;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\Flag; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Flag;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\QueryException;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Structure; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Structure;
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions; use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
@@ -43,6 +42,11 @@ class QueryContext
$this->options = $options; $this->options = $options;
} }
/**
 * Databoxes covered by this query context.
 *
 * Simply forwards to the underlying structure's getDataboxes().
 *
 * @return array whatever the wrapped structure reports (presumably databox ids — confirm against Structure implementations)
 */
public function getDataboxes()
{
    $databoxes = $this->structure->getDataboxes();

    return $databoxes;
}
public function narrowToFields(array $fields) public function narrowToFields(array $fields)
{ {
if (is_array($this->fields)) { if (is_array($this->fields)) {

View File

@@ -2,14 +2,18 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Search; namespace Alchemy\Phrasea\SearchEngine\Elastic\Search;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
class QueryHelper class QueryHelper
{ {
private function __construct() {} private function __construct() {}
/**
* @param Field[] $private_fields
* @param Field[] $unrestricted_fields
* @param \Closure $query_builder
* @return array
*/
public static function wrapPrivateFieldQueries(array $private_fields, array $unrestricted_fields, \Closure $query_builder) public static function wrapPrivateFieldQueries(array $private_fields, array $unrestricted_fields, \Closure $query_builder)
{ {
// We make a boolean clause for each collection set to shrink query size // We make a boolean clause for each collection set to shrink query size
@@ -31,7 +35,16 @@ class QueryHelper
foreach ($fields_map as $hash => $fields) { foreach ($fields_map as $hash => $fields) {
// Right to query on a private field is dependant of document collection // Right to query on a private field is dependant of document collection
// Here we make sure we can only match on allowed collections // Here we make sure we can only match on allowed collections
$query = $query_builder(array_merge($fields, $unrestricted_fields)); $relevant_fields = [];
foreach($unrestricted_fields as $uf) {
foreach ($uf->getDependantCollections() as $c) {
if(in_array($c, $collections_map[$hash])) {
$relevant_fields[] = $uf;
break;
}
}
}
$query = $query_builder(array_merge($fields, $relevant_fields));
if ($query !== null) { if ($query !== null) {
$queries[] = self::restrictQueryToCollections($query, $collections_map[$hash]); $queries[] = self::restrictQueryToCollections($query, $collections_map[$hash]);
} }

View File

@@ -49,6 +49,8 @@ class Field implements Typed
private $used_by_collections; private $used_by_collections;
private $used_by_databoxes;
public static function createFromLegacyField(databox_field $field) public static function createFromLegacyField(databox_field $field)
{ {
$type = self::getTypeFromLegacy($field); $type = self::getTypeFromLegacy($field);
@@ -75,7 +77,8 @@ class Field implements Typed
'facet' => $facet, 'facet' => $facet,
'thesaurus_roots' => $roots, 'thesaurus_roots' => $roots,
'generate_cterms' => $field->get_generate_cterms(), 'generate_cterms' => $field->get_generate_cterms(),
'used_by_collections' => $databox->get_collection_unique_ids() 'used_by_collections' => $databox->get_collection_unique_ids(),
'used_by_databoxes' => [$databox->get_sbas_id()]
]); ]);
} }
@@ -107,6 +110,7 @@ class Field implements Typed
$this->thesaurus_roots = \igorw\get_in($options, ['thesaurus_roots'], null); $this->thesaurus_roots = \igorw\get_in($options, ['thesaurus_roots'], null);
$this->generate_cterms = \igorw\get_in($options, ['generate_cterms'], false); $this->generate_cterms = \igorw\get_in($options, ['generate_cterms'], false);
$this->used_by_collections = \igorw\get_in($options, ['used_by_collections'], []); $this->used_by_collections = \igorw\get_in($options, ['used_by_collections'], []);
$this->used_by_databoxes = \igorw\get_in($options, ['used_by_databoxes'], []);
} }
else { else {
// todo: this is faster code, but need to fix unit-tests to pass all options // todo: this is faster code, but need to fix unit-tests to pass all options
@@ -117,6 +121,7 @@ class Field implements Typed
$this->thesaurus_roots = $options['thesaurus_roots']; $this->thesaurus_roots = $options['thesaurus_roots'];
$this->generate_cterms = $options['generate_cterms']; $this->generate_cterms = $options['generate_cterms'];
$this->used_by_collections = $options['used_by_collections']; $this->used_by_collections = $options['used_by_collections'];
$this->used_by_databoxes = $options['used_by_databoxes'];
} }
} }
@@ -129,7 +134,8 @@ class Field implements Typed
'facet' => $this->facet, 'facet' => $this->facet,
'thesaurus_roots' => $this->thesaurus_roots, 'thesaurus_roots' => $this->thesaurus_roots,
'generate_cterms' => $this->generate_cterms, 'generate_cterms' => $this->generate_cterms,
'used_by_collections' => $this->used_by_collections 'used_by_collections' => $this->used_by_collections,
'used_by_databoxes' => $this->used_by_databoxes
]); ]);
} }
@@ -168,6 +174,11 @@ class Field implements Typed
return $this->used_by_collections; return $this->used_by_collections;
} }
/**
 * Databoxes this field depends on.
 *
 * Returns the "used_by_databoxes" option as stored on the field.
 *
 * @return array
 */
public function getDependantDataboxes()
{
    $databoxes = $this->used_by_databoxes;

    return $databoxes;
}
public function isSearchable() public function isSearchable()
{ {
return $this->is_searchable; return $this->is_searchable;
@@ -255,9 +266,20 @@ class Field implements Typed
) )
); );
$used_by_databoxes = array_values(
array_unique(
array_merge(
$this->used_by_databoxes,
$other->used_by_databoxes
),
SORT_REGULAR
)
);
return $this->withOptions([ return $this->withOptions([
'thesaurus_roots' => $thesaurus_roots, 'thesaurus_roots' => $thesaurus_roots,
'used_by_collections' => $used_by_collections 'used_by_collections' => $used_by_collections,
'used_by_databoxes' => $used_by_databoxes
]); ]);
} }

View File

@@ -98,6 +98,11 @@ final class GlobalStructure implements Structure
} }
} }
/**
 * Databoxes known to this structure.
 *
 * @return array the keys of the fields-by-databox map (presumably databox ids — confirm)
 */
public function getDataboxes()
{
    $databoxKeys = array_keys($this->fieldsByDatabox);

    return $databoxKeys;
}
/** /**
* @return Field[] * @return Field[]
*/ */

View File

@@ -32,6 +32,11 @@ final class LimitedStructure implements Structure
$this->search_options = $search_options; $this->search_options = $search_options;
} }
/**
 * Databoxes targeted by the current search options.
 *
 * @return array the keys of the "collections references by databox" map
 *               exposed by the search options (presumably databox ids — confirm)
 */
public function getDataboxes()
{
    $collectionsByDatabox = $this->search_options->getCollectionsReferencesByDatabox();

    return array_keys($collectionsByDatabox);
}
public function getAllFields() public function getAllFields()
{ {
return $this->limit($this->structure->getAllFields()); return $this->limit($this->structure->getAllFields());
@@ -39,7 +44,8 @@ final class LimitedStructure implements Structure
public function getUnrestrictedFields() public function getUnrestrictedFields()
{ {
return $this->structure->getUnrestrictedFields(); // return $this->structure->getUnrestrictedFields();
return $this->limit($this->structure->getUnrestrictedFields());
} }
public function getPrivateFields() public function getPrivateFields()
@@ -93,7 +99,7 @@ final class LimitedStructure implements Structure
return $this->structure->getMetadataTagByName($name); return $this->structure->getMetadataTagByName($name);
} }
private function limit(array $fields) private function old_limit(array $fields)
{ {
$allowed_collections = $this->allowedCollections(); $allowed_collections = $this->allowedCollections();
// Filter private field collections (base_id) on which access is restricted. // Filter private field collections (base_id) on which access is restricted.
@@ -111,6 +117,27 @@ final class LimitedStructure implements Structure
return $limited_fields; return $limited_fields;
} }
/**
 * Restrict a set of fields to the allowed collections.
 *
 * Each field is first narrowed with limitField(); a field left with no
 * dependant collection is dropped, whether it is private or not.
 * Keys of the input array are preserved for the fields that are kept.
 *
 * @param Field[] $fields fields indexed by name
 * @return Field[] the narrowed fields that still depend on at least one collection
 */
private function limit(array $fields)
{
    $allowed_collections = $this->allowedCollections();
    $limited_fields = [];
    foreach ($fields as $name => $field) {
        $field = $this->limitField($field, $allowed_collections);
        // A field with no remaining collection can never match anything, so skip it.
        // This guard also covers private fields: the former nested
        // "private field without collections" check could never be reached after it.
        if (empty($field->getDependantCollections())) {
            continue;
        }
        $limited_fields[$name] = $field;
    }

    return $limited_fields;
}
private function limitField(Field $field, array $allowed_collections = null) private function limitField(Field $field, array $allowed_collections = null)
{ {
if ($allowed_collections === null) { if ($allowed_collections === null) {

View File

@@ -12,6 +12,11 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Structure;
interface Structure interface Structure
{ {
/**
 * List the databoxes covered by this structure.
 *
 * @return array databox keys (presumably databox ids — confirm against implementations)
 */
public function getDataboxes();
/** /**
* @return Field[] * @return Field[]
*/ */

View File

@@ -45,7 +45,7 @@ class Thesaurus
* @param boolean $strict Strict mode matching * @param boolean $strict Strict mode matching
* @return Concept[][] List of matching concepts for each term * @return Concept[][] List of matching concepts for each term
*/ */
public function findConceptsBulk(array $terms, $lang = null, $filter = null, $strict = false) public function findConceptsBulk(array $terms, array $databoxIds, $lang = null, $filter = null, $strict = false)
{ {
$this->logger->debug(sprintf('Finding linked concepts in bulk for %d terms', count($terms))); $this->logger->debug(sprintf('Finding linked concepts in bulk for %d terms', count($terms)));
@@ -61,7 +61,7 @@ class Thesaurus
$concepts = array(); $concepts = array();
foreach ($terms as $index => $term) { foreach ($terms as $index => $term) {
$strict |= ($term instanceof AST\TermNode); // a "term" node is [strict group of words] $strict |= ($term instanceof AST\TermNode); // a "term" node is [strict group of words]
$concepts[] = $this->findConcepts($term, $lang, $filters[$index], $strict); $concepts[] = $this->findConcepts($term, $databoxIds, $lang, $filters[$index], $strict);
} }
return $concepts; return $concepts;
@@ -79,16 +79,16 @@ class Thesaurus
* @param boolean $strict Whether to enable strict search or not * @param boolean $strict Whether to enable strict search or not
* @return Concept[] Matching concepts * @return Concept[] Matching concepts
*/ */
public function findConcepts($term, $lang = null, Filter $filter = null, $strict = false) public function findConcepts($term, array $databoxIds, $lang = null, Filter $filter = null, $strict = false)
{ {
return $strict ? return $strict ?
$this->findConceptsStrict($term, $lang, $filter) $this->findConceptsStrict($term, $databoxIds, $lang, $filter)
: :
$this->findConceptsFuzzy($term, $lang, $filter) $this->findConceptsFuzzy($term, $databoxIds, $lang, $filter)
; ;
} }
private function findConceptsStrict($term, $lang = null, Filter $filter = null) private function findConceptsStrict($term, array $databoxIds, $lang = null, Filter $filter = null)
{ {
if (!($term instanceof TermInterface)) { if (!($term instanceof TermInterface)) {
$term = new Term($term); $term = new Term($term);
@@ -126,6 +126,24 @@ class Thesaurus
] ]
]; ];
} }
if(count($databoxIds) > 0) {
if(count($databoxIds) == 1) {
$filters[] = [
'term' => [
'databox_id' => $databoxIds[0]
]
];
}
else {
$filters[] = [
'terms' => [
'databox_id' => $databoxIds
]
];
}
}
if ($lang) { if ($lang) {
$filters[] = [ $filters[] = [
'term' => [ 'term' => [
@@ -133,6 +151,7 @@ class Thesaurus
] ]
]; ];
} }
if ($filter) { if ($filter) {
$filters = array_merge($filters, $filter->getQueryFilters()); $filters = array_merge($filters, $filter->getQueryFilters());
} }
@@ -184,12 +203,18 @@ class Thesaurus
// Extract concept paths from response // Extract concept paths from response
$concepts = array(); $concepts = array();
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); $db_buckets = \igorw\get_in($response, ['aggregations', 'db', 'buckets'], []);
$keys = array(); $keys = array();
foreach ($buckets as $bucket) { foreach ($db_buckets as $db_bucket) {
if (isset($bucket['key'])) { if (isset($db_bucket['key'])) {
$keys[] = $bucket['key']; $db = $db_bucket['key'];
$concepts[] = new Concept($bucket['key']); $cp_buckets = \igorw\get_in($db_bucket, ['cp', 'buckets'], []);
foreach ($cp_buckets as $cp_bucket) {
if (isset($cp_bucket['key'])) {
$keys[] = $cp_bucket['key'];
$concepts[] = new Concept($db, $cp_bucket['key']);
}
}
} }
} }
@@ -200,7 +225,7 @@ class Thesaurus
return $concepts; return $concepts;
} }
private function findConceptsFuzzy($term, $lang = null, Filter $filter = null) private function findConceptsFuzzy($term, array $databoxIds, $lang = null, Filter $filter = null)
{ {
if (!($term instanceof TermInterface)) { if (!($term instanceof TermInterface)) {
$term = new Term($term); $term = new Term($term);
@@ -236,6 +261,29 @@ class Thesaurus
$query['bool']['must'][1] = $context_query; $query['bool']['must'][1] = $context_query;
} }
if(count($databoxIds) > 0) {
if(count($databoxIds) == 1) {
$query = self::applyQueryFilter(
$query,
[
'term' => [
'databox_id' => $databoxIds[0]
]
]
);
}
else {
$query = self::applyQueryFilter(
$query,
[
'terms' => [
'databox_id' => $databoxIds
]
]
);
}
}
if ($lang) { if ($lang) {
$lang_filter = array(); $lang_filter = array();
$lang_filter['term']['lang'] = $lang; $lang_filter['term']['lang'] = $lang;
@@ -246,36 +294,55 @@ class Thesaurus
$this->logger->debug('Using filter', array('filter' => Filter::dump($filter))); $this->logger->debug('Using filter', array('filter' => Filter::dump($filter)));
$query = self::applyQueryFilter($query, $filter->getQueryFilter()); $query = self::applyQueryFilter($query, $filter->getQueryFilter());
} }
$params = [
'index' => $this->options->getIndexName(),
'type' => TermIndexer::TYPE_NAME,
'body' => [
'query' => $query,
'aggs' => [
// Path deduplication // Path deduplication
$aggs = array(); 'db' => [ // databox_id
$aggs['dedup']['terms']['field'] = 'path.raw'; 'terms' => [
'field' => 'databox_id'
],
'aggs' => [
// Path deduplication
'cp' => [ // concept_path
'terms' => [
'field' => 'path.raw'
]
]
],
// Search request ]
$params = array(); ],
$params['index'] = $this->options->getIndexName();
$params['type'] = TermIndexer::TYPE_NAME;
$params['body']['query'] = $query;
$params['body']['aggs'] = $aggs;
// Arbitrary score low limit, we need find a more granular way to remove // Arbitrary score low limit, we need find a more granular way to remove
// inexact concepts. // inexact concepts.
// We also need to disable TF/IDF on terms, and try to boost score only // We also need to disable TF/IDF on terms, and try to boost score only
// when the search match nearly all tokens of term's value field. // when the search match nearly all tokens of term's value field.
$params['body']['min_score'] = $this->options->getMinScore(); 'min_score' => $this->options->getMinScore(),
// No need to get any hits since we extract data from aggs // No need to get any hits since we extract data from aggs
$params['body']['size'] = 0; 'size' => 0
]
];
$this->logger->debug('Sending search', $params['body']); $this->logger->debug('Sending search', $params['body']);
$response = $this->client->search($params); $response = $this->client->search($params);
// Extract concept paths from response // Extract concept paths from response
$concepts = array(); $concepts = [];
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); $db_buckets = \igorw\get_in($response, ['aggregations', 'db', 'buckets'], []);
$keys = array(); $keys = array();
foreach ($buckets as $bucket) { foreach ($db_buckets as $db_bucket) {
if (isset($bucket['key'])) { if (isset($db_bucket['key'])) {
$keys[] = $bucket['key']; $db = $db_bucket['key'];
$concepts[] = new Concept($bucket['key']); $cp_buckets = \igorw\get_in($db_bucket, ['cp', 'buckets'], []);
foreach ($cp_buckets as $cp_bucket) {
if (isset($cp_bucket['key'])) {
$keys[] = $cp_bucket['key'];
$concepts[] = new Concept($db, $cp_bucket['key']);
}
}
} }
} }

View File

@@ -13,13 +13,20 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
class Concept class Concept
{ {
private $databox_id;
private $path; private $path;
public function __construct($path) public function __construct($databox_id, $path)
{ {
$this->databox_id = $databox_id;
$this->path = (string) $path; $this->path = (string) $path;
} }
/**
 * Identifier of the databox this concept belongs to, as given to the constructor.
 *
 * @return mixed the value passed as $databox_id at construction, returned unchanged
 */
public function getDataboxId()
{
    $databoxId = $this->databox_id;

    return $databoxId;
}
public function getPath() public function getPath()
{ {
return $this->path; return $this->path;

View File

@@ -33,7 +33,6 @@ class Helper
/** @var DOMElement $node */ /** @var DOMElement $node */
foreach ($nodes as $node) { foreach ($nodes as $node) {
if(1) {
$me_and_parents = array_merge([$node], self::getElementAncestors($node)); $me_and_parents = array_merge([$node], self::getElementAncestors($node));
$path_segments = []; $path_segments = [];
@@ -48,38 +47,14 @@ class Helper
} }
// Concept paths are have databox identifier at root level // Concept paths are have databox identifier at root level
$concepts[] = new Concept(sprintf( $concepts[] = new Concept(
$databox->get_sbas_id(),
sprintf(
'/%d/%s', '/%d/%s',
$databox->get_sbas_id(), $databox->get_sbas_id(),
implode('/', array_reverse($path_segments)) implode('/', array_reverse($path_segments))
)); ));
} }
else {
$path = '';
// go up thru parents
while ($node) {
$v = null;
for ($n = $node->firstChild; $n; $n = $n->nextSibling) {
if ($n->nodeType === XML_ELEMENT_NODE && $n->nodeName === 'sy') {
if ($v === null) {
$v = $n->getAttribute('v');
continue;
}
if ($n->getAttribute('lng') === 'en') {
$v = $n->getAttribute('v');
break;
}
}
}
if ($v !== null) {
$path = '/' . $v . $path;
}
$node = $node->parentNode;
}
$path = '/' . $databox->get_sbas_id() . $path;
$concepts[] = new Concept($path);
}
}
return $concepts; return $concepts;
} }

View File

@@ -5,7 +5,6 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\TermNode; use Alchemy\Phrasea\SearchEngine\Elastic\AST\TermNode;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
@@ -39,8 +38,8 @@ class TermNodeTest extends \PHPUnit_Framework_TestCase
$node = new TermNode('bar'); $node = new TermNode('bar');
$node->setConcepts([ $node->setConcepts([
new Concept('/baz'), new Concept(1, '/baz'),
new Concept('/qux'), new Concept(2, '/qux'),
]); ]);
$query = $node->buildQuery($query_context->reveal()); $query = $node->buildQuery($query_context->reveal());
@@ -98,8 +97,8 @@ class TermNodeTest extends \PHPUnit_Framework_TestCase
$node = new TermNode('baz'); $node = new TermNode('baz');
$node->setConcepts([ $node->setConcepts([
new Concept('/baz'), new Concept(1, '/baz'),
new Concept('/qux'), new Concept(2, '/qux'),
]); ]);
$query = $node->buildQuery($query_context->reveal()); $query = $node->buildQuery($query_context->reveal());

View File

@@ -5,7 +5,6 @@ namespace Alchemy\Tests\Phrasea\SearchEngine\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context; use Alchemy\Phrasea\SearchEngine\Elastic\AST\Context;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode; use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
@@ -147,7 +146,7 @@ class TextNodeTest extends \PHPUnit_Framework_TestCase
$node = new TextNode('bar'); $node = new TextNode('bar');
$node->setConcepts([ $node->setConcepts([
new Concept('/qux'), new Concept(2, '/qux'),
]); ]);
$query = $node->buildQuery($query_context->reveal()); $query = $node->buildQuery($query_context->reveal());
@@ -203,7 +202,7 @@ class TextNodeTest extends \PHPUnit_Framework_TestCase
$node = new TextNode('baz'); $node = new TextNode('baz');
$node->setConcepts([ $node->setConcepts([
new Concept('/qux'), new Concept(2, '/qux'),
]); ]);
$query = $node->buildQuery($query_context->reveal()); $query = $node->buildQuery($query_context->reveal());

View File

@@ -3,9 +3,8 @@
namespace Alchemy\Tests\Phrasea\SearchEngine\Structure; namespace Alchemy\Tests\Phrasea\SearchEngine\Structure;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
/** /**
* @group unit * @group unit
@@ -87,8 +86,8 @@ class FieldTest extends \PHPUnit_Framework_TestCase
public function testMergeWithThesaurusRoots() public function testMergeWithThesaurusRoots()
{ {
$foo = new Concept('/foo'); $foo = new Concept(1, '/foo');
$bar = new Concept('/bar'); $bar = new Concept(2, '/bar');
$field = new Field('foo', FieldMapping::TYPE_STRING); $field = new Field('foo', FieldMapping::TYPE_STRING);
$other = new Field('foo', FieldMapping::TYPE_STRING, [ $other = new Field('foo', FieldMapping::TYPE_STRING, [
'thesaurus_roots' => [$foo, $bar] 'thesaurus_roots' => [$foo, $bar]
@@ -96,8 +95,8 @@ class FieldTest extends \PHPUnit_Framework_TestCase
$merged = $field->mergeWith($other); $merged = $field->mergeWith($other);
$this->assertEquals([$foo, $bar], $merged->getThesaurusRoots()); $this->assertEquals([$foo, $bar], $merged->getThesaurusRoots());
$foo = new Concept('/foo'); $foo = new Concept(1, '/foo');
$bar = new Concept('/bar'); $bar = new Concept(2, '/bar');
$field = new Field('foo', FieldMapping::TYPE_STRING, [ $field = new Field('foo', FieldMapping::TYPE_STRING, [
'thesaurus_roots' => [$foo] 'thesaurus_roots' => [$foo]
]); ]);

View File

@@ -3,10 +3,9 @@
namespace Alchemy\Tests\Phrasea\SearchEngine\Structure; namespace Alchemy\Tests\Phrasea\SearchEngine\Structure;
use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping; use Alchemy\Phrasea\SearchEngine\Elastic\FieldMapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\Field;
use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure as Structure; use Alchemy\Phrasea\SearchEngine\Elastic\Structure\GlobalStructure as Structure;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
/** /**
* @group unit * @group unit
@@ -113,7 +112,7 @@ class StructureTest extends \PHPUnit_Framework_TestCase
'thesaurus_roots' => null 'thesaurus_roots' => null
]); ]);
$enabled = new Field('bar', FieldMapping::TYPE_STRING, [ $enabled = new Field('bar', FieldMapping::TYPE_STRING, [
'thesaurus_roots' => [new Concept('/foo')] 'thesaurus_roots' => [new Concept(1, '/foo')]
]); ]);
$structure = new Structure(); $structure = new Structure();
$structure->add($not_enabled); $structure->add($not_enabled);

View File

@@ -12,34 +12,34 @@ class ConceptTest extends \PHPUnit_Framework_TestCase
{ {
public function testGetPath() public function testGetPath()
{ {
$concept = new Concept('/foo/bar'); $concept = new Concept(1, '/foo/bar');
$this->assertEquals('/foo/bar', $concept->getPath()); $this->assertEquals('/foo/bar', $concept->getPath());
} }
public function testNarrowCheck() public function testNarrowCheck()
{ {
$parent = new Concept('/foo'); $parent = new Concept(1, '/foo');
$child = new Concept('/foo/bar'); $child = new Concept(1, '/foo/bar');
$this->assertFalse($parent->isNarrowerThan($child)); $this->assertFalse($parent->isNarrowerThan($child));
$this->assertTrue($child->isNarrowerThan($parent)); $this->assertTrue($child->isNarrowerThan($parent));
$other = new Concept('/other/bar'); $other = new Concept(1, '/other/bar');
$this->assertFalse($other->isNarrowerThan($child)); $this->assertFalse($other->isNarrowerThan($child));
} }
public function testNarrowConceptPruning() public function testNarrowConceptPruning()
{ {
$concepts = [ $concepts = [
new Concept('/foo'), new Concept(1, '/foo'),
new Concept('/fooo'), new Concept(1, '/fooo'),
new Concept('/foo/baz'), new Concept(1, '/foo/baz'),
new Concept('/bar/baz'), new Concept(1, '/bar/baz'),
new Concept('/bar'), new Concept(1, '/bar'),
]; ];
$pruned = Concept::pruneNarrowConcepts($concepts); $pruned = Concept::pruneNarrowConcepts($concepts);
$expected = [ $expected = [
new Concept('/bar'), new Concept(1, '/bar'),
new Concept('/foo'), new Concept(1, '/foo'),
new Concept('/fooo'), new Concept(1, '/fooo'),
]; ];
$this->assertEquals($expected, $pruned); $this->assertEquals($expected, $pruned);
} }