diff --git a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php index a3b428cc36..b0b34fcaf7 100644 --- a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php +++ b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php @@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; +use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine; @@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface $app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) { return new RecordIndexer( + $app['elasticsearch.record_helper'], $app['thesaurus'], $app['elasticsearch.engine'], $app['phraseanet.appbox'], @@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface ); }); + $app['elasticsearch.record_helper'] = $app->share(function ($app) { + return new RecordHelper($app['phraseanet.appbox']); + }); + $app['elasticsearch.client'] = $app->share(function($app) { $options = $app['elasticsearch.options']; $clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]]; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php index 99e5579c64..3d890e1fa7 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/AndExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class AndExpression extends BinaryOperator { protected $operator = 'AND'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = $this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php index 714f40cb7c..6876e16f8f 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/BinaryOperator.php @@ -26,10 +26,4 @@ abstract class BinaryOperator extends Node $this->right->getTextNodes() ); } - - public function isFullTextOnly() - { - return $this->left->isFullTextOnly() - && $this->right->isFullTextOnly(); - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php index a1c812b444..2959ee4eec 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/ExceptExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class ExceptExpression extends BinaryOperator { protected $operator = 'EXCEPT'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = $this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php index 6a634c7ebb..5bd7aec253 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/FieldNode.php @@ -2,6 +2,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class FieldNode extends Node { protected $keyword; @@ -16,7 +18,7 @@ class FieldNode extends Node return $this->keyword; } - public function getQuery() + public function buildQuery(QueryContext $context) { throw new \LogicException("A keyword can't be converted to a query."); } @@ -30,9 +32,4 @@ class FieldNode extends Node { return sprintf('', $this->keyword); } - - public function isFullTextOnly() - { - return false; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php index 0552d4dbad..00f95afdc3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/InExpression.php @@ -2,6 +2,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class InExpression extends Node { protected $field; @@ -13,9 +15,11 @@ class InExpression extends Node $this->expression = $expression; } - public function getQuery() + public function buildQuery(QueryContext $context) { - return $this->expression->getQuery($this->field->getValue()); + $fields = array($this->field->getValue()); + + return $this->expression->buildQuery($context->narrowToFields($fields)); } public function getTextNodes() @@ -27,10 +31,4 @@ class InExpression extends Node { return sprintf('(%s IN %s)', $this->expression, $this->field); } - - public function isFullTextOnly() - { - // In expressions are never full-text - return false; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php index 7a13e47478..36467b9fb3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/Node.php @@ -2,17 +2,14 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + abstract class Node { /** * @return array The Elasticsearch formatted query */ - abstract public function getQuery(); - - /** - * @return bool Tell if the node and it's child are full-text queries only - */ - abstract public function isFullTextOnly(); + abstract public function buildQuery(QueryContext $context); abstract public function getTextNodes(); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php index 6a345688e5..61c58e2109 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/OrExpression.php @@ -2,14 +2,16 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class OrExpression extends BinaryOperator { protected $operator = 'OR'; - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { - $left = $this->left->getQuery($fields); - $right = $this->right->getQuery($fields); + $left = $this->left->buildQuery($context); + $right = $this->right->buildQuery($context); return array( 'bool' => array( diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php index 8e63f9e586..c18754e58e 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/PrefixNode.php @@ -2,6 +2,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class PrefixNode extends Node { protected $prefix; @@ -11,11 +13,11 @@ class PrefixNode extends Node $this->prefix = $prefix; } - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { return array( 'multi_match' => array( - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->prefix, 'type' => 'phrase_prefix' ) @@ -26,9 +28,4 @@ class PrefixNode extends Node { return sprintf('prefix("%s")', $this->prefix); } - - public function isFullTextOnly() - { - return true; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php index 35df144d6c..48a5f913d3 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/QuotedTextNode.php @@ -2,22 +2,19 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; + class QuotedTextNode extends TextNode { - public function getQuery($fields = ['_all']) + public function buildQuery(QueryContext $context) { return array( 'multi_match' => array( 'type' => 'phrase', - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->text, // 'operator' => 'and' ) ); } - - public function isFullTextOnly() - { - return true; - } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php index cbb0ab4b4d..e1b5a6e498 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/AST/TextNode.php @@ -2,23 +2,43 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; -class TextNode extends Node +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; + +class TextNode extends Node implements TermInterface { protected $text; + protected $concepts = array(); public function __construct($text) { $this->text = $text; } - public function getQuery($fields = ['_all']) + public function setConcepts(array $concepts) { - return array( + $this->concepts = $concepts; + } + + public function buildQuery(QueryContext $context) + { + $query = array( 'multi_match' => array( - 'fields' => $fields, + 'fields' => $context->getLocalizedFields(), 'query' => $this->text, ) ); + + if ($this->concepts) { + $shoulds = array($query); + foreach ($this->concepts as $concept) { + $shoulds[]['term']['concept_paths'] = $concept->getPath(); + } + $query = array(); + $query['bool']['should'] = $shoulds; + } + + return $query; } public function getTextNodes() @@ -31,13 +51,22 @@ class TextNode extends Node return sprintf('"%s"', $this->text); } - public function isFullTextOnly() - { - return true; - } - public function getText() + // Implementation of TermInterface + + public function getValue() { return $this->text; } + + public function hasContext() + { + return false; + } + + public function getContext() + { + // TODO Insert context during parsing + return null; + } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php index 46b51e849f..cfd2930649 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/ElasticSearchEngine.php @@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; -use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery; +use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; use Alchemy\Phrasea\SearchEngine\SearchEngineInterface; use Alchemy\Phrasea\SearchEngine\SearchEngineOptions; use Alchemy\Phrasea\SearchEngine\SearchEngineResult; @@ -280,83 +282,26 @@ class ElasticSearchEngine implements SearchEngineInterface $query['_ast'] = $searchQuery->dump(); + + $thesaurus = $this->app['thesaurus']; - foreach ($searchQuery->getTextNodes() as $textNode) { - $text = $textNode->getText(); - $concepts = $thesaurus->findConcepts($text); - $query['_thesaurus_concepts'][$text] = $concepts; + $textNodes = $searchQuery->getTextNodes(); + $concepts = $thesaurus->findConceptsBulk($textNodes); + + foreach ($concepts as $index => $termConcepts) { + $node = $textNodes[$index]; + $node->setConcepts($termConcepts); + $term = Term::dump($node); + $query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts); } - // $concepts = $thesaurus->findConceptsBulk($terms); + $recordHelper = $this->app['elasticsearch.record_helper']; + // TODO Pass options to getFields to include/exclude private fields + $searchableFields = $recordHelper->getFields(); + $queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']); + $recordQuery = $searchQuery->build($queryContext); - // Contains the full thesaurus paths to search on - $pathsToFilter = []; - // Contains the thesaurus values by fields (synonyms, translations, etc) - $collectFields = []; - - // Only search in thesaurus for full text search - if ($searchQuery->isFullTextOnly()) { - $termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']); - $termsQuery = $searchQuery->getElasticsearchQuery($termFields); - - $params = $this->createTermQueryParams($termsQuery, $options); - $terms = $this->doExecute('search', $params); - - foreach ($terms['hits']['hits'] as $term) { - // Skip paths with very low score - if ($term['_score'] < 1) { - continue; - } - - $pathsToFilter[$term['_source']['path']] = $term['_score']; - - foreach ($term['_source']['fields'] as $field) { - $collectFields['caption.'.$field][] = $term['_source']['value']; - } - } - $pathsToFilter = array_unique($pathsToFilter); - } - - if (empty($collectFields)) { - // @todo a list of field by default? all fields? - $searchFieldNames = ['caption.*']; - } else { - $searchFieldNames = array_keys($collectFields); - } - - $recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']); - - $recordQuery = [ - 'bool' => [ - 'should' => [ - $searchQuery->getElasticsearchQuery($recordFields) - ] - ] - ]; - - foreach ($pathsToFilter as $path => $score) { - // Also match incomplete path. /a/b/c will return /a/b/c/d records - $recordQuery['bool']['should'][] = [ - 'match' => [ - 'concept_paths' => array( - 'query' => $path, - 'boost' => $score, - ) - ] - ]; - - // Add signal for exact path only - $recordQuery['bool']['should'][] = [ - 'term' => [ - 'concept_paths.raw' => array( - 'value' => $path, - 'boost' => $score, - ) - ] - ]; - } - $params = $this->createRecordQueryParams($recordQuery, $options, null); $params['body']['from'] = $offset; $params['body']['size'] = $perPage; @@ -390,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface $results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++); } + $query['_searchable_fields'] = $searchableFields; $query['_ast'] = $searchQuery->dump(); - $query['_paths'] = $pathsToFilter; - $query['_richFields'] = $collectFields; - $query['query'] = json_encode($params); + // $query['_paths'] = $pathsToFilter; + // $query['_richFields'] = $collectFields; + + $queryyy = $recordQuery; + // $queryyy = $params['body']; + $query['query'] = $queryyy; + $query['query_as_string'] = json_encode($queryyy); return new SearchEngineResult($results, json_encode($query), $res['took'], $offset, $res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [], @@ -584,36 +534,6 @@ class ElasticSearchEngine implements SearchEngineInterface return $res; } - /** - * @param array|string $fields - * @param array|null $locales - * @param null $currentLocale - * @return array - */ - public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null) - { - $fieldsExpended = []; - - if (!$locales) { - $locales = $this->locales; - } - - foreach ((array) $fields as $field) { - foreach ($locales as $locale) { - $boost = ""; - - if ($locale === $currentLocale) { - $boost = "^5"; - } - - $fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost); - } - $fieldsExpended[] = sprintf('%s.%s', $field, 'light^10'); - } - - return $fieldsExpended; - } - private function getFlagsKey(\appbox $appbox) { $flags = []; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php index 36006f325f..246548f18d 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Indexer/RecordIndexer.php @@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher; use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; -use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper; use media_subdef; class RecordIndexer { const TYPE_NAME = 'record'; + private $helper; + + private $thesaurus; + /** * @var \appbox */ @@ -42,10 +45,9 @@ class RecordIndexer */ private $locales; - private $dataStructure; - - public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales) + public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales) { + $this->helper = $helper; $this->thesaurus = $thesaurus; $this->appbox = $appbox; $this->elasticSearchEngine = $elasticSearchEngine; @@ -54,11 +56,8 @@ class RecordIndexer public function populateIndex(BulkOperation $bulk) { - // Helper to fetch record related data - $recordHelper = new RecordHelper($this->appbox); - foreach ($this->appbox->get_databoxes() as $databox) { - $fetcher = new RecordFetcher($databox, $recordHelper); + $fetcher = new RecordFetcher($databox, $this->helper); $fetcher->setBatchSize(200); while ($records = $fetcher->fetch()) { foreach ($records as $record) { @@ -74,9 +73,7 @@ class RecordIndexer public function indexSingleRecord(\record_adapter $record_adapter, $indexName) { - // Helper to fetch record related data - $recordHelper = new RecordHelper($this->appbox); - $fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper); + $fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper); $record = $fetcher->fetchOne($record_adapter); $params = array(); @@ -218,77 +215,7 @@ class RecordIndexer private function getFieldsStructure() { - if (!empty($this->dataStructure)) { - return $this->dataStructure; - } - - $fields = array(); - - foreach ($this->appbox->get_databoxes() as $databox) { - //printf("Databox %d\n", $databox->get_sbas_id()); - foreach ($databox->get_meta_structure() as $fieldStructure) { - $field = array(); - // Field type - switch ($fieldStructure->get_type()) { - case \databox_field::TYPE_DATE: - $field['type'] = 'date'; - break; - case \databox_field::TYPE_NUMBER: - $field['type'] = 'double'; - break; - case \databox_field::TYPE_STRING: - case \databox_field::TYPE_TEXT: - $field['type'] = 'string'; - break; - default: - throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type())); - break; - } - - $name = $fieldStructure->get_name(); - - // Business rules - $field['private'] = $fieldStructure->isBusiness(); - $field['indexable'] = $fieldStructure->is_indexable(); - $field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); - - // Thesaurus concept inference - // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; - $helper = new ThesaurusHelper(); - - // TODO Not the real option yet - $field['thesaurus_concept_inference'] = $field['type'] === 'string'; - // TODO Find thesaurus path prefixes - $field['thesaurus_prefix'] = '/categories'; - - //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']); - - // Since mapping is merged between databoxes, two fields may - // have conflicting names. Indexing is the same for a given - // type so we reject only those with different types. - if (isset($fields[$name])) { - if ($fields[$name]['type'] !== $field['type']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type'])); - } - - if ($fields[$name]['indexable'] !== $field['indexable']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name)); - } - - if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) { - throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name)); - } - // TODO other structure incompatibilities - - //printf("Merged with previous \"%s\" field\n", $name); - } - - $fields[$name] = $field; - } - } - - $this->dataStructure = $fields; - return $this->dataStructure; + return $this->helper->getFieldsStructure(); } // @todo Add call to addAnalyzedVersion ? diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php index 2453a06f8e..7bfb7c043b 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/RecordHelper.php @@ -11,18 +11,21 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper; use appbox; use igorw; class RecordHelper { - private $connection; + private $appbox; + // Computation caches private $collectionMap; + private $fieldStructure; public function __construct(appbox $appbox) { - $this->connection = $appbox->get_connection(); + $this->appbox = $appbox; } public function getUniqueRecordId($databoxId, $recordId) @@ -46,12 +49,13 @@ class RecordHelper private function collectionMap() { if (!$this->collectionMap) { + $connection = $this->appbox->get_connection(); $sql = 'SELECT sbas_id as databox_id, server_coll_id as collection_id, base_id FROM bas'; - $statement = $this->connection->query($sql); + $statement = $connection->query($sql); $map = array(); while ($mapping = $statement->fetch()) { @@ -68,4 +72,97 @@ class RecordHelper { return StringUtils::slugify($key, '_'); } + + public function getFields($includePrivate = false, $onlySearchable = true) + { + $fields = array(); + foreach ($this->getFieldsStructure() as $name => $options) { + // Skip private fields + if ($options['private'] && !$includePrivate) { + continue; + } + // Skip not searchable fields + if ($onlySearchable && !$options['indexable']) { + continue; + } + $fields[] = $name; + } + + return $fields; + } + + public function getFieldsStructure() + { + if (!empty($this->fieldsStructure)) { + return $this->fieldsStructure; + } + + $fields = array(); + + foreach ($this->appbox->get_databoxes() as $databox) { + //printf("Databox %d\n", $databox->get_sbas_id()); + foreach ($databox->get_meta_structure() as $fieldStructure) { + $field = array(); + // Field type + switch ($fieldStructure->get_type()) { + case \databox_field::TYPE_DATE: + $field['type'] = 'date'; + break; + case \databox_field::TYPE_NUMBER: + $field['type'] = 'double'; + break; + case \databox_field::TYPE_STRING: + case \databox_field::TYPE_TEXT: + $field['type'] = 'string'; + break; + default: + throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type())); + break; + } + + $name = $fieldStructure->get_name(); + + // Business rules + $field['private'] = $fieldStructure->isBusiness(); + $field['indexable'] = $fieldStructure->is_indexable(); + $field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); + + // Thesaurus concept inference + // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; + $helper = new ThesaurusHelper(); + + // TODO Not the real option yet + $field['thesaurus_concept_inference'] = $field['type'] === 'string'; + // TODO Find thesaurus path prefixes + $field['thesaurus_prefix'] = '/categories'; + + //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']); + + // Since mapping is merged between databoxes, two fields may + // have conflicting names. Indexing is the same for a given + // type so we reject only those with different types. + if (isset($fields[$name])) { + if ($fields[$name]['type'] !== $field['type']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type'])); + } + + if ($fields[$name]['indexable'] !== $field['indexable']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name)); + } + + if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) { + throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name)); + } + // TODO other structure incompatibilities + + //printf("Merged with previous \"%s\" field\n", $name); + } + + $fields[$name] = $field; + } + } + + $this->fieldsStructure = $fields; + return $this->fieldsStructure; + } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php index 3af8d73fdb..4e05e46ed9 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/Query.php @@ -23,19 +23,9 @@ class Query return $this->root->getTextNodes(); } - /* - * This method seems weird to me, the implementation returns true when the - * query doesn't contain IN statements, but that doesn't define a full text - * search. - */ - public function isFullTextOnly() + public function build(QueryContext $context) { - return $this->root->isFullTextOnly(); - } - - public function getElasticsearchQuery($fields = array()) - { - return $this->root->getQuery($fields); + return $this->root->buildQuery($context); } public function dump() diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php new file mode 100644 index 0000000000..8fc5176da9 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryContext.php @@ -0,0 +1,40 @@ +fields = $fields; + $this->locales = $locales; + $this->queryLocale = $queryLocale; + } + + public function narrowToFields(array $fields) + { + // Ensure we are not escaping from original fields restrictions + $fields = array_intersect($this->fields, $fields); + + return new static($fields, $this->locales, $this->queryLocale); + } + + public function getLocalizedFields() + { + $fields = array(); + foreach ($this->fields as $field) { + foreach ($this->locales as $locale) { + $boost = ($locale === $this->queryLocale) ? '^5' : ''; + $fields[] = sprintf('caption.%s.%s%s', $field, $locale, $boost); + } + // TODO Put generic analyzers on main field instead of "light" sub-field + $fields[] = sprintf('caption.%s.%s', $field, 'light^10'); + } + + return $fields; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php index 75318df71d..efb62efb79 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Search/QueryVisitor.php @@ -119,7 +119,7 @@ class QueryVisitor implements Visit if ($root instanceof AST\TextNode && !$root instanceof AST\QuotedTextNode && !$node instanceof AST\QuotedTextNode) { - $root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText())); + $root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue())); } else { $root = new AST\AndExpression($root, $node); } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php index 6fc1244ab8..66d81aa5a1 100644 --- a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus.php @@ -12,6 +12,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term; +use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface; use Elasticsearch\Client; class Thesaurus @@ -30,24 +33,34 @@ class Thesaurus public function findConceptsBulk(array $terms, $lang = null) { // TODO Use bulk queries for performance + $concepts = array(); + foreach ($terms as $term) { + $concepts[] = $this->findConcepts($term, $lang); + } + + return $concepts; } - public function findConcepts($term, $context = null, $lang = null) + public function findConcepts($term, $lang = null) { + if (!($term instanceof TermInterface)) { + $term = new Term($term); + } + // TODO Check that term queries are ok with multiple words $query = array(); $field = $lang ? sprintf('value.%s', $lang) : 'value.light'; - $query['match'][$field]['query'] = $term; + $query['match'][$field]['query'] = $term->getValue(); $query['match'][$field]['operator'] = 'and'; // Allow 25% of non-matching tokens // (not exactly the same that 75% of matching tokens) // $query['match'][$field]['minimum_should_match'] = '-25%'; - if ($context) { + if ($term->hasContext()) { $term_query = $query; $query = array(); $query['bool']['must'][0] = $term_query; - $query['bool']['must'][1]['term']['context'] = $context; + $query['bool']['must'][1]['term']['context'] = $term->getContext(); } if ($lang) { @@ -86,7 +99,7 @@ class Thesaurus $buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); foreach ($buckets as $bucket) { if (isset($bucket['key'])) { - $concepts[] = $bucket['key']; + $concepts[] = new Concept($bucket['key']); } } diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php new file mode 100644 index 0000000000..b757ce92c0 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Concept.php @@ -0,0 +1,40 @@ +path = (string) $path; + } + + public function getPath() + { + return $this->path; + } + + public function __toString() + { + return $this->path; + } + + public static function toPathArray(array $concepts) + { + foreach ($concepts as $index => $concept) { + $concepts[$index] = $concept->getPath(); + } + return $concepts; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php new file mode 100644 index 0000000000..395b5bd765 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Term.php @@ -0,0 +1,57 @@ +value = (string) $value; + if ($context) { + $this->context = (string) $context; + } + } + + public function getValue() + { + return $this->value; + } + + public function hasContext() + { + return $this->context !== null; + } + + public function getContext() + { + return $this->context; + } + + public function __toString() + { + return self::dump($this); + } + + public static function dump(TermInterface $term) + { + if ($term->hasContext()) { + return sprintf('%s (%s)', $term->getValue(), $term->getContext()); + } + + return $term->getValue(); + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php new file mode 100644 index 0000000000..72a9f05492 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermInterface.php @@ -0,0 +1,19 @@ +