Refactor thesaurus query build

- Look for text nodes and infer the concepts behind each term using the thesaurus
- Use value objects for thesaurus terms and concepts
- Pass a QueryContext holding the allowed fields and locale information when building the Elasticsearch query
- Change type hinting and name of query building method on nodes
- Remove unused method Node#isFullTextOnly()
- Move getFieldsStructure from RecordIndexer to RecordHelper for reusing field structure in SearchEngine
This commit is contained in:
Mathieu Darse
2015-01-15 20:04:46 +01:00
parent f283bf01d1
commit dc2c9f8c7f
21 changed files with 391 additions and 267 deletions

View File

@@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine; use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine;
@@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
$app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) { $app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) {
return new RecordIndexer( return new RecordIndexer(
$app['elasticsearch.record_helper'],
$app['thesaurus'], $app['thesaurus'],
$app['elasticsearch.engine'], $app['elasticsearch.engine'],
$app['phraseanet.appbox'], $app['phraseanet.appbox'],
@@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
); );
}); });
$app['elasticsearch.record_helper'] = $app->share(function ($app) {
return new RecordHelper($app['phraseanet.appbox']);
});
$app['elasticsearch.client'] = $app->share(function($app) { $app['elasticsearch.client'] = $app->share(function($app) {
$options = $app['elasticsearch.options']; $options = $app['elasticsearch.options'];
$clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]]; $clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]];

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class AndExpression extends BinaryOperator class AndExpression extends BinaryOperator
{ {
protected $operator = 'AND'; protected $operator = 'AND';
public function getQuery($fields = ['_all']) public function buildQuery(QueryContext $context)
{ {
$left = $this->left->getQuery($fields); $left = $this->left->buildQuery($context);
$right = $this->right->getQuery($fields); $right = $this->right->buildQuery($context);
return array( return array(
'bool' => array( 'bool' => array(

View File

@@ -26,10 +26,4 @@ abstract class BinaryOperator extends Node
$this->right->getTextNodes() $this->right->getTextNodes()
); );
} }
public function isFullTextOnly()
{
return $this->left->isFullTextOnly()
&& $this->right->isFullTextOnly();
}
} }

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class ExceptExpression extends BinaryOperator class ExceptExpression extends BinaryOperator
{ {
protected $operator = 'EXCEPT'; protected $operator = 'EXCEPT';
public function getQuery($fields = ['_all']) public function buildQuery(QueryContext $context)
{ {
$left = $this->left->getQuery($fields); $left = $this->left->buildQuery($context);
$right = $this->right->getQuery($fields); $right = $this->right->buildQuery($context);
return array( return array(
'bool' => array( 'bool' => array(

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class FieldNode extends Node class FieldNode extends Node
{ {
protected $keyword; protected $keyword;
@@ -16,7 +18,7 @@ class FieldNode extends Node
return $this->keyword; return $this->keyword;
} }
public function getQuery() public function buildQuery(QueryContext $context)
{ {
throw new \LogicException("A keyword can't be converted to a query."); throw new \LogicException("A keyword can't be converted to a query.");
} }
@@ -30,9 +32,4 @@ class FieldNode extends Node
{ {
return sprintf('<field:%s>', $this->keyword); return sprintf('<field:%s>', $this->keyword);
} }
public function isFullTextOnly()
{
return false;
}
} }

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class InExpression extends Node class InExpression extends Node
{ {
protected $field; protected $field;
@@ -13,9 +15,11 @@ class InExpression extends Node
$this->expression = $expression; $this->expression = $expression;
} }
public function getQuery() public function buildQuery(QueryContext $context)
{ {
return $this->expression->getQuery($this->field->getValue()); $fields = array($this->field->getValue());
return $this->expression->buildQuery($context->narrowToFields($fields));
} }
public function getTextNodes() public function getTextNodes()
@@ -27,10 +31,4 @@ class InExpression extends Node
{ {
return sprintf('(%s IN %s)', $this->expression, $this->field); return sprintf('(%s IN %s)', $this->expression, $this->field);
} }
public function isFullTextOnly()
{
// In expressions are never full-text
return false;
}
} }

View File

@@ -2,17 +2,14 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
abstract class Node abstract class Node
{ {
/** /**
* @return array The Elasticsearch formatted query * @return array The Elasticsearch formatted query
*/ */
abstract public function getQuery(); abstract public function buildQuery(QueryContext $context);
/**
* @return bool Tell if the node and it's child are full-text queries only
*/
abstract public function isFullTextOnly();
abstract public function getTextNodes(); abstract public function getTextNodes();
} }

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class OrExpression extends BinaryOperator class OrExpression extends BinaryOperator
{ {
protected $operator = 'OR'; protected $operator = 'OR';
public function getQuery($fields = ['_all']) public function buildQuery(QueryContext $context)
{ {
$left = $this->left->getQuery($fields); $left = $this->left->buildQuery($context);
$right = $this->right->getQuery($fields); $right = $this->right->buildQuery($context);
return array( return array(
'bool' => array( 'bool' => array(

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class PrefixNode extends Node class PrefixNode extends Node
{ {
protected $prefix; protected $prefix;
@@ -11,11 +13,11 @@ class PrefixNode extends Node
$this->prefix = $prefix; $this->prefix = $prefix;
} }
public function getQuery($fields = ['_all']) public function buildQuery(QueryContext $context)
{ {
return array( return array(
'multi_match' => array( 'multi_match' => array(
'fields' => $fields, 'fields' => $context->getLocalizedFields(),
'query' => $this->prefix, 'query' => $this->prefix,
'type' => 'phrase_prefix' 'type' => 'phrase_prefix'
) )
@@ -26,9 +28,4 @@ class PrefixNode extends Node
{ {
return sprintf('prefix("%s")', $this->prefix); return sprintf('prefix("%s")', $this->prefix);
} }
public function isFullTextOnly()
{
return true;
}
} }

View File

@@ -2,22 +2,19 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class QuotedTextNode extends TextNode class QuotedTextNode extends TextNode
{ {
public function getQuery($fields = ['_all']) public function buildQuery(QueryContext $context)
{ {
return array( return array(
'multi_match' => array( 'multi_match' => array(
'type' => 'phrase', 'type' => 'phrase',
'fields' => $fields, 'fields' => $context->getLocalizedFields(),
'query' => $this->text, 'query' => $this->text,
// 'operator' => 'and' // 'operator' => 'and'
) )
); );
} }
public function isFullTextOnly()
{
return true;
}
} }

View File

@@ -2,23 +2,43 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST; namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
class TextNode extends Node use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
class TextNode extends Node implements TermInterface
{ {
protected $text; protected $text;
protected $concepts = array();
public function __construct($text) public function __construct($text)
{ {
$this->text = $text; $this->text = $text;
} }
public function getQuery($fields = ['_all']) public function setConcepts(array $concepts)
{ {
return array( $this->concepts = $concepts;
}
public function buildQuery(QueryContext $context)
{
$query = array(
'multi_match' => array( 'multi_match' => array(
'fields' => $fields, 'fields' => $context->getLocalizedFields(),
'query' => $this->text, 'query' => $this->text,
) )
); );
if ($this->concepts) {
$shoulds = array($query);
foreach ($this->concepts as $concept) {
$shoulds[]['term']['concept_paths'] = $concept->getPath();
}
$query = array();
$query['bool']['should'] = $shoulds;
}
return $query;
} }
public function getTextNodes() public function getTextNodes()
@@ -31,13 +51,22 @@ class TextNode extends Node
return sprintf('"%s"', $this->text); return sprintf('"%s"', $this->text);
} }
public function isFullTextOnly()
{
return true;
}
public function getText() // Implementation of TermInterface
public function getValue()
{ {
return $this->text; return $this->text;
} }
public function hasContext()
{
return false;
}
public function getContext()
{
// TODO Insert context during parsing
return null;
}
} }

View File

@@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery; use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\SearchEngineInterface; use Alchemy\Phrasea\SearchEngine\SearchEngineInterface;
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions; use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
use Alchemy\Phrasea\SearchEngine\SearchEngineResult; use Alchemy\Phrasea\SearchEngine\SearchEngineResult;
@@ -280,83 +282,26 @@ class ElasticSearchEngine implements SearchEngineInterface
$query['_ast'] = $searchQuery->dump(); $query['_ast'] = $searchQuery->dump();
$thesaurus = $this->app['thesaurus']; $thesaurus = $this->app['thesaurus'];
foreach ($searchQuery->getTextNodes() as $textNode) { $textNodes = $searchQuery->getTextNodes();
$text = $textNode->getText(); $concepts = $thesaurus->findConceptsBulk($textNodes);
$concepts = $thesaurus->findConcepts($text);
$query['_thesaurus_concepts'][$text] = $concepts; foreach ($concepts as $index => $termConcepts) {
$node = $textNodes[$index];
$node->setConcepts($termConcepts);
$term = Term::dump($node);
$query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts);
} }
// $concepts = $thesaurus->findConceptsBulk($terms); $recordHelper = $this->app['elasticsearch.record_helper'];
// TODO Pass options to getFields to include/exclude private fields
$searchableFields = $recordHelper->getFields();
$queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']);
$recordQuery = $searchQuery->build($queryContext);
// Contains the full thesaurus paths to search on
$pathsToFilter = [];
// Contains the thesaurus values by fields (synonyms, translations, etc)
$collectFields = [];
// Only search in thesaurus for full text search
if ($searchQuery->isFullTextOnly()) {
$termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']);
$termsQuery = $searchQuery->getElasticsearchQuery($termFields);
$params = $this->createTermQueryParams($termsQuery, $options);
$terms = $this->doExecute('search', $params);
foreach ($terms['hits']['hits'] as $term) {
// Skip paths with very low score
if ($term['_score'] < 1) {
continue;
}
$pathsToFilter[$term['_source']['path']] = $term['_score'];
foreach ($term['_source']['fields'] as $field) {
$collectFields['caption.'.$field][] = $term['_source']['value'];
}
}
$pathsToFilter = array_unique($pathsToFilter);
}
if (empty($collectFields)) {
// @todo a list of field by default? all fields?
$searchFieldNames = ['caption.*'];
} else {
$searchFieldNames = array_keys($collectFields);
}
$recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']);
$recordQuery = [
'bool' => [
'should' => [
$searchQuery->getElasticsearchQuery($recordFields)
]
]
];
foreach ($pathsToFilter as $path => $score) {
// Also match incomplete path. /a/b/c will return /a/b/c/d records
$recordQuery['bool']['should'][] = [
'match' => [
'concept_paths' => array(
'query' => $path,
'boost' => $score,
)
]
];
// Add signal for exact path only
$recordQuery['bool']['should'][] = [
'term' => [
'concept_paths.raw' => array(
'value' => $path,
'boost' => $score,
)
]
];
}
$params = $this->createRecordQueryParams($recordQuery, $options, null); $params = $this->createRecordQueryParams($recordQuery, $options, null);
$params['body']['from'] = $offset; $params['body']['from'] = $offset;
$params['body']['size'] = $perPage; $params['body']['size'] = $perPage;
@@ -390,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface
$results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++); $results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++);
} }
$query['_searchable_fields'] = $searchableFields;
$query['_ast'] = $searchQuery->dump(); $query['_ast'] = $searchQuery->dump();
$query['_paths'] = $pathsToFilter; // $query['_paths'] = $pathsToFilter;
$query['_richFields'] = $collectFields; // $query['_richFields'] = $collectFields;
$query['query'] = json_encode($params);
$queryyy = $recordQuery;
// $queryyy = $params['body'];
$query['query'] = $queryyy;
$query['query_as_string'] = json_encode($queryyy);
return new SearchEngineResult($results, json_encode($query), $res['took'], $offset, return new SearchEngineResult($results, json_encode($query), $res['took'], $offset,
$res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [], $res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [],
@@ -584,36 +534,6 @@ class ElasticSearchEngine implements SearchEngineInterface
return $res; return $res;
} }
/**
* @param array|string $fields
* @param array|null $locales
* @param null $currentLocale
* @return array
*/
public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null)
{
$fieldsExpended = [];
if (!$locales) {
$locales = $this->locales;
}
foreach ((array) $fields as $field) {
foreach ($locales as $locale) {
$boost = "";
if ($locale === $currentLocale) {
$boost = "^5";
}
$fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost);
}
$fieldsExpended[] = sprintf('%s.%s', $field, 'light^10');
}
return $fieldsExpended;
}
private function getFlagsKey(\appbox $appbox) private function getFlagsKey(\appbox $appbox)
{ {
$flags = []; $flags = [];

View File

@@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper; use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils; use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use media_subdef; use media_subdef;
class RecordIndexer class RecordIndexer
{ {
const TYPE_NAME = 'record'; const TYPE_NAME = 'record';
private $helper;
private $thesaurus;
/** /**
* @var \appbox * @var \appbox
*/ */
@@ -42,10 +45,9 @@ class RecordIndexer
*/ */
private $locales; private $locales;
private $dataStructure; public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
{ {
$this->helper = $helper;
$this->thesaurus = $thesaurus; $this->thesaurus = $thesaurus;
$this->appbox = $appbox; $this->appbox = $appbox;
$this->elasticSearchEngine = $elasticSearchEngine; $this->elasticSearchEngine = $elasticSearchEngine;
@@ -54,11 +56,8 @@ class RecordIndexer
public function populateIndex(BulkOperation $bulk) public function populateIndex(BulkOperation $bulk)
{ {
// Helper to fetch record related data
$recordHelper = new RecordHelper($this->appbox);
foreach ($this->appbox->get_databoxes() as $databox) { foreach ($this->appbox->get_databoxes() as $databox) {
$fetcher = new RecordFetcher($databox, $recordHelper); $fetcher = new RecordFetcher($databox, $this->helper);
$fetcher->setBatchSize(200); $fetcher->setBatchSize(200);
while ($records = $fetcher->fetch()) { while ($records = $fetcher->fetch()) {
foreach ($records as $record) { foreach ($records as $record) {
@@ -74,9 +73,7 @@ class RecordIndexer
public function indexSingleRecord(\record_adapter $record_adapter, $indexName) public function indexSingleRecord(\record_adapter $record_adapter, $indexName)
{ {
// Helper to fetch record related data $fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper);
$recordHelper = new RecordHelper($this->appbox);
$fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper);
$record = $fetcher->fetchOne($record_adapter); $record = $fetcher->fetchOne($record_adapter);
$params = array(); $params = array();
@@ -218,77 +215,7 @@ class RecordIndexer
private function getFieldsStructure() private function getFieldsStructure()
{ {
if (!empty($this->dataStructure)) { return $this->helper->getFieldsStructure();
return $this->dataStructure;
}
$fields = array();
foreach ($this->appbox->get_databoxes() as $databox) {
//printf("Databox %d\n", $databox->get_sbas_id());
foreach ($databox->get_meta_structure() as $fieldStructure) {
$field = array();
// Field type
switch ($fieldStructure->get_type()) {
case \databox_field::TYPE_DATE:
$field['type'] = 'date';
break;
case \databox_field::TYPE_NUMBER:
$field['type'] = 'double';
break;
case \databox_field::TYPE_STRING:
case \databox_field::TYPE_TEXT:
$field['type'] = 'string';
break;
default:
throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
break;
}
$name = $fieldStructure->get_name();
// Business rules
$field['private'] = $fieldStructure->isBusiness();
$field['indexable'] = $fieldStructure->is_indexable();
$field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
// Thesaurus concept inference
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
$helper = new ThesaurusHelper();
// TODO Not the real option yet
$field['thesaurus_concept_inference'] = $field['type'] === 'string';
// TODO Find thesaurus path prefixes
$field['thesaurus_prefix'] = '/categories';
//printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);
// Since mapping is merged between databoxes, two fields may
// have conflicting names. Indexing is the same for a given
// type so we reject only those with different types.
if (isset($fields[$name])) {
if ($fields[$name]['type'] !== $field['type']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
}
if ($fields[$name]['indexable'] !== $field['indexable']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name));
}
if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
}
// TODO other structure incompatibilities
//printf("Merged with previous \"%s\" field\n", $name);
}
$fields[$name] = $field;
}
}
$this->dataStructure = $fields;
return $this->dataStructure;
} }
// @todo Add call to addAnalyzedVersion ? // @todo Add call to addAnalyzedVersion ?

View File

@@ -11,18 +11,21 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic; namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use appbox; use appbox;
use igorw; use igorw;
class RecordHelper class RecordHelper
{ {
private $connection; private $appbox;
// Computation caches
private $collectionMap; private $collectionMap;
private $fieldStructure;
public function __construct(appbox $appbox) public function __construct(appbox $appbox)
{ {
$this->connection = $appbox->get_connection(); $this->appbox = $appbox;
} }
public function getUniqueRecordId($databoxId, $recordId) public function getUniqueRecordId($databoxId, $recordId)
@@ -46,12 +49,13 @@ class RecordHelper
private function collectionMap() private function collectionMap()
{ {
if (!$this->collectionMap) { if (!$this->collectionMap) {
$connection = $this->appbox->get_connection();
$sql = 'SELECT $sql = 'SELECT
sbas_id as databox_id, sbas_id as databox_id,
server_coll_id as collection_id, server_coll_id as collection_id,
base_id base_id
FROM bas'; FROM bas';
$statement = $this->connection->query($sql); $statement = $connection->query($sql);
$map = array(); $map = array();
while ($mapping = $statement->fetch()) { while ($mapping = $statement->fetch()) {
@@ -68,4 +72,97 @@ class RecordHelper
{ {
return StringUtils::slugify($key, '_'); return StringUtils::slugify($key, '_');
} }
/**
 * Lists the names of the fields described by getFieldsStructure().
 *
 * @param bool $includePrivate When false, fields flagged "private" are left out
 * @param bool $onlySearchable When true, fields not flagged "indexable" are left out
 * @return array Field names, in the iteration order of the fields structure
 */
public function getFields($includePrivate = false, $onlySearchable = true)
{
    $names = array();

    foreach ($this->getFieldsStructure() as $name => $options) {
        // Private fields are kept only on explicit request
        $visible = !$options['private'] || $includePrivate;

        // The "indexable" flag only matters when restricting to searchable fields
        if ($visible && (!$onlySearchable || $options['indexable'])) {
            $names[] = $name;
        }
    }

    return $names;
}
/**
 * Builds (once) the merged description of the metadata fields of every databox.
 *
 * Each entry is keyed by field name and holds the keys: type, private,
 * indexable, to_aggregate, thesaurus_concept_inference and thesaurus_prefix.
 *
 * @return array Field options indexed by field name
 * @throws \Exception     When a field type is not date, number, string or text
 * @throws MergeException When same-named fields from two databoxes differ in
 *                        type, indexable state or to_aggregate state
 */
public function getFieldsStructure()
{
    // Cache lives in the declared $fieldStructure property; null means
    // "not computed yet", so an empty structure is cached as well.
    if ($this->fieldStructure !== null) {
        return $this->fieldStructure;
    }

    $fields = array();

    foreach ($this->appbox->get_databoxes() as $databox) {
        foreach ($databox->get_meta_structure() as $fieldStructure) {
            $field = array();

            // Field type
            switch ($fieldStructure->get_type()) {
                case \databox_field::TYPE_DATE:
                    $field['type'] = 'date';
                    break;
                case \databox_field::TYPE_NUMBER:
                    $field['type'] = 'double';
                    break;
                case \databox_field::TYPE_STRING:
                case \databox_field::TYPE_TEXT:
                    $field['type'] = 'string';
                    break;
                default:
                    // Fully qualified: an unqualified "Exception" would be
                    // resolved inside the current namespace.
                    throw new \Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
            }

            $name = $fieldStructure->get_name();

            // Business rules
            $field['private'] = $fieldStructure->isBusiness();
            $field['indexable'] = $fieldStructure->is_indexable();
            $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();

            // Thesaurus concept inference
            // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
            // NOTE(review): $helper is never read below; kept in case the
            // constructor matters - confirm and drop if it does not.
            $helper = new ThesaurusHelper();
            // TODO Not the real option yet
            $field['thesaurus_concept_inference'] = $field['type'] === 'string';
            // TODO Find thesaurus path prefixes
            $field['thesaurus_prefix'] = '/categories';

            // Since mapping is merged between databoxes, two fields may
            // have conflicting names. Indexing is the same for a given
            // type so we reject only those with different types.
            // NOTE(review): no import of MergeException is visible in this
            // file - confirm the name resolves from this namespace.
            if (isset($fields[$name])) {
                if ($fields[$name]['type'] !== $field['type']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
                }

                if ($fields[$name]['indexable'] !== $field['indexable']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name));
                }

                if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
                }
                // TODO other structure incompatibilities
            }

            // The latest definition wins once it passed the compatibility checks
            $fields[$name] = $field;
        }
    }

    $this->fieldStructure = $fields;

    return $this->fieldStructure;
}
} }

View File

@@ -23,19 +23,9 @@ class Query
return $this->root->getTextNodes(); return $this->root->getTextNodes();
} }
/* public function build(QueryContext $context)
* This method seems weird to me, the implementation returns true when the
* query doesn't contain IN statements, but that doesn't define a full text
* search.
*/
public function isFullTextOnly()
{ {
return $this->root->isFullTextOnly(); return $this->root->buildQuery($context);
}
public function getElasticsearchQuery($fields = array())
{
return $this->root->getQuery($fields);
} }
public function dump() public function dump()

View File

@@ -0,0 +1,40 @@
<?php
namespace Alchemy\Phrasea\SearchEngine\Elastic\Search;
/**
 * Holds what a query node needs to know when it is turned into an
 * Elasticsearch query: the field names it may target, the available locales
 * and the locale the query was written in.
 */
class QueryContext
{
    /** @var array Field names the query is allowed to target */
    private $fields;
    /** @var array Locale codes used to build the per-locale sub-field names */
    private $locales;
    /** @var string Locale whose sub-fields receive the "^5" boost */
    private $queryLocale;

    /**
     * @param array  $fields      Allowed field names
     * @param array  $locales     Locale codes
     * @param string $queryLocale Locale of the query
     */
    public function __construct(array $fields, array $locales, $queryLocale)
    {
        $this->queryLocale = $queryLocale;
        $this->locales = $locales;
        $this->fields = $fields;
    }

    /**
     * Returns a new context restricted to the requested fields.
     *
     * Only fields already present in this context are kept, so narrowing can
     * never widen the original restriction.
     *
     * @param array $fields Requested field names
     * @return static
     */
    public function narrowToFields(array $fields)
    {
        $allowed = array_intersect($this->fields, $fields);

        return new static($allowed, $this->locales, $this->queryLocale);
    }

    /**
     * Expands every field into its "caption.<field>.<locale>" sub-fields
     * (with "^5" appended for the query locale), followed by the
     * "caption.<field>.light^10" sub-field.
     *
     * @return array Field specifications, grouped field by field
     */
    public function getLocalizedFields()
    {
        $localized = array();

        foreach ($this->fields as $name) {
            foreach ($this->locales as $code) {
                if ($code === $this->queryLocale) {
                    $suffix = '^5';
                } else {
                    $suffix = '';
                }
                $localized[] = sprintf('caption.%s.%s%s', $name, $code, $suffix);
            }

            // TODO Put generic analyzers on main field instead of "light" sub-field
            $localized[] = sprintf('caption.%s.%s', $name, 'light^10');
        }

        return $localized;
    }
}

View File

@@ -119,7 +119,7 @@ class QueryVisitor implements Visit
if ($root instanceof AST\TextNode && if ($root instanceof AST\TextNode &&
!$root instanceof AST\QuotedTextNode && !$root instanceof AST\QuotedTextNode &&
!$node instanceof AST\QuotedTextNode) { !$node instanceof AST\QuotedTextNode) {
$root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText())); $root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue()));
} else { } else {
$root = new AST\AndExpression($root, $node); $root = new AST\AndExpression($root, $node);
} }

View File

@@ -12,6 +12,9 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic; namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer; use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
use Elasticsearch\Client; use Elasticsearch\Client;
class Thesaurus class Thesaurus
@@ -30,24 +33,34 @@ class Thesaurus
public function findConceptsBulk(array $terms, $lang = null) public function findConceptsBulk(array $terms, $lang = null)
{ {
// TODO Use bulk queries for performance // TODO Use bulk queries for performance
$concepts = array();
foreach ($terms as $term) {
$concepts[] = $this->findConcepts($term, $lang);
}
return $concepts;
} }
public function findConcepts($term, $context = null, $lang = null) public function findConcepts($term, $lang = null)
{ {
if (!($term instanceof TermInterface)) {
$term = new Term($term);
}
// TODO Check that term queries are ok with multiple words // TODO Check that term queries are ok with multiple words
$query = array(); $query = array();
$field = $lang ? sprintf('value.%s', $lang) : 'value.light'; $field = $lang ? sprintf('value.%s', $lang) : 'value.light';
$query['match'][$field]['query'] = $term; $query['match'][$field]['query'] = $term->getValue();
$query['match'][$field]['operator'] = 'and'; $query['match'][$field]['operator'] = 'and';
// Allow 25% of non-matching tokens // Allow 25% of non-matching tokens
// (not exactly the same that 75% of matching tokens) // (not exactly the same that 75% of matching tokens)
// $query['match'][$field]['minimum_should_match'] = '-25%'; // $query['match'][$field]['minimum_should_match'] = '-25%';
if ($context) { if ($term->hasContext()) {
$term_query = $query; $term_query = $query;
$query = array(); $query = array();
$query['bool']['must'][0] = $term_query; $query['bool']['must'][0] = $term_query;
$query['bool']['must'][1]['term']['context'] = $context; $query['bool']['must'][1]['term']['context'] = $term->getContext();
} }
if ($lang) { if ($lang) {
@@ -86,7 +99,7 @@ class Thesaurus
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []); $buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []);
foreach ($buckets as $bucket) { foreach ($buckets as $bucket) {
if (isset($bucket['key'])) { if (isset($bucket['key'])) {
$concepts[] = $bucket['key']; $concepts[] = new Concept($bucket['key']);
} }
} }

View File

@@ -0,0 +1,40 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
/**
 * Value object wrapping the path of a thesaurus concept.
 */
class Concept
{
    /** @var string */
    private $path;

    /**
     * @param mixed $path Concept path; cast to string
     */
    public function __construct($path)
    {
        $this->path = (string) $path;
    }

    /**
     * @return string The concept path
     */
    public function getPath()
    {
        return $this->path;
    }

    /**
     * @return string Same value as getPath()
     */
    public function __toString()
    {
        return $this->getPath();
    }

    /**
     * Converts a list of concepts to the list of their paths.
     *
     * @param array $concepts Concept instances; keys are preserved
     * @return array Paths under the same keys
     */
    public static function toPathArray(array $concepts)
    {
        // With a single input array, array_map keeps the original keys
        return array_map(function ($concept) {
            return $concept->getPath();
        }, $concepts);
    }
}

View File

@@ -0,0 +1,57 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode;
/**
 * Plain thesaurus term: a value with an optional context.
 */
class Term implements TermInterface
{
    /** @var string */
    private $value;
    /** @var string|null Null when the term has no context */
    private $context;

    /**
     * @param mixed $value   Term value; cast to string
     * @param mixed $context Optional context; any falsy value means "no context"
     */
    public function __construct($value, $context = null)
    {
        $this->value = (string) $value;
        $this->context = $context ? (string) $context : null;
    }

    /**
     * @return string
     */
    public function getValue()
    {
        return $this->value;
    }

    /**
     * @return bool True when a context was given at construction
     */
    public function hasContext()
    {
        return null !== $this->context;
    }

    /**
     * @return string|null
     */
    public function getContext()
    {
        return $this->context;
    }

    /**
     * @return string Same representation as dump()
     */
    public function __toString()
    {
        return self::dump($this);
    }

    /**
     * Renders any term as "value", or "value (context)" when it has a context.
     *
     * @param TermInterface $term
     * @return string
     */
    public static function dump(TermInterface $term)
    {
        if (!$term->hasContext()) {
            return $term->getValue();
        }

        return sprintf('%s (%s)', $term->getValue(), $term->getContext());
    }
}

View File

@@ -0,0 +1,19 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
/**
 * Contract for anything that can be looked up in the thesaurus as a term
 * (implemented by Term and by the TextNode AST node).
 */
interface TermInterface
{
/**
 * @return string The term text; Thesaurus::findConcepts() uses it as the match query
 */
public function getValue();
/**
 * @return bool Whether a context is available; when true, Thesaurus::findConcepts()
 *              adds a term filter on the "context" field
 */
public function hasContext();
/**
 * @return string|null The context used for that filter; implementations in this
 *                     changeset return null when there is none
 */
public function getContext();
}