Merge pull request #23 from mdarse/thesaurus-relevance

Enhance thesaurus search relevance
This commit is contained in:
Mathieu Darse
2015-01-20 18:41:43 +01:00
25 changed files with 578 additions and 352 deletions

View File

@@ -13,6 +13,8 @@ namespace Alchemy\Phrasea\Command\Thesaurus;
use Alchemy\Phrasea\Command\Command;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
@@ -30,12 +32,23 @@ class FindConceptsCommand extends Command
InputArgument::REQUIRED,
'Reverse search a term to infer concepts'
)
->addArgument(
'context',
InputArgument::OPTIONAL,
'Restrict search to a specific term context'
)
->addOption(
'locale',
null,
InputOption::VALUE_REQUIRED,
'Specify input locale'
)
->addOption(
'broad',
null,
InputOption::VALUE_NONE,
'Keep broad concepts (discards narrower concepts)'
)
->addOption(
'raw',
null,
@@ -48,16 +61,26 @@ class FindConceptsCommand extends Command
protected function doExecute(InputInterface $input, OutputInterface $output)
{
$term = $input->getArgument('term');
$context = $input->getArgument('context');
$raw = $input->getOption('raw');
if (!$raw) {
$output->writeln(sprintf('Finding linked concepts: <comment>%s</comment>', $term));
$message = sprintf('Finding linked concepts: <comment>%s</comment>', $term);
if ($context) {
$message .= sprintf(' (with context <comment>%s</comment>)', $context);
}
$output->writeln($message);
$output->writeln(str_repeat('-', 20));
}
$thesaurus = $this->container['thesaurus'];
$term = new Term($term, $context);
$locale = $input->getOption('locale');
$concepts = $thesaurus->findConcepts($term, null, $locale);
$concepts = $thesaurus->findConcepts($term, $locale);
if ($input->getOption('broad')) {
$concepts = Concept::pruneNarrowConcepts($concepts);
}
if (count($concepts)) {
$output->writeln($concepts);

View File

@@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryParser;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine;
@@ -87,6 +88,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
$app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) {
return new RecordIndexer(
$app['elasticsearch.record_helper'],
$app['thesaurus'],
$app['elasticsearch.engine'],
$app['phraseanet.appbox'],
@@ -94,6 +96,10 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
);
});
$app['elasticsearch.record_helper'] = $app->share(function ($app) {
return new RecordHelper($app['phraseanet.appbox']);
});
$app['elasticsearch.client'] = $app->share(function($app) {
$options = $app['elasticsearch.options'];
$clientParams = ['hosts' => [sprintf('%s:%s', $options['host'], $options['port'])]];

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class AndExpression extends BinaryOperator
{
protected $operator = 'AND';
public function getQuery($fields = ['_all'])
public function buildQuery(QueryContext $context)
{
$left = $this->left->getQuery($fields);
$right = $this->right->getQuery($fields);
$left = $this->left->buildQuery($context);
$right = $this->right->buildQuery($context);
return array(
'bool' => array(

View File

@@ -19,9 +19,11 @@ abstract class BinaryOperator extends Node
return sprintf('(%s %s %s)', $this->left, $this->operator, $this->right);
}
public function isFullTextOnly()
public function getTextNodes()
{
return $this->left->isFullTextOnly()
&& $this->right->isFullTextOnly();
return array_merge(
$this->left->getTextNodes(),
$this->right->getTextNodes()
);
}
}

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class ExceptExpression extends BinaryOperator
{
protected $operator = 'EXCEPT';
public function getQuery($fields = ['_all'])
public function buildQuery(QueryContext $context)
{
$left = $this->left->getQuery($fields);
$right = $this->right->getQuery($fields);
$left = $this->left->buildQuery($context);
$right = $this->right->buildQuery($context);
return array(
'bool' => array(

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class FieldNode extends Node
{
protected $keyword;
@@ -16,18 +18,18 @@ class FieldNode extends Node
return $this->keyword;
}
public function getQuery()
public function buildQuery(QueryContext $context)
{
throw new \LogicException("A keyword can't be converted to a query.");
}
public function __toString()
public function getTextNodes()
{
return sprintf('<%s>', $this->keyword);
throw new \LogicException("A keyword can't contain text nodes.");
}
public function isFullTextOnly()
public function __toString()
{
return false;
return sprintf('<field:%s>', $this->keyword);
}
}

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class InExpression extends Node
{
protected $field;
@@ -13,19 +15,20 @@ class InExpression extends Node
$this->expression = $expression;
}
public function getQuery()
public function buildQuery(QueryContext $context)
{
return $this->expression->getQuery($this->field->getValue());
$fields = array($this->field->getValue());
return $this->expression->buildQuery($context->narrowToFields($fields));
}
public function getTextNodes()
{
return $this->expression->getTextNodes();
}
public function __toString()
{
return sprintf('(%s IN %s)', $this->expression, $this->field);
}
public function isFullTextOnly()
{
// In expressions are never full-text
return false;
}
}

View File

@@ -2,15 +2,14 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
abstract class Node
{
/**
* @return array The Elasticsearch formatted query
*/
abstract public function getQuery();
abstract public function buildQuery(QueryContext $context);
/**
* @return TextNode[] The text nodes contained in this node and its children
*/
abstract public function isFullTextOnly();
abstract public function getTextNodes();
}

View File

@@ -2,9 +2,11 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class NullQueryNode extends Node
{
public function getQuery()
public function buildQuery(QueryContext $context)
{
return array('match_all' => array());
}
@@ -18,9 +20,4 @@ class NullQueryNode extends Node
{
return '<NULL>';
}
public function isFullTextOnly()
{
return false;
}
}

View File

@@ -2,14 +2,16 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class OrExpression extends BinaryOperator
{
protected $operator = 'OR';
public function getQuery($fields = ['_all'])
public function buildQuery(QueryContext $context)
{
$left = $this->left->getQuery($fields);
$right = $this->right->getQuery($fields);
$left = $this->left->buildQuery($context);
$right = $this->right->buildQuery($context);
return array(
'bool' => array(

View File

@@ -2,6 +2,8 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class PrefixNode extends Node
{
protected $prefix;
@@ -11,11 +13,11 @@ class PrefixNode extends Node
$this->prefix = $prefix;
}
public function getQuery($fields = ['_all'])
public function buildQuery(QueryContext $context)
{
return array(
'multi_match' => array(
'fields' => $fields,
'fields' => $context->getLocalizedFields(),
'query' => $this->prefix,
'type' => 'phrase_prefix'
)
@@ -26,9 +28,4 @@ class PrefixNode extends Node
{
return sprintf('prefix("%s")', $this->prefix);
}
public function isFullTextOnly()
{
return true;
}
}

View File

@@ -2,22 +2,19 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
class QuotedTextNode extends TextNode
{
public function getQuery($fields = ['_all'])
public function buildQuery(QueryContext $context)
{
return array(
'multi_match' => array(
'type' => 'phrase',
'fields' => $fields,
'fields' => $context->getLocalizedFields(),
'query' => $this->text,
// 'operator' => 'and'
)
);
}
public function isFullTextOnly()
{
return true;
}
}

View File

@@ -2,23 +2,49 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\AST;
class TextNode extends Node
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
class TextNode extends Node implements TermInterface
{
protected $text;
protected $concepts = array();
public function __construct($text)
{
$this->text = $text;
}
public function getQuery($fields = ['_all'])
public function setConcepts(array $concepts)
{
return array(
$this->concepts = $concepts;
}
public function buildQuery(QueryContext $context)
{
$query = array(
'multi_match' => array(
'fields' => $fields,
'fields' => $context->getLocalizedFields(),
'query' => $this->text,
)
);
if ($this->concepts) {
$shoulds = array($query);
foreach (Concept::pruneNarrowConcepts($this->concepts) as $concept) {
$shoulds[]['term']['concept_paths'] = $concept->getPath();
}
$query = array();
$query['bool']['should'] = $shoulds;
}
return $query;
}
public function getTextNodes()
{
return array($this);
}
public function __toString()
@@ -26,13 +52,22 @@ class TextNode extends Node
return sprintf('"%s"', $this->text);
}
public function isFullTextOnly()
{
return true;
}
public function getText()
// Implementation of TermInterface
public function getValue()
{
return $this->text;
}
public function hasContext()
{
return false;
}
public function getContext()
{
// TODO Insert context during parsing
return null;
}
}

View File

@@ -13,7 +13,9 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\SearchQuery;
use Alchemy\Phrasea\SearchEngine\Elastic\Search\QueryContext;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\SearchEngineInterface;
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
use Alchemy\Phrasea\SearchEngine\SearchEngineResult;
@@ -278,72 +280,27 @@ class ElasticSearchEngine implements SearchEngineInterface
$searchQuery = $this->app['query_parser']->parse($string);
// Contains the full thesaurus paths to search on
$pathsToFilter = [];
// Contains the thesaurus values by fields (synonyms, translations, etc)
$collectFields = [];
$query['_ast'] = $searchQuery->dump();
// Only search in thesaurus for full text search
if ($searchQuery->isFullTextOnly()) {
$termFields = $this->expendToAnalyzedFieldsNames('value', null, $this->app['locale']);
$termsQuery = $searchQuery->getElasticsearchQuery($termFields);
$params = $this->createTermQueryParams($termsQuery, $options);
$terms = $this->doExecute('search', $params);
foreach ($terms['hits']['hits'] as $term) {
// Skip paths with very low score
if ($term['_score'] < 1) {
continue;
$thesaurus = $this->app['thesaurus'];
$textNodes = $searchQuery->getTextNodes();
$concepts = $thesaurus->findConceptsBulk($textNodes);
foreach ($concepts as $index => $termConcepts) {
$node = $textNodes[$index];
$node->setConcepts($termConcepts);
$term = Term::dump($node);
$query['_thesaurus_concepts'][$term] = Concept::toPathArray($termConcepts);
}
$pathsToFilter[$term['_source']['path']] = $term['_score'];
$recordHelper = $this->app['elasticsearch.record_helper'];
// TODO Pass options to getFields to include/exclude private fields
$searchableFields = $recordHelper->getFields();
$queryContext = new QueryContext($searchableFields, $this->locales, $this->app['locale']);
$recordQuery = $searchQuery->build($queryContext);
foreach ($term['_source']['fields'] as $field) {
$collectFields['caption.'.$field][] = $term['_source']['value'];
}
}
$pathsToFilter = array_unique($pathsToFilter);
}
if (empty($collectFields)) {
// @todo a list of field by default? all fields?
$searchFieldNames = ['caption.*'];
} else {
$searchFieldNames = array_keys($collectFields);
}
$recordFields = $this->expendToAnalyzedFieldsNames($searchFieldNames, null, $this->app['locale']);
$recordQuery = [
'bool' => [
'should' => [
$searchQuery->getElasticsearchQuery($recordFields)
]
]
];
foreach ($pathsToFilter as $path => $score) {
// Also match incomplete path. /a/b/c will return /a/b/c/d records
$recordQuery['bool']['should'][] = [
'match' => [
'concept_paths' => array(
'query' => $path,
'boost' => $score,
)
]
];
// Add signal for exact path only
$recordQuery['bool']['should'][] = [
'term' => [
'concept_paths.raw' => array(
'value' => $path,
'boost' => $score,
)
]
];
}
$params = $this->createRecordQueryParams($recordQuery, $options, null);
$params['body']['from'] = $offset;
@@ -378,10 +335,15 @@ class ElasticSearchEngine implements SearchEngineInterface
$results[] = ElasticsearchRecordHydrator::hydrate($hit['_source'], $n++);
}
$query['_searchable_fields'] = $searchableFields;
$query['_ast'] = $searchQuery->dump();
$query['_paths'] = $pathsToFilter;
$query['_richFields'] = $collectFields;
$query['query'] = json_encode($params);
// $query['_paths'] = $pathsToFilter;
// $query['_richFields'] = $collectFields;
$queryyy = $recordQuery;
// $queryyy = $params['body'];
$query['query'] = $queryyy;
$query['query_as_string'] = json_encode($queryyy);
return new SearchEngineResult($results, json_encode($query), $res['took'], $offset,
$res['hits']['total'], $res['hits']['total'], null, null, $suggestions, [],
@@ -572,36 +534,6 @@ class ElasticSearchEngine implements SearchEngineInterface
return $res;
}
/**
* @param array|string $fields
* @param array|null $locales
* @param null $currentLocale
* @return array
*/
public function expendToAnalyzedFieldsNames($fields, $locales = null, $currentLocale = null)
{
$fieldsExpended = [];
if (!$locales) {
$locales = $this->locales;
}
foreach ((array) $fields as $field) {
foreach ($locales as $locale) {
$boost = "";
if ($locale === $currentLocale) {
$boost = "^5";
}
$fieldsExpended[] = sprintf('%s.%s%s', $field, $locale, $boost);
}
$fieldsExpended[] = sprintf('%s.%s', $field, 'light^10');
}
return $fieldsExpended;
}
private function getFlagsKey(\appbox $appbox)
{
$flags = [];

View File

@@ -20,13 +20,16 @@ use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use media_subdef;
class RecordIndexer
{
const TYPE_NAME = 'record';
private $helper;
private $thesaurus;
/**
* @var \appbox
*/
@@ -42,10 +45,9 @@ class RecordIndexer
*/
private $locales;
private $dataStructure;
public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
public function __construct(RecordHelper $helper, Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox, array $locales)
{
$this->helper = $helper;
$this->thesaurus = $thesaurus;
$this->appbox = $appbox;
$this->elasticSearchEngine = $elasticSearchEngine;
@@ -54,11 +56,8 @@ class RecordIndexer
public function populateIndex(BulkOperation $bulk)
{
// Helper to fetch record related data
$recordHelper = new RecordHelper($this->appbox);
foreach ($this->appbox->get_databoxes() as $databox) {
$fetcher = new RecordFetcher($databox, $recordHelper);
$fetcher = new RecordFetcher($databox, $this->helper);
$fetcher->setBatchSize(200);
while ($records = $fetcher->fetch()) {
foreach ($records as $record) {
@@ -74,9 +73,7 @@ class RecordIndexer
public function indexSingleRecord(\record_adapter $record_adapter, $indexName)
{
// Helper to fetch record related data
$recordHelper = new RecordHelper($this->appbox);
$fetcher = new RecordFetcher($record_adapter->get_databox(), $recordHelper);
$fetcher = new RecordFetcher($record_adapter->get_databox(), $this->helper);
$record = $fetcher->fetchOne($record_adapter);
$params = array();
@@ -140,13 +137,13 @@ class RecordIndexer
->add('record_id', 'integer') // Compound primary key
->add('databox_id', 'integer') // Compound primary key
->add('base_id', 'integer') // Unique collection ID
->add('collection_id', 'integer') // Useless collection ID (local to databox)
->add('collection_name', 'string')->notAnalyzed() // Collection name
->add('uuid', 'string')->notAnalyzed()
->add('sha256', 'string')->notAnalyzed()
->add('collection_id', 'integer')->notIndexed() // Useless collection ID (local to databox)
->add('collection_name', 'string')->notIndexed() // Collection name
->add('uuid', 'string')->notIndexed()
->add('sha256', 'string')->notIndexed()
// Mandatory metadata
->add('original_name', 'string')->notAnalyzed()
->add('mime', 'string')->notAnalyzed()
->add('original_name', 'string')->notIndexed()
->add('mime', 'string')->notIndexed()
->add('type', 'string')->notAnalyzed()
->add('record_type', 'string')->notAnalyzed() // record or story
// Dates
@@ -157,29 +154,15 @@ class RecordIndexer
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
->addRawVersion()
// EXIF
->add('exif', $this->getExifMapping())
// Status
->add('flags', $this->getFlagsMapping())
// Keep some fields arround for display purpose
->add('subdefs', Mapping::disabledMapping())
->add('title', Mapping::disabledMapping())
;
// Index title
$titleMapping = new Mapping();
$titleMapping->add('default', 'string')->notAnalyzed()->notIndexed();
foreach ($this->locales as $locale) {
$titleMapping->add($locale, 'string')->notAnalyzed()->notIndexed();
}
$mapping->add('title', $titleMapping);
// Minimal subdefs mapping info for display purpose
$subdefMapping = new Mapping();
$subdefMapping->add('path', 'string')->notAnalyzed()->notIndexed();
$subdefMapping->add('height', 'integer')->notIndexed();
$subdefMapping->add('width', 'integer')->notIndexed();
$subdefsMapping = new Mapping();
$subdefsMapping->add('thumbnail', $subdefMapping);
$subdefsMapping->add('thumbnailgif', $subdefMapping);
$subdefsMapping->add('preview', $subdefMapping);
$mapping->add('subdefs', $subdefsMapping);
// Caption mapping
$captionMapping = new Mapping();
$mapping->add('caption', $captionMapping);
@@ -194,9 +177,9 @@ class RecordIndexer
}
if ($params['type'] === Mapping::TYPE_STRING) {
if (!$params['indexable'] && !$params['to_aggregate']) {
if (!$params['searchable'] && !$params['to_aggregate']) {
$m->notIndexed();
} elseif (!$params['indexable'] && $params['to_aggregate']) {
} elseif (!$params['searchable'] && $params['to_aggregate']) {
$m->notAnalyzed();
$m->addRawVersion();
} else {
@@ -206,89 +189,13 @@ class RecordIndexer
}
}
// EXIF
$mapping->add('exif', $this->getExifMapping());
// Status
$mapping->add('flags', $this->getFlagsMapping());
return $mapping->export();
}
private function getFieldsStructure()
{
if (!empty($this->dataStructure)) {
return $this->dataStructure;
}
$fields = array();
foreach ($this->appbox->get_databoxes() as $databox) {
//printf("Databox %d\n", $databox->get_sbas_id());
foreach ($databox->get_meta_structure() as $fieldStructure) {
$field = array();
// Field type
switch ($fieldStructure->get_type()) {
case \databox_field::TYPE_DATE:
$field['type'] = 'date';
break;
case \databox_field::TYPE_NUMBER:
$field['type'] = 'double';
break;
case \databox_field::TYPE_STRING:
case \databox_field::TYPE_TEXT:
$field['type'] = 'string';
break;
default:
throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
break;
}
$name = $fieldStructure->get_name();
// Business rules
$field['private'] = $fieldStructure->isBusiness();
$field['indexable'] = $fieldStructure->is_indexable();
$field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
// Thesaurus concept inference
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
$helper = new ThesaurusHelper();
// TODO Not the real option yet
$field['thesaurus_concept_inference'] = $field['type'] === 'string';
// TODO Find thesaurus path prefixes
$field['thesaurus_prefix'] = '/categories';
//printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);
// Since mapping is merged between databoxes, two fields may
// have conflicting names. Indexing is the same for a given
// type so we reject only those with different types.
if (isset($fields[$name])) {
if ($fields[$name]['type'] !== $field['type']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
}
if ($fields[$name]['indexable'] !== $field['indexable']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible indexable state", $name));
}
if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
}
// TODO other structure incompatibilities
//printf("Merged with previous \"%s\" field\n", $name);
}
$fields[$name] = $field;
}
}
$this->dataStructure = $fields;
return $this->dataStructure;
return $this->helper->getFieldsStructure();
}
// @todo Add call to addAnalyzedVersion ?
@@ -374,7 +281,7 @@ class RecordIndexer
}
}
$record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
// $record['concept_paths'] = $this->findLinkedConcepts($structure, $record);
return $record;
}

View File

@@ -43,37 +43,26 @@ class TermIndexer
/** @var databox $databox */
$databoxId = $databox->get_sbas_id();
$document = self::thesaurusFromDatabox($databox);
$dedicatedFieldTerms = $this->getDedicatedFieldTerms($databox, $document);
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
// Path and id are prefixed with a databox identifier to not
// collide with other databoxes terms
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId, $dedicatedFieldTerms) {
//printf("- %s (%s)\n", $term['path'], $term['value']);
// Term structure
$id = $term['id'];
$id = sprintf('%s_%s', $databoxId, $term['id']);
unset($term['id']);
$term['path'] = sprintf('/%s%s', $databoxId, $term['path']);
$term['databox_id'] = $databoxId;
$term['branch_id'] = $id;
// @todo move to the TermVisitor? dunno.
$term['fields'] = null;
foreach ($dedicatedFieldTerms as $partialId => $fields) {
if (strpos($id, $partialId) === 0) {
foreach ($fields as $field) {
$term['fields'][] = $field;
}
}
}
// Index request
$params = array();
$params['id'] = sprintf('%s_%s', $databoxId, $id);
$params['id'] = $id;
$params['type'] = self::TYPE_NAME;
$params['body'] = $term;
$bulk->index($params);
});
$document = self::thesaurusFromDatabox($databox);
$this->navigator->walk($document, $visitor);
}
}
@@ -88,38 +77,20 @@ class TermIndexer
return $dom;
}
private function getDedicatedFieldTerms(databox $databox, DOMDocument $document)
{
$xpath = new \DOMXpath($document);
$dedicatedFieldTerms = [];
foreach ($databox->get_meta_structure() as $f) {
if ($f->get_tbranch()) {
$elements = $xpath->query($f->get_tbranch());
if ($elements) {
foreach ($elements as $element) {
$dedicatedFieldTerms[$element->getAttribute('id')][] = $f->get_name();
}
}
}
}
return $dedicatedFieldTerms;
}
public function getMapping()
{
$mapping = new Mapping();
$mapping
->add('raw_value', 'string')->notAnalyzed()
->add('value', 'string')->addAnalyzedVersion($this->locales)
->add('context', 'string')->addAnalyzedVersion($this->locales)
->add('value', 'string')
->analyzer('general_light')
->addLocalizedSubfields($this->locales)
->add('context', 'string')
->analyzer('general_light')
->addLocalizedSubfields($this->locales)
->add('path', 'string')->notAnalyzed()
->add('lang', 'string')->notAnalyzed()
->add('branch_id', 'string')->notAnalyzed()
->add('databox_id', 'integer')
->add('fields', 'string')->notAnalyzed()
;
return $mapping->export();

View File

@@ -18,6 +18,7 @@ class Mapping
{
private $fields = array();
private $current;
private $enabled = true;
const DATE_FORMAT_MYSQL = 'yyyy-MM-dd HH:mm:ss';
const DATE_FORMAT_CAPTION = 'yyyy/MM/dd'; // ES format
@@ -54,7 +55,7 @@ class Mapping
$field = array();
if ($type instanceof self) {
$field['type'] = self::TYPE_OBJECT;
$field['properties'] = $type;
$field['mapping'] = $type;
}
elseif (in_array($type, self::$types)) {
$field['type'] = $type;
@@ -74,20 +75,19 @@ class Mapping
public function export()
{
return ['properties' => $this->exportProperties()];
}
public function exportProperties()
{
$properties = array();
$mapping = array();
foreach ($this->fields as $name => $field) {
$properties[$name] = $field;
if ($field['type'] === self::TYPE_OBJECT) {
$properties[$name]['properties'] = $field['properties']->exportProperties();
$field = $field['mapping']->export();
}
$mapping['properties'][$name] = $field;
}
return $properties;
if (!$this->enabled) {
$mapping['enabled'] = false;
}
return $mapping;
}
public function analyzer($analyzer, $type = null)
@@ -134,6 +134,23 @@ class Mapping
return $this;
}
/**
 * Creates a fresh mapping that is already flagged as disabled.
 *
 * @return self
 */
public static function disabledMapping()
{
    $mapping = new self();

    return $mapping->disable();
}
/**
* Disables parsing and indexing of a named object completely.
*
* This is handy when a portion of the JSON document contains arbitrary JSON
* which should not be indexed, nor added to the mapping.
*
* Only flips the internal "enabled" flag of this mapping to false.
*
* @return $this The same mapping instance, to allow call chaining
*/
private function disable()
{
$this->enabled = false;
return $this;
}
public function addRawVersion()
{
$field = &$this->currentField();
@@ -146,22 +163,30 @@ class Mapping
return $this;
}
public function addAnalyzedVersion(array $langs)
/**
* @deprecated
*/
public function addAnalyzedVersion(array $locales)
{
$field = &$this->currentField();
foreach ($langs as $lang) {
$field['fields'][$lang] = [
'type' => $field['type'],
'analyzer' => sprintf('%s_full', $lang)
];
}
$field['fields']['light'] = [
'type' => $field['type'],
'analyzer' => 'general_light'
];
return $this->addLocalizedSubfields($locales);
}
/**
 * Adds one sub-field per locale to the current field.
 *
 * Every sub-field is named after its locale, reuses the type of the
 * current field and is analyzed with the "<locale>_full" analyzer.
 *
 * @param array $locales Locale codes to create sub-fields for
 *
 * @return $this
 */
public function addLocalizedSubfields(array $locales)
{
    $current = &$this->currentField();

    foreach ($locales as $code) {
        $current['fields'][$code] = array(
            'type'     => $current['type'],
            'analyzer' => sprintf('%s_full', $code),
        );
    }

    return $this;
}

View File

@@ -11,18 +11,21 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use appbox;
use igorw;
class RecordHelper
{
private $connection;
private $appbox;
// Computation caches
private $collectionMap;
private $fieldStructure;
public function __construct(appbox $appbox)
{
$this->connection = $appbox->get_connection();
$this->appbox = $appbox;
}
public function getUniqueRecordId($databoxId, $recordId)
@@ -46,12 +49,13 @@ class RecordHelper
private function collectionMap()
{
if (!$this->collectionMap) {
$connection = $this->appbox->get_connection();
$sql = 'SELECT
sbas_id as databox_id,
server_coll_id as collection_id,
base_id
FROM bas';
$statement = $this->connection->query($sql);
$statement = $connection->query($sql);
$map = array();
while ($mapping = $statement->fetch()) {
@@ -68,4 +72,97 @@ class RecordHelper
{
return StringUtils::slugify($key, '_');
}
/**
 * Lists the names of the fields described by getFieldsStructure().
 *
 * @param bool $includePrivate When false, fields flagged 'private' are left out
 * @param bool $onlySearchable When true, fields not flagged 'searchable' are left out
 *
 * @return array Field names, in structure order
 */
public function getFields($includePrivate = false, $onlySearchable = true)
{
    $names = array();

    foreach ($this->getFieldsStructure() as $name => $options) {
        // Private fields are only listed on explicit request
        $visible = $includePrivate || !$options['private'];

        // Non-searchable fields are only listed when the filter is off
        if ($visible && (!$onlySearchable || $options['searchable'])) {
            $names[] = $name;
        }
    }

    return $names;
}
/**
 * Returns the field structure merged across all databoxes, computing it on
 * first use and serving the cached copy afterwards.
 *
 * Each entry is keyed by field name and holds the options 'type' ('date',
 * 'double' or 'string'), 'private', 'searchable', 'to_aggregate',
 * 'thesaurus_concept_inference' and 'thesaurus_prefix'.
 *
 * NOTE(review): Exception and MergeException are referenced unqualified and
 * no matching `use` statement is visible in this file; confirm that they
 * resolve within the current namespace.
 *
 * @return array Field options indexed by field name
 *
 * @throws Exception      When a field type is not date, number, string or text
 * @throws MergeException When two databoxes declare the same field name with a
 *                        different type, searchable state or to_aggregate state
 */
public function getFieldsStructure()
{
    // The cache lives in the declared private $fieldStructure property.
    // Writing to a differently spelled name would create a public dynamic
    // property and leave the declared one unused.
    if (!empty($this->fieldStructure)) {
        return $this->fieldStructure;
    }

    $fields = array();

    foreach ($this->appbox->get_databoxes() as $databox) {
        //printf("Databox %d\n", $databox->get_sbas_id());
        foreach ($databox->get_meta_structure() as $fieldStructure) {
            $field = array();
            // Field type
            switch ($fieldStructure->get_type()) {
                case \databox_field::TYPE_DATE:
                    $field['type'] = 'date';
                    break;
                case \databox_field::TYPE_NUMBER:
                    $field['type'] = 'double';
                    break;
                case \databox_field::TYPE_STRING:
                case \databox_field::TYPE_TEXT:
                    $field['type'] = 'string';
                    break;
                default:
                    throw new Exception(sprintf('Invalid field type "%s", expected "date", "number" or "string".', $fieldStructure->get_type()));
            }

            $name = $fieldStructure->get_name();

            // Business rules
            $field['private'] = $fieldStructure->isBusiness();
            $field['searchable'] = $fieldStructure->is_indexable();
            $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();

            // Thesaurus concept inference
            // $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
            // NOTE(review): $helper is never read below; kept as-is because
            // the constructor's side effects are not visible from here.
            $helper = new ThesaurusHelper();
            // TODO Not the real option yet
            $field['thesaurus_concept_inference'] = $field['type'] === 'string';
            // TODO Find thesaurus path prefixes
            $field['thesaurus_prefix'] = '/categories';

            //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);

            // Since mapping is merged between databoxes, two fields may
            // have conflicting names. Fields sharing a name must agree on
            // type, searchable state and to_aggregate state.
            if (isset($fields[$name])) {
                if ($fields[$name]['type'] !== $field['type']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible types (%s vs %s)", $name, $fields[$name]['type'], $field['type']));
                }

                if ($fields[$name]['searchable'] !== $field['searchable']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible searchable state", $name));
                }

                if ($fields[$name]['to_aggregate'] !== $field['to_aggregate']) {
                    throw new MergeException(sprintf("Field %s can't be merged, incompatible to_aggregate state", $name));
                }
                // TODO other structure incompatibilities

                //printf("Merged with previous \"%s\" field\n", $name);
            }

            $fields[$name] = $field;
        }
    }

    $this->fieldStructure = $fields;

    return $this->fieldStructure;
}
}

View File

@@ -18,19 +18,14 @@ class Query
$this->root = $root;
}
/*
* This method seems weird to me, the implementation returns true when the
* query doesn't contain IN statements, but that doesn't define a full text
* search.
*/
public function isFullTextOnly()
public function getTextNodes()
{
return $this->root->isFullTextOnly();
return $this->root->getTextNodes();
}
public function getElasticsearchQuery($fields = array())
public function build(QueryContext $context)
{
return $this->root->getQuery($fields);
return $this->root->buildQuery($context);
}
public function dump()

View File

@@ -0,0 +1,40 @@
<?php
namespace Alchemy\Phrasea\SearchEngine\Elastic\Search;
/**
 * Carries the set of fields and locales a search query is allowed to target
 * while it is being converted to an Elasticsearch query.
 */
class QueryContext
{
    /** @var array Names of the fields the query may search in */
    private $fields;

    /** @var array Locales for which a localized sub-field is targeted */
    private $locales;

    /** @var mixed Locale of the query; its sub-fields receive a boost */
    private $queryLocale;

    /**
     * @param array $fields      Names of the fields the query may search in
     * @param array $locales     Locales to expand each field into
     * @param mixed $queryLocale Locale whose sub-fields are boosted
     */
    public function __construct(array $fields, array $locales, $queryLocale)
    {
        $this->queryLocale = $queryLocale;
        $this->locales = $locales;
        $this->fields = $fields;
    }

    /**
     * Derives a context restricted to the given fields.
     *
     * Only fields already present in this context are kept, so narrowing
     * can never widen the original restriction.
     *
     * @param array $fields Field names to restrict the context to
     *
     * @return static A new context; this one is left untouched
     */
    public function narrowToFields(array $fields)
    {
        $allowed = array_intersect($this->fields, $fields);

        return new static($allowed, $this->locales, $this->queryLocale);
    }

    /**
     * Expands every field into its caption sub-field names.
     *
     * For each field, one entry per locale is produced (the query locale
     * gets a "^5" boost), followed by the "light" sub-field with a "^10"
     * boost.
     *
     * @return array List of field names with optional boost suffixes
     */
    public function getLocalizedFields()
    {
        $localized = array();

        foreach ($this->fields as $name) {
            foreach ($this->locales as $locale) {
                if ($locale === $this->queryLocale) {
                    $suffix = '^5';
                } else {
                    $suffix = '';
                }
                $localized[] = sprintf('caption.%s.%s%s', $name, $locale, $suffix);
            }
            // TODO Put generic analyzers on main field instead of "light" sub-field
            $localized[] = sprintf('caption.%s.%s', $name, 'light^10');
        }

        return $localized;
    }
}

View File

@@ -119,7 +119,7 @@ class QueryVisitor implements Visit
if ($root instanceof AST\TextNode &&
!$root instanceof AST\QuotedTextNode &&
!$node instanceof AST\QuotedTextNode) {
$root = new AST\TextNode(sprintf('%s %s', $root->getText(), $node->getText()));
$root = new AST\TextNode(sprintf('%s %s', $root->getValue(), $node->getValue()));
} else {
$root = new AST\AndExpression($root, $node);
}

View File

@@ -12,6 +12,9 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Concept;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Term;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermInterface;
use Elasticsearch\Client;
class Thesaurus
@@ -19,34 +22,78 @@ class Thesaurus
private $client;
private $index;
const MIN_SCORE = 6;
public function __construct(Client $client, $index)
{
$this->client = $client;
$this->index = $index;
}
public function findConcepts($term, $context = null, $lang = null)
/**
 * Finds concepts for several terms.
 *
 * Each term is looked up individually through findConcepts(); the result
 * list holds one entry per input term, in input order.
 *
 * @param array       $terms Terms to look up (anything findConcepts() accepts)
 * @param string|null $lang  Optional language passed through to findConcepts()
 * @return array List of findConcepts() results, re-indexed from 0
 */
public function findConceptsBulk(array $terms, $lang = null)
{
    // TODO Use bulk queries for performance
    $thesaurus = $this;
    $lookup = function ($term) use ($thesaurus, $lang) {
        return $thesaurus->findConcepts($term, $lang);
    };

    // array_map() keeps the input keys, the original loop did not
    return array_values(array_map($lookup, $terms));
}
public function findConcepts($term, $lang = null)
{
if (!($term instanceof TermInterface)) {
$term = new Term($term);
}
// TODO Check that term queries are ok with multiple words
$query = array();
$query['term']['value'] = $term;
$field = $lang ? sprintf('value.%s', $lang) : 'value.light';
$query['match'][$field]['query'] = $term->getValue();
$query['match'][$field]['operator'] = 'and';
// Allow 25% of non-matching tokens
// (not exactly the same as 75% of matching tokens)
// $query['match'][$field]['minimum_should_match'] = '-25%';
if ($context) {
if ($term->hasContext()) {
$term_query = $query;
$query = array();
$query['bool']['must'][0] = $term_query;
$query['bool']['must'][1]['term']['context'] = $context;
$query['bool']['must'][1]['term']['context'] = $term->getContext();
}
if ($lang) {
$term_query = $query;
$query = array();
$query['filtered']['query'] = $term_query;
$query['filtered']['filter']['term']['lang'] = $lang;
}
// TODO Only search in a specific databox
// $term_query = $query;
// $query = array();
// $query['filtered']['query'] = $term_query;
// $query['filtered']['filter']['term']['databox_id'] = $databox_id;
// Path deduplication
$aggs = array();
$aggs['dedup']['terms']['field'] = 'path';
// Search request
$params = array();
$params['index'] = $this->index;
$params['type'] = TermIndexer::TYPE_NAME;
$params['body']['query'] = $query;
$params['body']['aggs'] = $aggs;
// Arbitrary score low limit, we need to find a more granular way to remove
// inexact concepts.
// We also need to disable TF/IDF on terms, and try to boost score only
// when the search match nearly all tokens of term's value field.
$params['body']['min_score'] = self::MIN_SCORE;
// No need to get any hits since we extract data from aggs
$params['body']['size'] = 0;
$response = $this->client->search($params);
// Extract concept paths from response
@@ -54,7 +101,7 @@ class Thesaurus
$buckets = \igorw\get_in($response, ['aggregations', 'dedup', 'buckets'], []);
foreach ($buckets as $bucket) {
if (isset($bucket['key'])) {
$concepts[] = $bucket['key'];
$concepts[] = new Concept($bucket['key']);
}
}

View File

@@ -0,0 +1,69 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
/**
 * A thesaurus concept, identified by its slash-separated path.
 */
class Concept
{
    private $path;

    /**
     * @param string $path Concept path (cast to string)
     */
    public function __construct($path)
    {
        $this->path = (string) $path;
    }

    /**
     * @return string The concept path
     */
    public function getPath()
    {
        return $this->path;
    }

    /**
     * Tells whether this concept is a descendant of another one.
     *
     * @param Concept $other Candidate ancestor
     * @return bool True when this path starts with the other path followed by "/"
     */
    public function isNarrowerThan(Concept $other)
    {
        // A concept is the child of another if it begins with the other
        return 0 === strpos($this->getPath(), $other->getPath() . '/');
    }

    /**
     * @return string The concept path
     */
    public function __toString()
    {
        return $this->path;
    }

    /**
     * Converts a list of concepts to the list of their paths, keeping keys.
     *
     * @param Concept[] $concepts
     * @return string[]
     */
    public static function toPathArray(array $concepts)
    {
        foreach ($concepts as $index => $concept) {
            $concepts[$index] = $concept->getPath();
        }

        return $concepts;
    }

    /**
     * Removes every concept that is narrower than another concept of the set.
     *
     * The result is a map of path => Concept, sorted by path, that only holds
     * the broadest concepts. Duplicate paths are collapsed.
     *
     * @param Concept[] $concepts
     * @return Concept[] Broad concepts indexed by path
     */
    public static function pruneNarrowConcepts($concepts)
    {
        if (!$concepts) {
            return array();
        }

        // Build a map with paths as keys, sorted to keep a stable output order
        $concepts = array_combine(Concept::toPathArray($concepts), $concepts);
        ksort($concepts);

        // A concept is pruned as soon as one of its ancestors is in the set.
        // Comparing only with the previously kept concept is not enough:
        // siblings such as "/animal-house" sort between "/animal" and
        // "/animal/dog" because "-" is lower than "/".
        $broad = array();
        foreach ($concepts as $path => $concept) {
            if (!self::hasAncestorIn($concept->getPath(), $concepts)) {
                $broad[$path] = $concept;
            }
        }

        return $broad;
    }

    /**
     * Checks whether any ancestor of a path is a key of the given map.
     *
     * Ancestors are the prefixes of the path that end right before a "/",
     * which mirrors the rule used by isNarrowerThan().
     *
     * @param string $path
     * @param array  $conceptsByPath Map indexed by concept path
     * @return bool
     */
    private static function hasAncestorIn($path, array $conceptsByPath)
    {
        $offset = 0;
        while (false !== ($position = strpos($path, '/', $offset))) {
            if (isset($conceptsByPath[substr($path, 0, $position)])) {
                return true;
            }
            $offset = $position + 1;
        }

        return false;
    }
}

View File

@@ -0,0 +1,57 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\AST\TextNode;
/**
 * Plain thesaurus term: a textual value plus an optional context.
 */
class Term implements TermInterface
{
    private $value;
    private $context;

    /**
     * @param string      $value   Term value (cast to string)
     * @param string|null $context Optional context; any falsy value means "no context"
     */
    public function __construct($value, $context = null)
    {
        $this->value = (string) $value;
        $this->context = $context ? (string) $context : null;
    }

    /**
     * @return string
     */
    public function getValue()
    {
        return $this->value;
    }

    /**
     * @return bool True when a context was given at construction
     */
    public function hasContext()
    {
        return null !== $this->context;
    }

    /**
     * @return string|null The context, or null when there is none
     */
    public function getContext()
    {
        return $this->context;
    }

    /**
     * @return string Same representation as dump()
     */
    public function __toString()
    {
        return self::dump($this);
    }

    /**
     * Formats a term as "value (context)", or just "value" without context.
     *
     * @param TermInterface $term
     * @return string
     */
    public static function dump(TermInterface $term)
    {
        if (!$term->hasContext()) {
            return $term->getValue();
        }

        return sprintf('%s (%s)', $term->getValue(), $term->getContext());
    }
}

View File

@@ -0,0 +1,19 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
/**
 * Contract for a thesaurus term: a value with an optional context.
 */
interface TermInterface
{
/**
 * Returns the term's textual value.
 *
 * @return string (the Term implementation always returns a string)
 */
public function getValue();
/**
 * Tells whether the term carries a context.
 *
 * @return bool
 */
public function hasContext();
/**
 * Returns the term's context.
 *
 * @return string|null Null when there is no context (as done by Term);
 *                     check hasContext() before relying on the value
 */
public function getContext();
}