WIP Thesaurus terms linking

This commit is contained in:
Mathieu Darse
2014-09-22 10:17:53 +02:00
parent c3e2c25787
commit bc22dfcd6d
6 changed files with 102 additions and 7 deletions

View File

@@ -18,6 +18,7 @@ use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\RecordIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Indexer\TermIndexer;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngine;
use Alchemy\Phrasea\SearchEngine\Phrasea\PhraseaEngineSubscriber;
use Elasticsearch\Client;
@@ -82,6 +83,7 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
$app['elasticsearch.indexer.record_indexer'] = $app->share(function ($app) {
return new RecordIndexer(
$app['elasticsearch.thesaurus'],
$app['elasticsearch.engine'],
$app['phraseanet.appbox']
);
@@ -108,6 +110,13 @@ class SearchEngineServiceProvider implements ServiceProviderInterface
return array_replace($defaults, $options);
});
$app['elasticsearch.thesaurus'] = $app->share(function ($app) {
return new Thesaurus(
$app['elasticsearch.client'],
$app['elasticsearch.options']['index']
);
});
}
public function boot(Application $app)

View File

@@ -95,8 +95,12 @@ class Indexer
$bulk->setAutoFlushLimit(1000);
$this->termIndexer->populateIndex($bulk);
// Record indexing depends on indexed terms so we need to flush
// between the two operations
$bulk->flush();
$this->recordIndexer->populateIndex($bulk);
// Final flush
$bulk->flush();
// Optimize index

View File

@@ -19,6 +19,8 @@ use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordFetcher;
use Alchemy\Phrasea\SearchEngine\Elastic\RecordHelper;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper as ThesaurusHelper;
use media_subdef;
class RecordIndexer
@@ -35,8 +37,9 @@ class RecordIndexer
*/
private $elasticSearchEngine;
public function __construct(ElasticSearchEngine $elasticSearchEngine, \appbox $appbox)
/**
 * Stores the collaborators handed to the record indexer.
 *
 * NOTE(review): no `$thesaurus` property declaration is visible in this
 * hunk; confirm one exists on the class, otherwise the first assignment
 * below creates a dynamic property.
 *
 * @param Thesaurus           $thesaurus
 * @param ElasticSearchEngine $elasticSearchEngine
 * @param \appbox             $appbox
 */
public function __construct(Thesaurus $thesaurus, ElasticSearchEngine $elasticSearchEngine, \appbox $appbox)
{
$this->thesaurus = $thesaurus;
$this->appbox = $appbox;
$this->elasticSearchEngine = $elasticSearchEngine;
}
@@ -47,12 +50,11 @@ class RecordIndexer
$recordHelper = new RecordHelper($this->appbox);
foreach ($this->appbox->get_databoxes() as $databox) {
// TODO Pass a BulkOperation object to TermIndexer to multiplex
// indexing queries between types
$fetcher = new RecordFetcher($databox, $recordHelper);
$fetcher->setBatchSize(200);
while ($records = $fetcher->fetch()) {
foreach ($records as $record) {
$record['concept_paths'] = $this->findLinkedConcepts($record);
$params = array();
$params['id'] = $record['id'];
$params['type'] = self::TYPE_NAME;
@@ -63,6 +65,11 @@ class RecordIndexer
}
}
/**
 * Placeholder for thesaurus concept inference.
 *
 * Currently ignores its argument and returns an empty list; the caller
 * stores the result under the record's 'concept_paths' key.
 *
 * @param array $record Record being indexed (unused for now)
 *
 * @return array Always an empty array
 */
private function findLinkedConcepts($record)
{
// TODO Not implemented yet: no concept lookup is performed
return [];
}
public function getMapping()
{
$mapping = new Mapping();
@@ -81,6 +88,10 @@ class RecordIndexer
// Dates
->add('created_on', 'date')->format(Mapping::DATE_FORMAT_MYSQL)
->add('updated_on', 'date')->format(Mapping::DATE_FORMAT_MYSQL)
// Inferred thesaurus concepts
->add('concept_paths', 'string')
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
;
// Caption mapping
@@ -149,6 +160,13 @@ class RecordIndexer
$field['indexable'] = $fieldStructure->is_indexable();
$field['to_aggregate'] = false; // @todo, dev in progress
// Thesaurus concept inference
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
$helper = new ThesaurusHelper();
// TODO Find thesaurus path prefixes
$field['thesaurus_concept_inference'] = true;
$field['thesaurus_prefix'] = '/categories';
$name = $fieldStructure->get_name();
printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);

View File

@@ -45,7 +45,7 @@ class TermIndexer
$databoxId = $databox->get_sbas_id();
$document = self::thesaurusFromDatabox($databox);
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
printf("- %s (%s)\n", $term['path'], $term['value']);
// printf("- %s (%s)\n", $term['path'], $term['value']);
// Term structure
$id = $term['id'];
unset($term['id']);
@@ -78,9 +78,7 @@ class TermIndexer
->add('raw_value', 'string')->notAnalyzed()
->add('value', 'string')
->add('context', 'string')
->add('path', 'string')
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
->add('path', 'string')->notAnalyzed()
->add('lang', 'string')->notAnalyzed()
->add('databox_id', 'integer')
;

View File

@@ -0,0 +1,31 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic;
use Elasticsearch\Client;
/**
 * Entry point for thesaurus lookups backed by an Elasticsearch index.
 *
 * Work in progress: it only stores its dependencies for now.
 */
class Thesaurus
{
    /** @var Client Elasticsearch client given at construction */
    private $client;

    /** @var mixed Index value given at construction */
    private $index;

    /**
     * @param Client $client Elasticsearch client
     * @param mixed  $index  Index value, kept as received
     */
    public function __construct(Client $client, $index)
    {
        $this->index = $index;
        $this->client = $client;
    }

    /**
     * Stub: performs no lookup yet and yields null for any input.
     *
     * @param mixed      $term    Unused for now
     * @param mixed|null $context Unused for now
     * @param mixed|null $lang    Unused for now
     *
     * @return null
     */
    public function findConceptPath($term, $context = null, $lang = null)
    {
        // TODO Not implemented yet
        return null;
    }
}

View File

@@ -0,0 +1,35 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMXPath;
use Elasticsearch\Client;
/**
 * Helper for querying a thesaurus XML document.
 */
class Helper
{
    /**
     * Collects the "id" attribute of every element matched by an XPath
     * expression.
     *
     * Matched nodes that are not elements (attributes, text, ...) and
     * elements without an "id" attribute are skipped.
     *
     * @param \DOMDocument $document Document to search
     * @param string       $xpath    XPath expression to evaluate, e.g.
     *                               "/thesaurus/te[@id='T26']"
     *
     * @return string[] Ids of the matched elements, in the order the query
     *                  returned them; empty when nothing matches or when
     *                  the expression cannot be evaluated
     */
    public function findNodesByXPath($document, $xpath)
    {
        // Keep the evaluator in its own variable so the caller's
        // expression in $xpath is not clobbered
        $evaluator = new \DOMXPath($document);
        $nodeList = $evaluator->query($xpath);

        $conceptIds = [];

        // query() yields false for a malformed expression; nothing to iterate
        if ($nodeList === false) {
            return $conceptIds;
        }

        foreach ($nodeList as $node) {
            // Only elements carry hasAttribute()/getAttribute()
            if ($node instanceof \DOMElement && $node->hasAttribute('id')) {
                $conceptIds[] = $node->getAttribute('id');
            }
        }

        return $conceptIds;
    }
}