Thesaurus terms indexing

This commit is contained in:
Mathieu Darse
2014-09-16 16:13:17 +02:00
parent 84d09f1b54
commit c3e2c25787
4 changed files with 56 additions and 13 deletions

View File

@@ -202,6 +202,16 @@ class Indexer
'type' => 'custom',
'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'asciifolding']
],
// Thesaurus specific
'thesaurus_path' => [
'type' => 'custom',
'tokenizer' => 'thesaurus_path'
]
],
'tokenizer' => [
'thesaurus_path' => [
'type' => 'path_hierarchy'
]
],
'filter' => [

View File

@@ -31,30 +31,33 @@ class TermIndexer
*/
private $appbox;
private $navigator;
/**
 * Creates the Navigator used to walk thesaurus documents and keeps the
 * given application box for later use.
 *
 * @param \appbox $appbox Application box whose databoxes are read by populateIndex()
 */
public function __construct(\appbox $appbox)
{
    $this->navigator = new Navigator();
    $this->appbox = $appbox;
}
/**
 * Feeds the given bulk operation with index requests for thesaurus terms.
 *
 * For each databox returned by the appbox, the thesaurus document is obtained
 * through self::thesaurusFromDatabox() and walked with a TermVisitor; the
 * visitor callback receives a term, builds the request parameters ('id',
 * 'type', 'body') and hands them to $bulk->index().
 *
 * NOTE(review): this span comes from a rendered commit diff and carries both
 * the removed and the added lines of the hunk (for instance the two
 * consecutive `$visitor = new TermVisitor(...)` lines and the
 * `while ($record = false)` loop). As shown it is not a runnable method body;
 * confirm against the actual file before relying on it.
 *
 * @param BulkOperation $bulk Receives one index() call per term passed to the callback
 */
public function populateIndex(BulkOperation $bulk)
{
// TODO Create object to query thesaurus for term paths/synonyms
$navigator = new Navigator();
foreach ($this->appbox->get_databoxes() as $databox) {
$databoxId = $databox->get_sbas_id();
$document = self::thesaurusFromDatabox($databox);
$visitor = new TermVisitor(function ($term) use ($bulk) {
$visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
printf("- %s (%s)\n", $term['path'], $term['value']);
});
$navigator->walk($document, $visitor);
while ($record = false) {
// Term structure
$id = $term['id'];
unset($term['id']);
$term['databox_id'] = $databoxId;
// Index request
$params = array();
$params['id'] = $record['id'];
$params['body'] = $record;
$params['id'] = $id;
$params['type'] = self::TYPE_NAME;
$params['body'] = $term;
$bulk->index($params);
}
});
$this->navigator->walk($document, $visitor);
}
}
@@ -72,9 +75,12 @@ class TermIndexer
{
$mapping = new Mapping();
$mapping
->add('raw_value', 'string')->notAnalyzed()
->add('value', 'string')
->add('context', 'string')
->add('path', 'string')
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
->add('lang', 'string')->notAnalyzed()
->add('databox_id', 'integer')
;

View File

@@ -90,6 +90,31 @@ class Mapping
return $properties;
}
/**
 * Sets the analyzer of the current field.
 *
 * - With no $type (null), the analyzer is stored under 'analyzer' and any
 *   'index_analyzer' / 'search_analyzer' entry of the field is removed.
 * - With 'indexing', only 'index_analyzer' is set.
 * - With 'searching', only 'search_analyzer' is set.
 *
 * In every accepted case the field's 'index' entry is set to 'analyzed'.
 *
 * @param string      $analyzer Analyzer name, stored as given
 * @param string|null $type     null, 'indexing' or 'searching'
 *
 * @return $this
 *
 * @throws LogicException When the current field is not a string field, or
 *                        when $type is not one of the accepted values
 */
public function analyzer($analyzer, $type = null)
{
    $field = &$this->currentField();
    if ($field['type'] !== self::TYPE_STRING) {
        throw new LogicException('Only string fields can be analyzed');
    }

    // Strict comparisons on purpose: a switch compares loosely, which lets
    // '', 0 and false pass as null and lets true match 'indexing', so a
    // mistyped $type would silently configure the wrong analyzer slot.
    if ($type === null) {
        $field['analyzer'] = $analyzer;
        unset($field['index_analyzer'], $field['search_analyzer']);
    } elseif ($type === 'indexing') {
        $field['index_analyzer'] = $analyzer;
    } elseif ($type === 'searching') {
        $field['search_analyzer'] = $analyzer;
    } else {
        // Non-scalar values cannot be formatted with %s; report their type instead.
        throw new LogicException(sprintf(
            'Invalid analyzer type "%s".',
            is_scalar($type) ? $type : gettype($type)
        ));
    }

    $field['index'] = 'analyzed';

    return $this;
}
public function notAnalyzed()
{
$field = &$this->currentField();

View File

@@ -24,6 +24,7 @@ use DOMNode;
class TermVisitor implements VisitorInterface
{
const TERM_TAG_NAME = 'sy';
const TERM_ID_ATTR = 'id';
const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v';
// So, this is a huge regex to match a group of words possibly followed
@@ -54,7 +55,8 @@ class TermVisitor implements VisitorInterface
$term = $this->parseTermValue($value);
$term += [
'path' => $this->getCurrentPathAsString(),
'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR)
'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR),
'id' => $this->getTermAttribute($element, self::TERM_ID_ATTR)
];
call_user_func($this->termCallback, $term);