Thesaurus terms indexing

This commit is contained in:
Mathieu Darse
2014-09-16 16:13:17 +02:00
parent 84d09f1b54
commit c3e2c25787
4 changed files with 56 additions and 13 deletions

View File

@@ -202,6 +202,16 @@ class Indexer
'type' => 'custom', 'type' => 'custom',
'tokenizer' => 'icu_tokenizer', 'tokenizer' => 'icu_tokenizer',
'filter' => ['nfkc_normalizer', 'asciifolding'] 'filter' => ['nfkc_normalizer', 'asciifolding']
],
// Thesaurus specific
'thesaurus_path' => [
'type' => 'custom',
'tokenizer' => 'thesaurus_path'
]
],
'tokenizer' => [
'thesaurus_path' => [
'type' => 'path_hierarchy'
] ]
], ],
'filter' => [ 'filter' => [

View File

@@ -31,30 +31,33 @@ class TermIndexer
*/ */
private $appbox; private $appbox;
private $navigator;
public function __construct(\appbox $appbox) public function __construct(\appbox $appbox)
{ {
$this->appbox = $appbox; $this->appbox = $appbox;
$this->navigator = new Navigator();
} }
public function populateIndex(BulkOperation $bulk) public function populateIndex(BulkOperation $bulk)
{ {
// TODO Create object to query thesaurus for term paths/synonyms
$navigator = new Navigator();
foreach ($this->appbox->get_databoxes() as $databox) { foreach ($this->appbox->get_databoxes() as $databox) {
$databoxId = $databox->get_sbas_id();
$document = self::thesaurusFromDatabox($databox); $document = self::thesaurusFromDatabox($databox);
$visitor = new TermVisitor(function ($term) use ($bulk) { $visitor = new TermVisitor(function ($term) use ($bulk, $databoxId) {
printf("- %s (%s)\n", $term['path'], $term['value']); printf("- %s (%s)\n", $term['path'], $term['value']);
}); // Term structure
$navigator->walk($document, $visitor); $id = $term['id'];
unset($term['id']);
while ($record = false) { $term['databox_id'] = $databoxId;
// Index request
$params = array(); $params = array();
$params['id'] = $record['id']; $params['id'] = $id;
$params['body'] = $record; $params['type'] = self::TYPE_NAME;
$params['body'] = $term;
$bulk->index($params); $bulk->index($params);
} });
$this->navigator->walk($document, $visitor);
} }
} }
@@ -72,9 +75,12 @@ class TermIndexer
{ {
$mapping = new Mapping(); $mapping = new Mapping();
$mapping $mapping
->add('raw_value', 'string')->notAnalyzed()
->add('value', 'string') ->add('value', 'string')
->add('context', 'string') ->add('context', 'string')
->add('path', 'string') ->add('path', 'string')
->analyzer('thesaurus_path', 'indexing')
->analyzer('keyword', 'searching')
->add('lang', 'string')->notAnalyzed() ->add('lang', 'string')->notAnalyzed()
->add('databox_id', 'integer') ->add('databox_id', 'integer')
; ;

View File

@@ -90,6 +90,31 @@ class Mapping
return $properties; return $properties;
} }
/**
 * Configures the analyzer(s) of the field returned by currentField()
 * (presumably the field most recently added to this mapping; confirm
 * against currentField()).
 *
 * @param string      $analyzer Name of the analyzer to apply.
 * @param string|null $type     Which phase the analyzer applies to:
 *                              - null: stored under 'analyzer'; any
 *                                phase-specific analyzers set earlier on the
 *                                field are dropped,
 *                              - 'indexing': stored under 'index_analyzer',
 *                              - 'searching': stored under 'search_analyzer'.
 *
 * @return $this Fluent interface.
 *
 * @throws LogicException When the current field is not a string field, or
 *                        when $type is not one of the values listed above.
 */
public function analyzer($analyzer, $type = null)
{
    $field = &$this->currentField();
    if ($field['type'] !== self::TYPE_STRING) {
        throw new LogicException('Only string fields can be analyzed');
    }

    // Strict comparisons on purpose: a switch compares loosely, which let
    // '' / 0 / false pass as null and true pass as 'indexing' instead of
    // being rejected as invalid analyzer types.
    if ($type === null) {
        $field['analyzer'] = $analyzer;
        // A single analyzer supersedes previously set phase-specific ones.
        unset($field['index_analyzer'], $field['search_analyzer']);
    } elseif ($type === 'indexing') {
        $field['index_analyzer'] = $analyzer;
    } elseif ($type === 'searching') {
        $field['search_analyzer'] = $analyzer;
    } else {
        throw new LogicException(sprintf('Invalid analyzer type "%s".', $type));
    }

    // Setting an analyzer implies the field is analyzed.
    $field['index'] = 'analyzed';

    return $this;
}
public function notAnalyzed() public function notAnalyzed()
{ {
$field = &$this->currentField(); $field = &$this->currentField();

View File

@@ -24,6 +24,7 @@ use DOMNode;
class TermVisitor implements VisitorInterface class TermVisitor implements VisitorInterface
{ {
const TERM_TAG_NAME = 'sy'; const TERM_TAG_NAME = 'sy';
const TERM_ID_ATTR = 'id';
const TERM_LANG_ATTR = 'lng'; const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v'; const TERM_VALUE_ATTR = 'v';
// So, this is a huuuge regex to match a group of words eventually followed // So, this is a huuuge regex to match a group of words eventually followed
@@ -54,7 +55,8 @@ class TermVisitor implements VisitorInterface
$term = $this->parseTermValue($value); $term = $this->parseTermValue($value);
$term += [ $term += [
'path' => $this->getCurrentPathAsString(), 'path' => $this->getCurrentPathAsString(),
'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR) 'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR),
'id' => $this->getTermAttribute($element, self::TERM_ID_ATTR)
]; ];
call_user_func($this->termCallback, $term); call_user_func($this->termCallback, $term);