Thesaurus flattening using hierarchical visitor

This commit is contained in:
Mathieu Darse
2014-09-15 19:11:45 +02:00
parent eadf0f2396
commit e34e4ce255
5 changed files with 250 additions and 7 deletions

View File

@@ -0,0 +1,16 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Exception;
/**
 * Exception raised for problems found in thesaurus data.
 *
 * In this change set it is thrown by the thesaurus TermVisitor when a concept
 * element has no term from which a path segment can be derived.
 *
 * NOTE(review): no `use Exception;` statement is visible in this file, so the
 * unqualified parent name resolves to the `Exception` type declared in this
 * namespace rather than PHP's global `\Exception` - confirm that type exists.
 */
class ThesaurusException extends Exception
{
}

View File

@@ -14,6 +14,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation; use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine; use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping; use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Navigator;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermVisitor;
use Closure; use Closure;
use Elasticsearch\Client; use Elasticsearch\Client;
use databox; use databox;
@@ -31,20 +33,22 @@ class TermIndexer
public function __construct(\appbox $appbox) public function __construct(\appbox $appbox)
{ {
//$this->document = self::thesaurusFromDatabox($databox);
$this->appbox = $appbox; $this->appbox = $appbox;
} }
public function populateIndex(BulkOperation $bulk) public function populateIndex(BulkOperation $bulk)
{ {
// Helper to fetch record related data // TODO Create object to query thesaurus for term paths/synonyms
//$recordHelper = new RecordHelper($this->appbox);
$navigator = new Navigator();
foreach ($this->appbox->get_databoxes() as $databox) { foreach ($this->appbox->get_databoxes() as $databox) {
// TODO Create object to query thesaurus for term paths/synonyms $document = self::thesaurusFromDatabox($databox);
// TODO Extract record indexing logic in a RecordIndexer class $visitor = new TermVisitor(function ($term) use ($bulk) {
//$fetcher = new RecordFetcher($databox, $recordHelper); printf("- %s (%s)\n", $term['path'], $term['value']);
//$fetcher->setBatchSize(200); });
$navigator->walk($document, $visitor);
while ($record = false) { while ($record = false) {
$params = array(); $params = array();
$params['id'] = $record['id']; $params['id'] = $record['id'];

View File

@@ -0,0 +1,57 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
use Closure;
use Elasticsearch\Client;
use databox;
use DOMDocument;
use DOMElement;
use DOMNode;
/**
 * Depth-first walker over a thesaurus DOM tree.
 *
 * The navigator owns the traversal order and delegates every decision about
 * what to do with a node to a VisitorInterface implementation:
 *
 *  - a concept element (<te>) triggers visitConcept(), then its children are
 *    walked, then leaveConcept() is called;
 *  - a term element (<sy>) triggers visitTerm() and is not descended into;
 *  - any other node is transparent: only its children are walked.
 */
class Navigator
{
    const THESAURUS_TAG_NAME = 'thesaurus';
    const CONCEPT_TAG_NAME = 'te';
    const TERM_TAG_NAME = 'sy';

    /**
     * Walks $node and all of its descendants, notifying $visitor on the way.
     *
     * Exceptions thrown by the visitor are not caught here; they abort the
     * walk and propagate to the caller.
     *
     * @param DOMNode          $node    Root of the (sub)tree to traverse
     * @param VisitorInterface $visitor Receiver of concept/term notifications
     */
    public function walk(DOMNode $node, VisitorInterface $visitor)
    {
        if ($this->isConcept($node)) {
            $visitor->visitConcept($node);
            $this->walkChildren($node, $visitor);
            $visitor->leaveConcept($node);
        } elseif ($this->isTerm($node)) {
            $visitor->visitTerm($node);
        } else {
            $this->walkChildren($node, $visitor);
        }
    }

    /**
     * Walks every direct child of $node, in document order.
     *
     * @param DOMNode          $node
     * @param VisitorInterface $visitor
     */
    private function walkChildren(DOMNode $node, VisitorInterface $visitor)
    {
        // Leaf nodes (text, comments, ...) have nothing to descend into. On
        // some PHP versions their childNodes property is null rather than an
        // empty list, and iterating it would raise a warning, so bail out
        // before touching it.
        if (!$node->hasChildNodes()) {
            return;
        }

        foreach ($node->childNodes as $child) {
            $this->walk($child, $visitor);
        }
    }

    /**
     * Tells whether $node is a concept element (<te>).
     *
     * @param  DOMNode $node
     * @return bool
     */
    private function isConcept(DOMNode $node)
    {
        return $node instanceof DOMElement && $node->tagName === self::CONCEPT_TAG_NAME;
    }

    /**
     * Tells whether $node is a term element (<sy>).
     *
     * @param  DOMNode $node
     * @return bool
     */
    private function isTerm(DOMNode $node)
    {
        return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME;
    }
}

View File

@@ -0,0 +1,145 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
use Closure;
use Elasticsearch\Client;
use DOMDocument;
use DOMNodeList;
use DOMElement;
use DOMNode;
/**
 * Visitor that flattens a thesaurus hierarchy into a stream of terms.
 *
 * While concepts are entered and left, the visitor maintains the stack of
 * path segments leading to the current position. For each term element it
 * meets, it hands an array to the callback given at construction time, with
 * the following keys (in this order):
 *
 *  - raw_value: the untouched content of the value attribute
 *  - value:     the leading group of words extracted from raw_value, or null
 *  - context:   the parenthesised group following the value, or null
 *  - path:      the current concept path, e.g. "/segment/sub-segment"
 *  - lang:      the content of the language attribute, or null when absent
 */
class TermVisitor implements VisitorInterface
{
    const TERM_TAG_NAME = 'sy';
    const TERM_LANG_ATTR = 'lng';
    const TERM_VALUE_ATTR = 'v';

    // Matches a group of words, optionally followed by a second group of
    // words wrapped in parentheses. Surrounding whitespace is left out of
    // both captures.
    //
    //   capture 1: the term itself
    //   capture 2: the context (text between the parentheses), if any
    const TERM_REGEX = '/^\s*(\w[^\(\)]*\w|\w)\s*(?:\(\s*([^\(\)]*[^\s\(\)])\s*\))?/u';

    // Language whose term is preferred when naming a concept path segment.
    const PATH_LANG = 'en';

    /** @var string[] Path segments from the outermost concept to the current one */
    private $path = [];

    /** @var Closure Receives one array per visited term */
    private $termCallback;

    /**
     * @param Closure $termCallback Invoked with the term array for every visited term
     */
    public function __construct(Closure $termCallback)
    {
        $this->termCallback = $termCallback;
    }

    /**
     * Entering a concept: its segment becomes the tail of the current path.
     *
     * @param  DOMElement $element
     * @throws ThesaurusException When the concept has no term child to name it
     */
    public function visitConcept(DOMElement $element)
    {
        $segment = $this->getConceptPathSegment($element);
        $this->path[] = $segment;
    }

    /**
     * Builds the term array for $element and forwards it to the callback.
     *
     * @param DOMElement $element
     */
    public function visitTerm(DOMElement $element)
    {
        $term = $this->parseTermValue($this->getTermValue($element));
        $term['path'] = $this->getCurrentPathAsString();
        $term['lang'] = $this->getTermAttribute($element, self::TERM_LANG_ATTR);

        call_user_func($this->termCallback, $term);
    }

    /**
     * Leaving a concept: its segment is dropped from the current path.
     *
     * @param DOMElement $element
     */
    public function leaveConcept(DOMElement $element)
    {
        array_pop($this->path);
    }

    /**
     * Splits a raw term value into its word part and its optional context.
     *
     * @param  string|null $value
     * @return array Keys "raw_value", "value" and "context"; the last two are
     *               null when the corresponding capture did not match
     */
    private function parseTermValue($value)
    {
        $captures = [];
        preg_match(self::TERM_REGEX, $value, $captures);

        $words = null;
        if (isset($captures[1])) {
            $words = $captures[1];
        }

        $context = null;
        if (isset($captures[2])) {
            $context = $captures[2];
        }

        return [
            'raw_value' => $value,
            'value'     => $words,
            'context'   => $context
        ];
    }

    /**
     * @return string Path segments joined by "/" with a leading "/"
     */
    private function getCurrentPathAsString()
    {
        return '/' . implode('/', $this->path);
    }

    /**
     * Computes the path segment naming a concept.
     *
     * The first child term in the PATH_LANG language is preferred; without
     * one, the very first child term is used. The chosen term value is passed
     * through StringUtils::slugify().
     *
     * @param  DOMElement $element Concept element
     * @return mixed Whatever StringUtils::slugify() returns for the term value
     * @throws ThesaurusException When the concept has no term child at all
     */
    private function getConceptPathSegment(DOMElement $element)
    {
        $candidates = $this->filter($element->childNodes, [$this, 'isTerm']);
        $chosen = $this->find($candidates, [$this, 'isPathLang']);

        if (!$chosen) {
            if (!isset($candidates[0])) {
                throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
            }
            $chosen = $candidates[0];
        }

        $label = $this->getTermValue($chosen);

        return StringUtils::slugify($label);
    }

    /**
     * @param  DOMNode $node
     * @return bool True when $node is a term element
     */
    private function isTerm(DOMNode $node)
    {
        if (!($node instanceof DOMElement)) {
            return false;
        }

        return $node->tagName === self::TERM_TAG_NAME;
    }

    /**
     * @param  DOMElement $element
     * @return bool True when the element's language attribute equals PATH_LANG
     */
    private function isPathLang(DOMElement $element)
    {
        return self::PATH_LANG === $element->getAttribute(self::TERM_LANG_ATTR);
    }

    /**
     * @param  DOMElement $term
     * @return string|null Content of the value attribute, null when absent
     */
    private function getTermValue(DOMElement $term)
    {
        return $this->getTermAttribute($term, self::TERM_VALUE_ATTR);
    }

    /**
     * @param  DOMElement $term
     * @param  string     $attribute Attribute name
     * @return string|null Attribute content, null when the attribute is absent
     */
    private function getTermAttribute(DOMElement $term, $attribute)
    {
        return $term->hasAttribute($attribute) ? $term->getAttribute($attribute) : null;
    }

    // DOM Helpers

    /**
     * Keeps the nodes of $list accepted by $callback, in their original
     * order, re-indexed from zero.
     *
     * @param  DOMNodeList $list
     * @param  callable    $callback Receives a node, returns a truthy value to keep it
     * @return DOMNode[]
     */
    private function filter(DOMNodeList $list, callable $callback)
    {
        $nodes = iterator_to_array($list);

        return array_values(array_filter($nodes, $callback));
    }

    /**
     * Returns the first entry of $list accepted by $callback.
     *
     * @param  array    $list
     * @param  callable $callback Receives an entry, returns a truthy value to select it
     * @return mixed|null The selected entry, null when none is accepted
     */
    private function find(array $list, callable $callback)
    {
        foreach ($list as $candidate) {
            if (!call_user_func($callback, $candidate)) {
                continue;
            }

            return $candidate;
        }

        return null;
    }
}

View File

@@ -0,0 +1,21 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2014 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use DOMElement;
/**
 * Contract for objects notified while a thesaurus DOM tree is traversed.
 *
 * The thesaurus Navigator drives the traversal: for a concept element it calls
 * visitConcept(), walks the concept's children, then calls leaveConcept();
 * for a term element it calls visitTerm().
 */
interface VisitorInterface
{
    /**
     * Called when a concept element is entered, before its children are walked.
     *
     * @param DOMElement $element The concept element
     */
    public function visitConcept(DOMElement $element);

    /**
     * Called for each term element met during the traversal.
     *
     * @param DOMElement $element The term element
     */
    public function visitTerm(DOMElement $element);

    /**
     * Called when a concept element is left, after its children were walked.
     *
     * @param DOMElement $element The concept element
     */
    public function leaveConcept(DOMElement $element);
}