mirror of
https://github.com/alchemy-fr/Phraseanet.git
synced 2025-10-18 15:33:15 +00:00
Thesaurus flattening using hierarchical visitor
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Exception;
|
||||
|
||||
/**
 * Exception type for thesaurus-related failures.
 *
 * The unqualified base class name resolves inside the current namespace, so
 * this extends the namespace-local `Exception` class rather than PHP's global
 * one (no `use` statement imports a different `Exception` in this file).
 */
class ThesaurusException extends Exception
{
    // Intentionally empty: the class only provides a distinct type to catch.
}
|
@@ -14,6 +14,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Navigator;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermVisitor;
|
||||
use Closure;
|
||||
use Elasticsearch\Client;
|
||||
use databox;
|
||||
@@ -31,20 +33,22 @@ class TermIndexer
|
||||
|
||||
/**
 * Keeps a reference to the application box for later use by the indexer.
 *
 * @param \appbox $appbox Application box stored on the instance.
 */
public function __construct(\appbox $appbox)
{
    $this->appbox = $appbox;
}
|
||||
|
||||
public function populateIndex(BulkOperation $bulk)
|
||||
{
|
||||
// Helper to fetch record related data
|
||||
//$recordHelper = new RecordHelper($this->appbox);
|
||||
// TODO Create object to query thesaurus for term paths/synonyms
|
||||
|
||||
$navigator = new Navigator();
|
||||
|
||||
foreach ($this->appbox->get_databoxes() as $databox) {
|
||||
// TODO Create object to query thesaurus for term paths/synonyms
|
||||
// TODO Extract record indexing logic in a RecordIndexer class
|
||||
//$fetcher = new RecordFetcher($databox, $recordHelper);
|
||||
//$fetcher->setBatchSize(200);
|
||||
$document = self::thesaurusFromDatabox($databox);
|
||||
$visitor = new TermVisitor(function ($term) use ($bulk) {
|
||||
printf("- %s (%s)\n", $term['path'], $term['value']);
|
||||
});
|
||||
$navigator->walk($document, $visitor);
|
||||
|
||||
while ($record = false) {
|
||||
$params = array();
|
||||
$params['id'] = $record['id'];
|
||||
|
@@ -0,0 +1,57 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
|
||||
use Closure;
|
||||
use Elasticsearch\Client;
|
||||
use databox;
|
||||
|
||||
use DOMDocument;
|
||||
use DOMElement;
|
||||
use DOMNode;
|
||||
|
||||
/**
 * Depth-first walker over a thesaurus DOM tree.
 *
 * Concept elements (`te`) are announced to the visitor before and after their
 * children are walked; term elements (`sy`) are announced once and are not
 * descended into. Any other node is transparent: only its children are walked.
 */
class Navigator
{
    const THESAURUS_TAG_NAME = 'thesaurus';
    const CONCEPT_TAG_NAME = 'te';
    const TERM_TAG_NAME = 'sy';

    /**
     * Walks the given node and its descendants, notifying the visitor.
     *
     * @param DOMNode          $node    Node to start from (document, element, ...).
     * @param VisitorInterface $visitor Receiver of concept/term notifications.
     */
    public function walk(DOMNode $node, VisitorInterface $visitor)
    {
        // Terms are leaves from the walker's point of view.
        if ($this->isTerm($node)) {
            $visitor->visitTerm($node);

            return;
        }

        $isConcept = $this->isConcept($node);

        if ($isConcept) {
            $visitor->visitConcept($node);
        }

        $this->walkChildren($node, $visitor);

        if ($isConcept) {
            $visitor->leaveConcept($node);
        }
    }

    /**
     * Walks every child of the given node, if it has any.
     *
     * @param DOMNode          $node
     * @param VisitorInterface $visitor
     */
    private function walkChildren(DOMNode $node, VisitorInterface $visitor)
    {
        // Leaf nodes (text, comments, ...) are reached through recursion. On
        // older PHP versions their childNodes property is null rather than an
        // empty list, which makes foreach emit a warning; guard explicitly.
        if (!$node->hasChildNodes()) {
            return;
        }

        foreach ($node->childNodes as $child) {
            $this->walk($child, $visitor);
        }
    }

    /**
     * Tells whether the node is a concept element.
     *
     * @param DOMNode $node
     *
     * @return bool
     */
    private function isConcept(DOMNode $node)
    {
        return $node instanceof DOMElement && $node->tagName === self::CONCEPT_TAG_NAME;
    }

    /**
     * Tells whether the node is a term element.
     *
     * @param DOMNode $node
     *
     * @return bool
     */
    private function isTerm(DOMNode $node)
    {
        return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME;
    }
}
|
@@ -0,0 +1,145 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
|
||||
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
|
||||
use Closure;
|
||||
use Elasticsearch\Client;
|
||||
use DOMDocument;
|
||||
use DOMNodeList;
|
||||
use DOMElement;
|
||||
use DOMNode;
|
||||
|
||||
/**
 * Visitor that flattens thesaurus concepts into individual terms.
 *
 * A stack of path segments is pushed/popped as concepts are entered and left.
 * Each visited term is parsed and handed to the callback supplied at
 * construction, as an array with the keys `raw_value`, `value`, `context`,
 * `path` and `lang`.
 */
class TermVisitor implements VisitorInterface
{
    const TERM_TAG_NAME = 'sy';
    const TERM_LANG_ATTR = 'lng';
    const TERM_VALUE_ATTR = 'v';
    // Matches a group of words, optionally followed by a second group of
    // words wrapped in parentheses. Leading/trailing spaces are kept out of
    // both captures (group 1 is the term, group 2 the context).
    const TERM_REGEX = '/^\s*(\w[^\(\)]*\w|\w)\s*(?:\(\s*([^\(\)]*[^\s\(\)])\s*\))?/u';

    // Language whose term is preferred when naming a concept path segment.
    const PATH_LANG = 'en';

    /** @var string[] Path segments of the concepts currently being visited. */
    private $path = [];

    /** @var Closure Receives one array per visited term. */
    private $termCallback;

    /**
     * @param Closure $termCallback Invoked with the term array for every visited term.
     */
    public function __construct(Closure $termCallback)
    {
        $this->termCallback = $termCallback;
    }

    /**
     * Enters a concept: appends its path segment to the current path.
     *
     * @param DOMElement $element Concept element.
     *
     * @throws ThesaurusException When the concept has no term child.
     */
    public function visitConcept(DOMElement $element)
    {
        $this->path[] = $this->getConceptPathSegment($element);
    }

    /**
     * Parses a term element and forwards it to the term callback.
     *
     * @param DOMElement $element Term element.
     */
    public function visitTerm(DOMElement $element)
    {
        $rawValue = $this->getTermValue($element);

        // Parsed keys come first, then path and lang, so key order is stable.
        $term = $this->parseTermValue($rawValue);
        $term['path'] = $this->getCurrentPathAsString();
        $term['lang'] = $this->getTermAttribute($element, self::TERM_LANG_ATTR);

        call_user_func($this->termCallback, $term);
    }

    /**
     * Leaves a concept: drops the last path segment.
     *
     * @param DOMElement $element Concept element (unused).
     */
    public function leaveConcept(DOMElement $element)
    {
        array_pop($this->path);
    }

    /**
     * Splits a raw term value into its term and optional context parts.
     *
     * @param string|null $value Raw value of the term.
     *
     * @return array Keys `raw_value`, `value` and `context`; the last two are
     *               null when the corresponding part did not match.
     */
    private function parseTermValue($value)
    {
        $parsed = [
            'raw_value' => $value,
            'value'     => null,
            'context'   => null
        ];

        if (preg_match(self::TERM_REGEX, $value, $groups)) {
            // Group 1 is mandatory in the pattern, so it exists on any match.
            $parsed['value'] = $groups[1];
            // Group 2 is only present when a parenthesized context matched.
            if (isset($groups[2])) {
                $parsed['context'] = $groups[2];
            }
        }

        return $parsed;
    }

    /**
     * @return string Current path segments joined by "/" with a leading "/".
     */
    private function getCurrentPathAsString()
    {
        return '/' . implode('/', $this->path);
    }

    /**
     * Computes the path segment naming a concept.
     *
     * The first term in the path language is preferred; otherwise the first
     * term of the concept is used. The chosen value goes through
     * StringUtils::slugify().
     *
     * @param DOMElement $element Concept element.
     *
     * @return mixed Result of StringUtils::slugify() for the chosen term value.
     *
     * @throws ThesaurusException When the concept has no term child.
     */
    private function getConceptPathSegment(DOMElement $element)
    {
        $candidates = $this->filter($element->childNodes, [$this, 'isTerm']);
        $chosen = $this->find($candidates, [$this, 'isPathLang']);

        if (!$chosen && isset($candidates[0])) {
            $chosen = $candidates[0];
        }

        if (!$chosen) {
            throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
        }

        return StringUtils::slugify($this->getTermValue($chosen));
    }

    /**
     * @param DOMNode $node
     *
     * @return bool Whether the node is a term element.
     */
    private function isTerm(DOMNode $node)
    {
        if (!$node instanceof DOMElement) {
            return false;
        }

        return $node->tagName === self::TERM_TAG_NAME;
    }

    /**
     * @param DOMElement $element
     *
     * @return bool Whether the element's language attribute equals the path language.
     */
    private function isPathLang(DOMElement $element)
    {
        return self::PATH_LANG === $element->getAttribute(self::TERM_LANG_ATTR);
    }

    /**
     * @param DOMElement $term
     *
     * @return string|null Value attribute of the term, or null when absent.
     */
    private function getTermValue(DOMElement $term)
    {
        return $this->getTermAttribute($term, self::TERM_VALUE_ATTR);
    }

    /**
     * @param DOMElement $term
     * @param string     $attribute Attribute name.
     *
     * @return string|null Attribute value, or null when the attribute is absent.
     */
    private function getTermAttribute(DOMElement $term, $attribute)
    {
        return $term->hasAttribute($attribute) ? $term->getAttribute($attribute) : null;
    }

    // DOM Helpers

    /**
     * Collects the nodes of a list accepted by the callback.
     *
     * @param DOMNodeList $list
     * @param callable    $callback Receives a node, returns a truthy value to keep it.
     *
     * @return array Zero-indexed list of accepted nodes, in document order.
     */
    private function filter(DOMNodeList $list, callable $callback)
    {
        $kept = [];
        foreach ($list as $candidate) {
            if (!$callback($candidate)) {
                continue;
            }
            $kept[] = $candidate;
        }

        return $kept;
    }

    /**
     * Returns the first item of the list accepted by the callback.
     *
     * @param array    $list
     * @param callable $callback Receives an item, returns a truthy value to select it.
     *
     * @return mixed|null The first accepted item, or null when none is accepted.
     */
    private function find(array $list, callable $callback)
    {
        foreach ($list as $candidate) {
            if ($callback($candidate)) {
                return $candidate;
            }
        }

        return null;
    }
}
|
@@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* This file is part of Phraseanet
|
||||
*
|
||||
* (c) 2005-2014 Alchemy
|
||||
*
|
||||
* For the full copyright and license information, please view the LICENSE
|
||||
* file that was distributed with this source code.
|
||||
*/
|
||||
|
||||
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||
|
||||
use DOMElement;
|
||||
|
||||
/**
 * Contract for objects notified while a thesaurus DOM tree is walked.
 */
interface VisitorInterface
{
    /**
     * Called when a concept element is entered, before its children are walked.
     *
     * @param DOMElement $element Concept element.
     */
    public function visitConcept(DOMElement $element);

    /**
     * Called for each term element encountered.
     *
     * @param DOMElement $element Term element.
     */
    public function visitTerm(DOMElement $element);

    /**
     * Called when a concept element is left, after its children were walked.
     *
     * @param DOMElement $element Concept element.
     */
    public function leaveConcept(DOMElement $element);
}
|
Reference in New Issue
Block a user