mirror of
https://github.com/alchemy-fr/Phraseanet.git
synced 2025-10-18 07:23:13 +00:00
Thesaurus flattening using hierarchical visitor
This commit is contained in:
@@ -0,0 +1,16 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Phraseanet
|
||||||
|
*
|
||||||
|
* (c) 2005-2014 Alchemy
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Alchemy\Phrasea\SearchEngine\Elastic\Exception;
|
||||||
|
|
||||||
|
/**
 * Exception raised by the thesaurus handling code.
 *
 * In the visible code it is thrown by TermVisitor when a concept element has
 * no term attached to it.
 *
 * NOTE(review): no `use` statement is visible here, so `Exception` presumably
 * resolves to the base exception class of this namespace rather than to the
 * global \Exception. Confirm against the rest of the namespace.
 */
class ThesaurusException extends Exception
{
}
|
@@ -14,6 +14,8 @@ namespace Alchemy\Phrasea\SearchEngine\Elastic\Indexer;
|
|||||||
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||||
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
||||||
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
|
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Navigator;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\TermVisitor;
|
||||||
use Closure;
|
use Closure;
|
||||||
use Elasticsearch\Client;
|
use Elasticsearch\Client;
|
||||||
use databox;
|
use databox;
|
||||||
@@ -31,20 +33,22 @@ class TermIndexer
|
|||||||
|
|
||||||
public function __construct(\appbox $appbox)
|
public function __construct(\appbox $appbox)
|
||||||
{
|
{
|
||||||
//$this->document = self::thesaurusFromDatabox($databox);
|
|
||||||
$this->appbox = $appbox;
|
$this->appbox = $appbox;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function populateIndex(BulkOperation $bulk)
|
public function populateIndex(BulkOperation $bulk)
|
||||||
{
|
{
|
||||||
// Helper to fetch record related data
|
// TODO Create object to query thesaurus for term paths/synonyms
|
||||||
//$recordHelper = new RecordHelper($this->appbox);
|
|
||||||
|
$navigator = new Navigator();
|
||||||
|
|
||||||
foreach ($this->appbox->get_databoxes() as $databox) {
|
foreach ($this->appbox->get_databoxes() as $databox) {
|
||||||
// TODO Create object to query thesaurus for term paths/synonyms
|
$document = self::thesaurusFromDatabox($databox);
|
||||||
// TODO Extract record indexing logic in a RecordIndexer class
|
$visitor = new TermVisitor(function ($term) use ($bulk) {
|
||||||
//$fetcher = new RecordFetcher($databox, $recordHelper);
|
printf("- %s (%s)\n", $term['path'], $term['value']);
|
||||||
//$fetcher->setBatchSize(200);
|
});
|
||||||
|
$navigator->walk($document, $visitor);
|
||||||
|
|
||||||
while ($record = false) {
|
while ($record = false) {
|
||||||
$params = array();
|
$params = array();
|
||||||
$params['id'] = $record['id'];
|
$params['id'] = $record['id'];
|
||||||
|
@@ -0,0 +1,57 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Phraseanet
|
||||||
|
*
|
||||||
|
* (c) 2005-2014 Alchemy
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||||
|
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\ElasticSearchEngine;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\Mapping;
|
||||||
|
use Closure;
|
||||||
|
use Elasticsearch\Client;
|
||||||
|
use databox;
|
||||||
|
|
||||||
|
use DOMDocument;
|
||||||
|
use DOMElement;
|
||||||
|
use DOMNode;
|
||||||
|
|
||||||
|
/**
 * Walks a thesaurus DOM tree depth-first and notifies a visitor about the
 * concept and term elements it encounters.
 */
class Navigator
{
    /** Tag name declared for the thesaurus element (not used by the methods of this class). */
    const THESAURUS_TAG_NAME = 'thesaurus';
    /** Tag name identifying a concept element. */
    const CONCEPT_TAG_NAME = 'te';
    /** Tag name identifying a term element. */
    const TERM_TAG_NAME = 'sy';

    /**
     * Recursively traverses $node and its descendants.
     *
     * - A term element triggers visitTerm() and is not descended into.
     * - A concept element triggers visitConcept() before its children are
     *   walked and leaveConcept() once they have all been walked.
     * - Any other node is descended into without notifying the visitor.
     *
     * @param DOMNode          $node    Node to start the traversal from
     * @param VisitorInterface $visitor Visitor receiving the notifications
     */
    public function walk(DOMNode $node, VisitorInterface $visitor)
    {
        // Terms are leaves from the navigator's point of view.
        if ($this->isTerm($node)) {
            $visitor->visitTerm($node);

            return;
        }

        $insideConcept = $this->isConcept($node);

        if ($insideConcept) {
            $visitor->visitConcept($node);
        }

        foreach ($node->childNodes as $childNode) {
            $this->walk($childNode, $visitor);
        }

        if ($insideConcept) {
            $visitor->leaveConcept($node);
        }
    }

    /**
     * Tells whether the node is an element carrying the concept tag name.
     *
     * @param  DOMNode $node
     * @return bool
     */
    private function isConcept(DOMNode $node)
    {
        if (!$node instanceof DOMElement) {
            return false;
        }

        return self::CONCEPT_TAG_NAME === $node->tagName;
    }

    /**
     * Tells whether the node is an element carrying the term tag name.
     *
     * @param  DOMNode $node
     * @return bool
     */
    private function isTerm(DOMNode $node)
    {
        if (!$node instanceof DOMElement) {
            return false;
        }

        return self::TERM_TAG_NAME === $node->tagName;
    }
}
|
@@ -0,0 +1,145 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Phraseanet
|
||||||
|
*
|
||||||
|
* (c) 2005-2014 Alchemy
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||||
|
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\BulkOperation;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
|
||||||
|
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
|
||||||
|
use Closure;
|
||||||
|
use Elasticsearch\Client;
|
||||||
|
use DOMDocument;
|
||||||
|
use DOMNodeList;
|
||||||
|
use DOMElement;
|
||||||
|
use DOMNode;
|
||||||
|
|
||||||
|
/**
 * Visitor that flattens a thesaurus tree into a stream of terms.
 *
 * While the tree is traversed, the visitor keeps track of the path of the
 * concepts currently being visited and hands every term it meets to a
 * callback, as an array with the keys "raw_value", "value", "context",
 * "path" and "lang".
 */
class TermVisitor implements VisitorInterface
{
    const TERM_TAG_NAME = 'sy';
    const TERM_LANG_ATTR = 'lng';
    const TERM_VALUE_ATTR = 'v';
    // Matches a group of words optionally followed by another group of words
    // in parentheses. The first capture is the term, the second (when
    // present) is the context; surrounding spaces are left out of both.
    const TERM_REGEX = '/^\s*(\w[^\(\)]*\w|\w)\s*(?:\(\s*([^\(\)]*[^\s\(\)])\s*\))?/u';
    //                      [_____term______]         (   [_____context_____]   )

    // Language whose term is preferred when naming a path segment.
    const PATH_LANG = 'en';

    /** @var array Path segments of the concepts currently being traversed */
    private $path = [];
    /** @var Closure Invoked once per visited term */
    private $termCallback;

    /**
     * @param Closure $termCallback Receives the array describing each visited term
     */
    public function __construct(Closure $termCallback)
    {
        $this->termCallback = $termCallback;
    }

    /**
     * Enters a concept: appends its segment to the current path.
     *
     * @param DOMElement $element Concept element
     *
     * @throws ThesaurusException When the concept has no term child
     */
    public function visitConcept(DOMElement $element)
    {
        $this->path[] = $this->getConceptPathSegment($element);
    }

    /**
     * Builds the description of a term and passes it to the term callback.
     *
     * @param DOMElement $element Term element
     */
    public function visitTerm(DOMElement $element)
    {
        $rawValue = $this->getTermValue($element);

        $term = $this->parseTermValue($rawValue);
        $term['path'] = $this->getCurrentPathAsString();
        $term['lang'] = $this->getTermAttribute($element, self::TERM_LANG_ATTR);

        call_user_func($this->termCallback, $term);
    }

    /**
     * Leaves a concept: drops the last segment of the current path.
     *
     * @param DOMElement $element Concept element
     */
    public function leaveConcept(DOMElement $element)
    {
        array_pop($this->path);
    }

    /**
     * Splits a raw term value into its term and context parts.
     *
     * @param  string|null $value Raw value of the term
     * @return array       Keys "raw_value", "value" and "context"; the last
     *                     two are null when the pattern did not capture them
     */
    private function parseTermValue($value)
    {
        $captures = [];
        preg_match(self::TERM_REGEX, $value, $captures);

        $termText = null;
        if (isset($captures[1])) {
            $termText = $captures[1];
        }

        $contextText = null;
        if (isset($captures[2])) {
            $contextText = $captures[2];
        }

        return [
            'raw_value' => $value,
            'value'     => $termText,
            'context'   => $contextText
        ];
    }

    /**
     * Renders the current path as a "/"-prefixed, "/"-separated string.
     *
     * @return string
     */
    private function getCurrentPathAsString()
    {
        return '/' . implode('/', $this->path);
    }

    /**
     * Computes the path segment naming a concept.
     *
     * The segment is derived from the first term child whose language is
     * PATH_LANG, falling back to the first term child of the concept.
     *
     * @param  DOMElement $element Concept element
     * @return mixed      Result of StringUtils::slugify() on the term value
     *
     * @throws ThesaurusException When the concept has no term child
     */
    private function getConceptPathSegment(DOMElement $element)
    {
        $terms = $this->filter($element->childNodes, [$this, 'isTerm']);
        $namingTerm = $this->find($terms, [$this, 'isPathLang']);

        if (!$namingTerm) {
            if (!isset($terms[0])) {
                throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
            }

            $namingTerm = $terms[0];
        }

        return StringUtils::slugify($this->getTermValue($namingTerm));
    }

    /**
     * Tells whether the node is an element carrying the term tag name.
     *
     * @param  DOMNode $node
     * @return bool
     */
    private function isTerm(DOMNode $node)
    {
        if (!$node instanceof DOMElement) {
            return false;
        }

        return self::TERM_TAG_NAME === $node->tagName;
    }

    /**
     * Tells whether the element's language attribute equals PATH_LANG.
     *
     * @param  DOMElement $element
     * @return bool
     */
    private function isPathLang(DOMElement $element)
    {
        return self::PATH_LANG === $element->getAttribute(self::TERM_LANG_ATTR);
    }

    /**
     * Reads the value attribute of a term element.
     *
     * @param  DOMElement  $term
     * @return string|null Null when the attribute is absent
     */
    private function getTermValue(DOMElement $term)
    {
        return $this->getTermAttribute($term, self::TERM_VALUE_ATTR);
    }

    /**
     * Reads an attribute of a term element.
     *
     * @param  DOMElement  $term
     * @param  string      $attribute Attribute name
     * @return string|null Null when the attribute is absent
     */
    private function getTermAttribute(DOMElement $term, $attribute)
    {
        return $term->hasAttribute($attribute) ? $term->getAttribute($attribute) : null;
    }

    // DOM Helpers

    /**
     * Collects, in order, the nodes of the list accepted by the callback.
     *
     * @param  DOMNodeList $list
     * @param  callable    $callback Receives a node, returns a truthy value to keep it
     * @return array       Zero-indexed list of the accepted nodes
     */
    private function filter(DOMNodeList $list, callable $callback)
    {
        $accepted = [];

        foreach ($list as $candidate) {
            if (!$callback($candidate)) {
                continue;
            }

            $accepted[] = $candidate;
        }

        return $accepted;
    }

    /**
     * Returns the first node of the list accepted by the callback.
     *
     * @param  array    $list
     * @param  callable $callback Receives a node, returns a truthy value to select it
     * @return mixed    The selected node, or null when none is accepted
     */
    private function find(array $list, callable $callback)
    {
        foreach ($list as $candidate) {
            if ($callback($candidate)) {
                return $candidate;
            }
        }

        return null;
    }
}
|
@@ -0,0 +1,21 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This file is part of Phraseanet
|
||||||
|
*
|
||||||
|
* (c) 2005-2014 Alchemy
|
||||||
|
*
|
||||||
|
* For the full copyright and license information, please view the LICENSE
|
||||||
|
* file that was distributed with this source code.
|
||||||
|
*/
|
||||||
|
|
||||||
|
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
|
||||||
|
|
||||||
|
use DOMElement;
|
||||||
|
|
||||||
|
/**
 * Contract for objects notified while a thesaurus tree is being walked.
 *
 * Navigator::walk() calls visitConcept() when it enters a concept element,
 * leaveConcept() once all children of that concept have been walked, and
 * visitTerm() for every term element.
 */
interface VisitorInterface
{
    /**
     * Called when a concept element is entered, before its children are walked.
     *
     * @param DOMElement $element Concept element
     */
    public function visitConcept(DOMElement $element);

    /**
     * Called for each term element.
     *
     * @param DOMElement $element Term element
     */
    public function visitTerm(DOMElement $element);

    /**
     * Called when a concept element is left, after its children were walked.
     *
     * @param DOMElement $element Concept element
     */
    public function leaveConcept(DOMElement $element);
}
|
Reference in New Issue
Block a user