Thesaurus prefixes in field structure

Also fixes candidates collected from all string fields
This commit is contained in:
Mathieu Darse
2015-04-22 20:46:37 +02:00
parent f2082f5c51
commit c6075fcc1a
5 changed files with 106 additions and 102 deletions

View File

@@ -36,9 +36,9 @@ class ThesaurusHydrator implements HydratorInterface
// Fields with concept inference enabled // Fields with concept inference enabled
$structure = $this->helper->getFieldsStructure(); $structure = $this->helper->getFieldsStructure();
$fields = array(); $fields = array();
foreach ($structure as $field => $options) { foreach ($structure as $name => $options) {
if ($options['thesaurus_concept_inference']) { if ($options['thesaurus_concept_inference']) {
$fields[$field] = $options['thesaurus_prefix']; $fields[$name] = $options['thesaurus_prefixes'];
} }
} }
// Hydrate records with concepts // Hydrate records with concepts
@@ -54,12 +54,13 @@ class ThesaurusHydrator implements HydratorInterface
} }
$terms = array(); $terms = array();
$fieldMap = array(); $bulkFieldMap = array();
foreach ($fields as $field => $prefix) { foreach ($fields as $name => $prefixes) {
if (isset($record['caption'][$field])) { if (isset($record['caption'][$name])) {
foreach ($record['caption'][$field] as $value) { // Loop through all values to prepare bulk query
foreach ($record['caption'][$name] as $value) {
$terms[] = Term::parse($value); $terms[] = Term::parse($value);
$fieldMap[] = $field; $bulkFieldMap[] = $name;
} }
} }
} }
@@ -70,12 +71,12 @@ class ThesaurusHydrator implements HydratorInterface
foreach ($bulk as $offset => $item_concepts) { foreach ($bulk as $offset => $item_concepts) {
if ($item_concepts) { if ($item_concepts) {
$field = $fieldMap[$offset]; $name = $bulkFieldMap[$offset];
foreach ($item_concepts as $concept) { foreach ($item_concepts as $concept) {
$record['concept_path'][$field][] = $concept->getPath(); $record['concept_path'][$name][] = $concept->getPath();
} }
} else { } else {
$this->candidateTerms->insert($field, $value); $this->candidateTerms->insert($name, $value);
} }
} }
} }

View File

@@ -169,13 +169,14 @@ class RecordHelper
$field['to_aggregate'] = (bool) $fieldStructure->isAggregable(); $field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
// Thesaurus concept inference // Thesaurus concept inference
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; $xpath = $fieldStructure->get_tbranch();
$helper = new ThesaurusHelper(); if ($field['type'] === Mapping::TYPE_STRING && $xpath ==! '') {
$field['thesaurus_concept_inference'] = true;
// TODO Not the real option yet $field['thesaurus_prefixes'] = ThesaurusHelper::findPrefixesByXPath($databox, $xpath);
$field['thesaurus_concept_inference'] = $field['type'] === Mapping::TYPE_STRING; } else {
// TODO Find thesaurus path prefixes $field['thesaurus_concept_inference'] = false;
$field['thesaurus_prefix'] = '/categories'; $field['thesaurus_prefixes'] = null;
}
//printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']); //printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);

View File

@@ -11,27 +11,53 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use databox; use databox;
use DOMDocument; use DOMDocument;
use DOMElement; use DOMElement;
use DOMNode; use DOMNode;
use DOMNodeList;
use DOMXPath; use DOMXPath;
use Elasticsearch\Client;
class Helper class Helper
{ {
public function findNodesByXPath($document, $xpath) const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v';
const PATH_LANG = 'en';
public static function findPrefixesByXPath(databox $databox, $expression)
{ {
$tbranch = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']"; $document = self::thesaurusFromDatabox($databox);
$xpath = new \DOMXPath($document); $xpath = new DOMXPath($document);
$nodeList = $xpath->query($tbranch); $nodes = $xpath->query($expression);
$conceptIds = []; $prefixes = [];
foreach ($nodeList as $node) { foreach ($nodes as $node) {
if ($node->hasAttribute('id')) { $path_segments = [];
$conceptIds[] = $node->getAttribute('id'); $me_and_parents = [$node];
foreach (self::getElementAncestors($node) as $me_and_parents[]);
foreach ($me_and_parents as $node) {
if (Navigator::isConcept($node)) {
$path_segments[] = self::conceptPathSegment($node);
} else {
// Silently skips invalid targeted nodes
break;
} }
} }
$prefixes[] = sprintf('/%s', implode('/', array_reverse($path_segments)));
}
return $prefixes;
}
/**
 * Lists the ancestors of an element, nearest parent first.
 *
 * Follows parentNode links upward from the given element and collects every
 * parent encountered, without filtering by node type, until parentNode
 * evaluates to false.
 *
 * @param DOMElement $element Element whose chain of parents is collected
 * @return array Parent nodes ordered from the closest parent outward
 */
private static function getElementAncestors(DOMElement $element)
{
$parents = [];
// Assignment inside the condition: step one level up, stop when no parent is left
while ($element = $element->parentNode) {
$parents[] = $element;
}
return $parents;
} }
public static function thesaurusFromDatabox(databox $databox) public static function thesaurusFromDatabox(databox $databox)
@@ -64,4 +90,49 @@ class Helper
return $document; return $document;
} }
/**
 * Builds the path segment that names a concept element.
 *
 * The segment is derived from the value attribute (self::TERM_VALUE_ATTR) of
 * the first child term written in the path language (self::PATH_LANG). When
 * no term is in that language, the first child term is used instead.
 *
 * @param DOMElement $element Concept element whose child nodes are scanned for terms
 * @return mixed Result of StringUtils::slugify() applied to the chosen term value
 * @throws ThesaurusException When the concept element has no child term at all
 */
public static function conceptPathSegment(DOMElement $element)
{
    $terms = self::filter($element->childNodes, [Navigator::class, 'isTerm']);
    // Name the class explicitly: the 'self' string inside an array callable
    // is deprecated as of PHP 8.2.
    $term = self::find($terms, [self::class, 'isPathLang']);
    if (!$term) {
        if (!isset($terms[0])) {
            throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
        }
        // No term in the path language: fall back to the first term.
        $term = $terms[0];
    }

    return StringUtils::slugify($term->getAttribute(self::TERM_VALUE_ATTR));
}
/**
 * Tells whether a term element is written in the path language.
 *
 * @param DOMElement $element Term element to inspect
 * @return bool True when its language attribute strictly equals self::PATH_LANG
 */
private static function isPathLang(DOMElement $element)
{
    $lang = $element->getAttribute(self::TERM_LANG_ATTR);

    return self::PATH_LANG === $lang;
}
// DOM Helpers
/**
 * Keeps the nodes of a list for which the callback returns a truthy value.
 *
 * @param DOMNodeList $list     Nodes to examine, visited in list order
 * @param callable    $callback Invoked with each node as its only argument
 * @return array Accepted nodes, appended in visiting order
 */
private static function filter(DOMNodeList $list, callable $callback)
{
    $kept = [];
    foreach ($list as $candidate) {
        if (!call_user_func($callback, $candidate)) {
            continue;
        }
        $kept[] = $candidate;
    }

    return $kept;
}
/**
 * Returns the first entry of the list accepted by the callback.
 *
 * @param array    $list     Entries to examine, in iteration order
 * @param callable $callback Invoked with each entry as its only argument
 * @return mixed|null The first entry for which the callback is truthy, null when none is
 */
private static function find(array $list, callable $callback)
{
    foreach ($list as $candidate) {
        $accepted = call_user_func($callback, $candidate);
        if ($accepted) {
            return $candidate;
        }
    }

    return null;
}
} }

View File

@@ -45,12 +45,12 @@ class Navigator
} }
} }
private function isConcept(DOMNode $node) public static function isConcept(DOMNode $node)
{ {
return $node instanceof DOMElement && $node->tagName === self::CONCEPT_TAG_NAME; return $node instanceof DOMElement && $node->tagName === self::CONCEPT_TAG_NAME;
} }
private function isTerm(DOMNode $node) public static function isTerm(DOMNode $node)
{ {
return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME; return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME;
} }

View File

@@ -11,24 +11,17 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus; namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils; use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
use Closure; use Closure;
use Elasticsearch\Client; use Elasticsearch\Client;
use DOMDocument;
use DOMNodeList;
use DOMElement; use DOMElement;
use DOMNode;
class TermVisitor implements VisitorInterface class TermVisitor implements VisitorInterface
{ {
const TERM_TAG_NAME = 'sy';
const TERM_ID_ATTR = 'id'; const TERM_ID_ATTR = 'id';
const TERM_LANG_ATTR = 'lng'; const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v'; const TERM_VALUE_ATTR = 'v';
const PATH_LANG = 'en';
private $path = []; private $path = [];
private $termCallback; private $termCallback;
@@ -39,20 +32,20 @@ class TermVisitor implements VisitorInterface
public function visitConcept(DOMElement $element) public function visitConcept(DOMElement $element)
{ {
array_push($this->path, $this->getConceptPathSegment($element)); array_push($this->path, Helper::conceptPathSegment($element));
} }
public function visitTerm(DOMElement $element) public function visitTerm(DOMElement $element)
{ {
$raw_value = $this->getTermValue($element); $raw_value = $element->getAttribute(self::TERM_VALUE_ATTR);
$object = Term::parse($raw_value); $object = Term::parse($raw_value);
$term = [ $term = [
'raw_value' => $raw_value, 'raw_value' => $raw_value,
'value' => $object->getValue(), 'value' => $object->getValue(),
'context' => $object->getContext(), 'context' => $object->getContext(),
'path' => $this->getCurrentPathAsString(), 'path' => $this->getCurrentPathAsString(),
'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR), 'lang' => $element->getAttribute(self::TERM_LANG_ATTR),
'id' => $this->getTermAttribute($element, self::TERM_ID_ATTR) 'id' => $element->getAttribute(self::TERM_ID_ATTR)
]; ];
call_user_func($this->termCallback, $term); call_user_func($this->termCallback, $term);
@@ -67,66 +60,4 @@ class TermVisitor implements VisitorInterface
{ {
return sprintf('/%s', implode('/', $this->path)); return sprintf('/%s', implode('/', $this->path));
} }
/**
 * Computes the path segment for a concept element.
 *
 * Picks the first child term in the path language and falls back to the
 * first child term when none matches; the chosen term value is slugified.
 *
 * @param DOMElement $element Concept element whose child nodes are scanned for terms
 * @return mixed Result of StringUtils::slugify() applied to the chosen term value
 * @throws ThesaurusException When the concept element has no child term
 */
private function getConceptPathSegment(DOMElement $element)
{
    $terms = $this->filter($element->childNodes, array($this, 'isTerm'));
    $chosen = $this->find($terms, array($this, 'isPathLang'));
    if (!$chosen) {
        if (!isset($terms[0])) {
            throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
        }
        // Nothing in the path language: use the first term instead.
        $chosen = $terms[0];
    }

    return StringUtils::slugify($this->getTermValue($chosen));
}
/**
 * Tells whether a node is a term element.
 *
 * @param DOMNode $node Node to inspect
 * @return bool True when the node is a DOMElement whose tag name is self::TERM_TAG_NAME
 */
private function isTerm(DOMNode $node)
{
    if (!$node instanceof DOMElement) {
        return false;
    }

    return $node->tagName === self::TERM_TAG_NAME;
}
/**
 * Tells whether a term element is written in the path language.
 *
 * @param DOMElement $element Term element to inspect
 * @return bool True when its language attribute strictly equals self::PATH_LANG
 */
private function isPathLang(DOMElement $element)
{
    $lang = $element->getAttribute(self::TERM_LANG_ATTR);

    return self::PATH_LANG === $lang;
}
/**
 * Reads the value attribute of a term element.
 *
 * @param DOMElement $term Term element to read from
 * @return string|null Whatever getTermAttribute() yields for self::TERM_VALUE_ATTR
 */
private function getTermValue(DOMElement $term)
{
    $value = $this->getTermAttribute($term, self::TERM_VALUE_ATTR);

    return $value;
}
/**
 * Reads an attribute from a term element when it is present.
 *
 * @param DOMElement $term      Term element to read from
 * @param string     $attribute Name of the attribute to read
 * @return string|null The attribute value, or null when the element lacks the attribute
 */
private function getTermAttribute(DOMElement $term, $attribute)
{
    return $term->hasAttribute($attribute)
        ? $term->getAttribute($attribute)
        : null;
}
// DOM Helpers
/**
 * Keeps the nodes of a list for which the callback returns a truthy value.
 *
 * @param DOMNodeList $list     Nodes to examine, visited in list order
 * @param callable    $callback Invoked with each node as its only argument
 * @return array Accepted nodes, appended in visiting order
 */
private function filter(DOMNodeList $list, Callable $callback)
{
    $kept = [];
    foreach ($list as $candidate) {
        if (!call_user_func($callback, $candidate)) {
            continue;
        }
        $kept[] = $candidate;
    }

    return $kept;
}
/**
 * Returns the first entry of the list accepted by the callback.
 *
 * @param array    $list     Entries to examine, in iteration order
 * @param callable $callback Invoked with each entry as its only argument
 * @return mixed|null The first entry for which the callback is truthy, null when none is
 */
private function find(array $list, Callable $callback)
{
    foreach ($list as $candidate) {
        $accepted = call_user_func($callback, $candidate);
        if ($accepted) {
            return $candidate;
        }
    }

    return null;
}
} }