Thesaurus prefixes in field structure

Also fixes candidates collected from all string fields
This commit is contained in:
Mathieu Darse
2015-04-22 20:46:37 +02:00
parent f2082f5c51
commit c6075fcc1a
5 changed files with 106 additions and 102 deletions

View File

@@ -36,9 +36,9 @@ class ThesaurusHydrator implements HydratorInterface
// Fields with concept inference enabled
$structure = $this->helper->getFieldsStructure();
$fields = array();
foreach ($structure as $field => $options) {
foreach ($structure as $name => $options) {
if ($options['thesaurus_concept_inference']) {
$fields[$field] = $options['thesaurus_prefix'];
$fields[$name] = $options['thesaurus_prefixes'];
}
}
// Hydrate records with concepts
@@ -54,12 +54,13 @@ class ThesaurusHydrator implements HydratorInterface
}
$terms = array();
$fieldMap = array();
foreach ($fields as $field => $prefix) {
if (isset($record['caption'][$field])) {
foreach ($record['caption'][$field] as $value) {
$bulkFieldMap = array();
foreach ($fields as $name => $prefixes) {
if (isset($record['caption'][$name])) {
// Loop through all values to prepare bulk query
foreach ($record['caption'][$name] as $value) {
$terms[] = Term::parse($value);
$fieldMap[] = $field;
$bulkFieldMap[] = $name;
}
}
}
@@ -70,12 +71,12 @@ class ThesaurusHydrator implements HydratorInterface
foreach ($bulk as $offset => $item_concepts) {
if ($item_concepts) {
$field = $fieldMap[$offset];
$name = $bulkFieldMap[$offset];
foreach ($item_concepts as $concept) {
$record['concept_path'][$field][] = $concept->getPath();
$record['concept_path'][$name][] = $concept->getPath();
}
} else {
$this->candidateTerms->insert($field, $value);
$this->candidateTerms->insert($name, $value);
}
}
}

View File

@@ -169,13 +169,14 @@ class RecordHelper
$field['to_aggregate'] = (bool) $fieldStructure->isAggregable();
// Thesaurus concept inference
// $xpath = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
$helper = new ThesaurusHelper();
// TODO Not the real option yet
$field['thesaurus_concept_inference'] = $field['type'] === Mapping::TYPE_STRING;
// TODO Find thesaurus path prefixes
$field['thesaurus_prefix'] = '/categories';
// Concept inference is enabled only for string fields whose get_tbranch()
// value is a non-empty XPath expression; the matching thesaurus path prefixes
// are then resolved from that expression.
$xpath = $fieldStructure->get_tbranch();
// The previous `$xpath ==! ''` parsed as `$xpath == (!'')`, i.e. a loose
// comparison against `true`, not the intended "is not an empty string" check.
if ($field['type'] === Mapping::TYPE_STRING && is_string($xpath) && $xpath !== '') {
    $field['thesaurus_concept_inference'] = true;
    $field['thesaurus_prefixes'] = ThesaurusHelper::findPrefixesByXPath($databox, $xpath);
} else {
    $field['thesaurus_concept_inference'] = false;
    $field['thesaurus_prefixes'] = null;
}
//printf("Field \"%s\" <%s> (private: %b)\n", $name, $field['type'], $field['private']);

View File

@@ -11,27 +11,53 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use databox;
use DOMDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMXPath;
use Elasticsearch\Client;
class Helper
{
public function findNodesByXPath($document, $xpath)
const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v';
const PATH_LANG = 'en';
public static function findPrefixesByXPath(databox $databox, $expression)
{
$tbranch = "/thesaurus/te[@id='T26'] | /thesaurus/te[@id='T24']";
$xpath = new \DOMXPath($document);
$nodeList = $xpath->query($tbranch);
$conceptIds = [];
foreach ($nodeList as $node) {
if ($node->hasAttribute('id')) {
$conceptIds[] = $node->getAttribute('id');
$document = self::thesaurusFromDatabox($databox);
$xpath = new DOMXPath($document);
$nodes = $xpath->query($expression);
$prefixes = [];
foreach ($nodes as $node) {
$path_segments = [];
$me_and_parents = [$node];
foreach (self::getElementAncestors($node) as $me_and_parents[]);
foreach ($me_and_parents as $node) {
if (Navigator::isConcept($node)) {
$path_segments[] = self::conceptPathSegment($node);
} else {
// Silently skips invalid targeted nodes
break;
}
}
$prefixes[] = sprintf('/%s', implode('/', array_reverse($path_segments)));
}
return $prefixes;
}
/**
 * Lists the ancestors of a node, nearest parent first.
 *
 * The walk follows parentNode links until one evaluates to false, so every
 * node on the chain is included in the result.
 *
 * @param DOMElement $element Node whose ancestors are collected
 * @return DOMNode[] Ancestors ordered from the direct parent upwards
 */
private static function getElementAncestors(DOMElement $element)
{
    $ancestors = [];
    for ($current = $element->parentNode; $current; $current = $current->parentNode) {
        $ancestors[] = $current;
    }

    return $ancestors;
}
public static function thesaurusFromDatabox(databox $databox)
@@ -64,4 +90,49 @@ class Helper
return $document;
}
/**
 * Builds the path segment for a concept element.
 *
 * The segment is the slugified value attribute of the first child term
 * written in the path language (self::PATH_LANG); when no such term exists,
 * the first child term is used instead.
 *
 * @param DOMElement $element Concept element
 * @return mixed Result of StringUtils::slugify() on the chosen term value
 * @throws ThesaurusException When the concept has no child term at all
 */
public static function conceptPathSegment(DOMElement $element)
{
    $terms = self::filter($element->childNodes, array(Navigator::class, 'isTerm'));
    // Reference the class by name: array('self', ...) is a partially
    // supported callable form, deprecated as of PHP 8.2.
    $term = self::find($terms, array(self::class, 'isPathLang'));
    if (!$term) {
        if (!isset($terms[0])) {
            throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
        }
        // No term in the path language: default to the first term
        $term = $terms[0];
    }

    return StringUtils::slugify($term->getAttribute(self::TERM_VALUE_ATTR));
}
/**
 * Tells whether an element's language attribute equals the path language.
 *
 * @param DOMElement $element Element carrying a self::TERM_LANG_ATTR attribute
 * @return bool
 */
private static function isPathLang(DOMElement $element)
{
    $lang = $element->getAttribute(self::TERM_LANG_ATTR);

    return self::PATH_LANG === $lang;
}
// DOM Helpers
/**
 * Keeps the nodes of a list for which the callback returns a truthy value.
 *
 * @param DOMNodeList $list     Nodes to examine
 * @param callable    $callback Predicate receiving each node
 * @return array Matching nodes, re-indexed from 0 in document order
 */
private static function filter(DOMNodeList $list, callable $callback)
{
    $nodes = iterator_to_array($list, false);

    // array_filter() preserves keys; re-index to get a plain list
    return array_values(array_filter($nodes, $callback));
}
/**
 * Returns the first item of a list accepted by the callback.
 *
 * @param array    $list     Items to scan, in order
 * @param callable $callback Predicate receiving each item
 * @return mixed|null The first accepted item, or null when none matches
 */
private static function find(array $list, callable $callback)
{
    foreach ($list as $candidate) {
        if (!call_user_func($callback, $candidate)) {
            continue;
        }

        return $candidate;
    }

    return null;
}
}

View File

@@ -45,12 +45,12 @@ class Navigator
}
}
private function isConcept(DOMNode $node)
/**
 * Tells whether a node is an element whose tag is self::CONCEPT_TAG_NAME.
 *
 * @param DOMNode $node Node to test
 * @return bool
 */
public static function isConcept(DOMNode $node)
{
    if (!$node instanceof DOMElement) {
        return false;
    }

    return $node->tagName === self::CONCEPT_TAG_NAME;
}
private function isTerm(DOMNode $node)
/**
 * Tells whether a node is an element whose tag is self::TERM_TAG_NAME.
 *
 * @param DOMNode $node Node to test
 * @return bool
 */
public static function isTerm(DOMNode $node)
{
    if (!$node instanceof DOMElement) {
        return false;
    }

    return $node->tagName === self::TERM_TAG_NAME;
}

View File

@@ -11,24 +11,17 @@
namespace Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus;
use Alchemy\Phrasea\SearchEngine\Elastic\StringUtils;
use Alchemy\Phrasea\SearchEngine\Elastic\Exception\ThesaurusException;
use Alchemy\Phrasea\SearchEngine\Elastic\Thesaurus\Helper;
use Closure;
use Elasticsearch\Client;
use DOMDocument;
use DOMNodeList;
use DOMElement;
use DOMNode;
class TermVisitor implements VisitorInterface
{
const TERM_TAG_NAME = 'sy';
const TERM_ID_ATTR = 'id';
const TERM_LANG_ATTR = 'lng';
const TERM_VALUE_ATTR = 'v';
const PATH_LANG = 'en';
private $path = [];
private $termCallback;
@@ -39,20 +32,20 @@ class TermVisitor implements VisitorInterface
public function visitConcept(DOMElement $element)
{
array_push($this->path, $this->getConceptPathSegment($element));
array_push($this->path, Helper::conceptPathSegment($element));
}
public function visitTerm(DOMElement $element)
{
$raw_value = $this->getTermValue($element);
$raw_value = $element->getAttribute(self::TERM_VALUE_ATTR);
$object = Term::parse($raw_value);
$term = [
'raw_value' => $raw_value,
'value' => $object->getValue(),
'context' => $object->getContext(),
'path' => $this->getCurrentPathAsString(),
'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR),
'id' => $this->getTermAttribute($element, self::TERM_ID_ATTR)
'lang' => $element->getAttribute(self::TERM_LANG_ATTR),
'id' => $element->getAttribute(self::TERM_ID_ATTR)
];
call_user_func($this->termCallback, $term);
@@ -67,66 +60,4 @@ class TermVisitor implements VisitorInterface
{
return sprintf('/%s', implode('/', $this->path));
}
/**
 * Builds the path segment for a concept element.
 *
 * The first child term in the path language (self::PATH_LANG) is preferred;
 * otherwise the first child term is used. The term value is slugified.
 *
 * @param DOMElement $element Concept element
 * @return mixed Result of StringUtils::slugify() on the chosen term value
 * @throws ThesaurusException When the concept has no child term at all
 */
private function getConceptPathSegment(DOMElement $element)
{
    $terms = $this->filter($element->childNodes, array($this, 'isTerm'));
    $chosen = $this->find($terms, array($this, 'isPathLang'));

    if (!$chosen && !isset($terms[0])) {
        throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath()));
    }
    if (!$chosen) {
        // No term in the path language: default to the first term
        $chosen = $terms[0];
    }

    return StringUtils::slugify($this->getTermValue($chosen));
}
/**
 * Tells whether a node is an element whose tag is self::TERM_TAG_NAME.
 *
 * @param DOMNode $node Node to test
 * @return bool
 */
private function isTerm(DOMNode $node)
{
    if (!$node instanceof DOMElement) {
        return false;
    }

    return $node->tagName === self::TERM_TAG_NAME;
}
/**
 * Tells whether an element's language attribute equals the path language.
 *
 * @param DOMElement $element Element carrying a self::TERM_LANG_ATTR attribute
 * @return bool
 */
private function isPathLang(DOMElement $element)
{
    $lang = $element->getAttribute(self::TERM_LANG_ATTR);

    return self::PATH_LANG === $lang;
}
/**
 * Reads the value attribute (self::TERM_VALUE_ATTR) of a term element.
 *
 * @param DOMElement $term Term element
 * @return string|null Attribute value, or null when the attribute is absent
 */
private function getTermValue(DOMElement $term)
{
    $value = $this->getTermAttribute($term, self::TERM_VALUE_ATTR);

    return $value;
}
/**
 * Reads an attribute from a term element.
 *
 * @param DOMElement $term      Term element
 * @param string     $attribute Attribute name
 * @return string|null Attribute value, or null when the attribute is absent
 */
private function getTermAttribute(DOMElement $term, $attribute)
{
    return $term->hasAttribute($attribute)
        ? $term->getAttribute($attribute)
        : null;
}
// DOM Helpers
/**
 * Keeps the nodes of a list for which the callback returns a truthy value.
 *
 * @param DOMNodeList $list     Nodes to examine
 * @param callable    $callback Predicate receiving each node
 * @return array Matching nodes, indexed from 0 in document order
 */
private function filter(DOMNodeList $list, Callable $callback)
{
    $kept = [];
    foreach ($list as $candidate) {
        if (!call_user_func($callback, $candidate)) {
            continue;
        }
        $kept[] = $candidate;
    }

    return $kept;
}
/**
 * Returns the first item of a list accepted by the callback.
 *
 * @param array    $list     Items to scan, in order
 * @param callable $callback Predicate receiving each item
 * @return mixed|null The first accepted item, or null when none matches
 */
private function find(array $list, Callable $callback)
{
    $match = null;
    foreach ($list as $candidate) {
        if (call_user_func($callback, $candidate)) {
            $match = $candidate;
            break;
        }
    }

    return $match;
}
}