diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Exception/ThesaurusException.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Exception/ThesaurusException.php new file mode 100644 index 0000000000..bab7e3260a --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Exception/ThesaurusException.php @@ -0,0 +1,16 @@ +document = self::thesaurusFromDatabox($databox); $this->appbox = $appbox; } public function populateIndex(BulkOperation $bulk) { - // Helper to fetch record related data - //$recordHelper = new RecordHelper($this->appbox); + // TODO Create object to query thesaurus for term paths/synonyms + + $navigator = new Navigator(); foreach ($this->appbox->get_databoxes() as $databox) { - // TODO Create object to query thesaurus for term paths/synonyms - // TODO Extract record indexing logic in a RecordIndexer class - //$fetcher = new RecordFetcher($databox, $recordHelper); - //$fetcher->setBatchSize(200); + $document = self::thesaurusFromDatabox($databox); + $visitor = new TermVisitor(function ($term) use ($bulk) { + printf("- %s (%s)\n", $term['path'], $term['value']); + }); + $navigator->walk($document, $visitor); + while ($record = false) { $params = array(); $params['id'] = $record['id']; diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Navigator.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Navigator.php new file mode 100644 index 0000000000..0756a51e90 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/Navigator.php @@ -0,0 +1,57 @@ +isConcept($node)) { + $visitor->visitConcept($node); + foreach ($node->childNodes as $child) { + $this->walk($child, $visitor); + } + $visitor->leaveConcept($node); + } elseif ($this->isTerm($node)) { + $visitor->visitTerm($node); + } else { + foreach ($node->childNodes as $child) { + $this->walk($child, $visitor); + } + } + } + + private function isConcept(DOMNode $node) + { + return $node instanceof DOMElement && $node->tagName === self::CONCEPT_TAG_NAME; + } + + private function isTerm(DOMNode $node) + { + return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermVisitor.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermVisitor.php new file mode 100644 index 0000000000..489147a893 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/TermVisitor.php @@ -0,0 +1,145 @@ +termCallback = $termCallback; + } + + public function visitConcept(DOMElement $element) + { + array_push($this->path, $this->getConceptPathSegment($element)); + } + + public function visitTerm(DOMElement $element) + { + $value = $this->getTermValue($element); + + $term = $this->parseTermValue($value); + $term += [ + 'path' => $this->getCurrentPathAsString(), + 'lang' => $this->getTermAttribute($element, self::TERM_LANG_ATTR) + ]; + + call_user_func($this->termCallback, $term); + } + + public function leaveConcept(DOMElement $element) + { + array_pop($this->path); + } + + private function parseTermValue($value) + { + preg_match(self::TERM_REGEX, $value, $matches); + + return [ + 'raw_value' => $value, + 'value' => isset($matches[1]) ? $matches[1] : null, + 'context' => isset($matches[2]) ? $matches[2] : null + ]; + } + + private function getCurrentPathAsString() + { + return sprintf('/%s', implode('/', $this->path)); + } + + private function getConceptPathSegment(DOMElement $element) + { + // Path segment is named according to the first english term, and + // default to the first term. + $terms = $this->filter($element->childNodes, array($this, 'isTerm')); + $term = $this->find($terms, array($this, 'isPathLang')); + if (!$term) { + if (isset($terms[0])) { + $term = $terms[0]; + } else { + throw new ThesaurusException(sprintf('No term linked to concept at path "%s".', $element->getNodePath())); + } + } + + return StringUtils::slugify($this->getTermValue($term)); + } + + private function isTerm(DOMNode $node) + { + return $node instanceof DOMElement && $node->tagName === self::TERM_TAG_NAME; + } + + private function isPathLang(DOMElement $element) + { + return $element->getAttribute(self::TERM_LANG_ATTR) === self::PATH_LANG; + } + + private function getTermValue(DOMElement $term) + { + return $this->getTermAttribute($term, self::TERM_VALUE_ATTR); + } + + private function getTermAttribute(DOMElement $term, $attribute) + { + if ($term->hasAttribute($attribute)) { + return $term->getAttribute($attribute); + } + } + + // DOM Helpers + + private function filter(DOMNodeList $list, Callable $callback) + { + $filtered = []; + foreach ($list as $node) { + if (call_user_func($callback, $node)) { + $filtered[] = $node; + } + } + + return $filtered; + } + + private function find(array $list, Callable $callback) + { + foreach ($list as $node) { + if (call_user_func($callback, $node)) { + return $node; + } + } + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/VisitorInterface.php b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/VisitorInterface.php new file mode 100644 index 0000000000..9d463df6aa --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/Elastic/Thesaurus/VisitorInterface.php @@ -0,0 +1,21 @@ +