From a9ee52c6c852ea2f4d89a97d622201e8e5bb1279 Mon Sep 17 00:00:00 2001 From: Romain Neutron Date: Fri, 24 Aug 2012 11:27:02 +0200 Subject: [PATCH] [SearchEngine] Add refactored search engines --- .../Provider/SearchEngineServiceProvider.php | 31 + .../Service/SearchEngine/PhraseaEngine.php | 42 + .../Service/SearchEngine/SphinxSearch.php | 44 + .../Phrasea/SearchEngine/PhraseaEngine.php | 517 +++++ .../SearchEngine/PhraseaEngineQueryParser.php | 1954 +++++++++++++++++ .../SearchEngine/SearchEngineInterface.php | 155 ++ .../SearchEngine/SearchEngineOptions.php | 478 ++++ .../SearchEngine/SearchEngineResult.php | 108 + .../SearchEngine/SearchEngineSuggestion.php | 41 + .../Phrasea/SearchEngine/SphinxSearch.php | 689 ++++++ 10 files changed, 4059 insertions(+) create mode 100644 lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php create mode 100644 lib/Alchemy/Phrasea/Core/Service/SearchEngine/PhraseaEngine.php create mode 100644 lib/Alchemy/Phrasea/Core/Service/SearchEngine/SphinxSearch.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/PhraseaEngine.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/PhraseaEngineQueryParser.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/SearchEngineInterface.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/SearchEngineOptions.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/SearchEngineResult.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/SearchEngineSuggestion.php create mode 100644 lib/Alchemy/Phrasea/SearchEngine/SphinxSearch.php diff --git a/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php new file mode 100644 index 0000000000..d2596c1b02 --- /dev/null +++ b/lib/Alchemy/Phrasea/Core/Provider/SearchEngineServiceProvider.php @@ -0,0 +1,31 @@ +share(function($app) { + return $app['phraseanet.core']['SearchEngine']; + }); + } + + public function boot(Application $app) + { + + } +} diff --git 
a/lib/Alchemy/Phrasea/Core/Service/SearchEngine/PhraseaEngine.php b/lib/Alchemy/Phrasea/Core/Service/SearchEngine/PhraseaEngine.php new file mode 100644 index 0000000000..16198c93bd --- /dev/null +++ b/lib/Alchemy/Phrasea/Core/Service/SearchEngine/PhraseaEngine.php @@ -0,0 +1,42 @@ +searchEngine = new PhraseaSearchEngine(); + + return $this; + } + + public function getDriver() + { + return $this->searchEngine; + } + + public function getType() + { + return 'phrasea'; + } + + public function getMandatoryOptions() + { + return array(); + } +} diff --git a/lib/Alchemy/Phrasea/Core/Service/SearchEngine/SphinxSearch.php b/lib/Alchemy/Phrasea/Core/Service/SearchEngine/SphinxSearch.php new file mode 100644 index 0000000000..c7054a145e --- /dev/null +++ b/lib/Alchemy/Phrasea/Core/Service/SearchEngine/SphinxSearch.php @@ -0,0 +1,44 @@ +getOptions(); + + $this->searchEngine = new SphinxSearchEngine($options['host'], $options['port'], $options['rt_host'], $options['rt_port']); + + return $this; + } + + public function getDriver() + { + return $this->searchEngine; + } + + public function getType() + { + return 'sphinx-search'; + } + + public function getMandatoryOptions() + { + return array('host', 'port', 'rt_host', 'rt_port'); + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngine.php b/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngine.php new file mode 100644 index 0000000000..bdc8644aff --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngine.php @@ -0,0 +1,517 @@ +options = new SearchEngineOptions(); + } + + /** + * {@inheritdoc} + */ + public function status() + { + return true; + } + + /** + * {@inheritdoc} + */ + public function availableTypes() + { + return array(self::GEM_TYPE_RECORD, self::GEM_TYPE_STORY); + } + + /** + * {@inheritdoc} + */ + public function addRecord(\record_adapter $record) + { + return $this->updateRecord($record); + } + + /** + * {@inheritdoc} + */ + public function removeRecord(\record_adapter $record) + { + $connbas = 
$record->get_databox()->get_connection(); + + $sql = "DELETE FROM prop WHERE record_id = :record_id"; + $stmt = $connbas->prepare($sql); + $stmt->execute(array(':record_id' => $record->get_record_id())); + $stmt->closeCursor(); + + $sql = "DELETE FROM idx WHERE record_id = :record_id"; + $stmt = $connbas->prepare($sql); + $stmt->execute(array(':record_id' => $record->get_record_id())); + $stmt->closeCursor(); + + $sql = "DELETE FROM thit WHERE record_id = :record_id"; + $stmt = $connbas->prepare($sql); + $stmt->execute(array(':record_id' => $record->get_record_id())); + $stmt->closeCursor(); + + unset($stmt, $connbas); + + return $this; + } + + /** + * {@inheritdoc} + */ + public function updateRecord(\record_adapter $record) + { + $record->set_binary_status(\databox_status::dec2bin(bindec($record->get_status()) & ~7 | 4)); + + return $this; + } + + /** + * {@inheritdoc} + */ + public function addStory(\record_adapter $record) + { + return $this->updateRecord($record); + } + + /** + * {@inheritdoc} + */ + public function removeStory(\record_adapter $record) + { + return $this->removeRecord($record); + } + + /** + * {@inheritdoc} + */ + public function updateStory(\record_adapter $record) + { + return $this->updateRecord($record); + } + + /** + * {@inheritdoc} + */ + public function addFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine'); + } + + /** + * {@inheritdoc} + */ + public function removeFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine'); + } + + /** + * {@inheritdoc} + */ + public function updateFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine'); + } + + /** + * {@inheritdoc} + */ + public function setOptions(SearchEngineOptions $options) + { + $this->options = $options; + } + + /** + * {@inheritdoc} + */ + public function resetOptions() 
+ { + $this->options = new SearchEngineOptions(); + } + + /** + * {@inheritdoc} + */ + public function query($query, $offset, $perPage) + { + assert(is_int($offset)); + assert($offset >= 0); + assert(is_int($perPage)); + + if (trim($query) === '') { + $query = "all"; + } + + if ($this->options->getRecordType()) { + $query .= ' AND recordtype=' . $this->options->getRecordType(); + } + + $appbox = \appbox::get_instance(\bootstrap::getCore()); + $session = $appbox->get_session(); + + $sql = 'SELECT query, query_time, duration, total FROM cache WHERE session_id = :ses_id'; + $stmt = $appbox->get_connection()->prepare($sql); + $stmt->execute(array(':ses_id' => $session->get_ses_id())); + $row = $stmt->fetch(\PDO::FETCH_ASSOC); + $stmt->closeCursor(); + + $date_obj = new \DateTime('-10 min'); + $date_quest = new \DateTime($row['query_time']); + + if ($query != $row['query']) { + $this->resetCacheNextQuery = true; + } + if ($date_obj > $date_quest) { + $this->resetCacheNextQuery = true; + } + + if ($this->resetCacheNextQuery === true) { + phrasea_clear_cache($session->get_ses_id()); + $this->addQuery($query); + $this->executeQuery($query); + + $sql = 'SELECT query, query_time, duration, total FROM cache WHERE session_id = :ses_id'; + $stmt = $appbox->get_connection()->prepare($sql); + $stmt->execute(array(':ses_id' => $session->get_ses_id())); + $row = $stmt->fetch(\PDO::FETCH_ASSOC); + $stmt->closeCursor(); + } else { + /** + * @todo clean this in DB + */ + $this->total_available = $this->total_results = $session->get_session_prefs('phrasea_engine_n_results'); + } + + $res = phrasea_fetch_results( + $session->get_ses_id(), $offset + 1, $perPage, false + ); + + $rs = array(); + $error = _('Unable to execute query'); + + if (isset($res['results']) && is_array($res['results'])) { + $rs = $res['results']; + $error = ''; + } + + $resultNumber = $offset; + $records = new ArrayCollection(); + + foreach ($rs as $data) { + try { + $records->add(new \record_adapter( + 
\phrasea::sbasFromBas($data['base_id']),
+                    $data['record_id'],
+                    $resultNumber
+                ));
+            } catch (Exception $e) {
+
+            }
+            $resultNumber ++;
+        }
+
+
+        return new SearchEngineResult($records, $query, $row['duration'], $offset, $row['total'], $row['total'], $error, '', new ArrayCollection(), new ArrayCollection(), '');
+    }
+
+    /**
+     * Run the queries prepared in $this->queries, one phrasea_query2() call per sbas_id.
+     *
+     * For every entry a row is inserted into log_search through the connection
+     * obtained for that sbas_id. Once all entries have been processed, the summed
+     * duration and answer count are written to the `cache` row of the current
+     * session and the query is handed to \User_Adapter::saveQuery().
+     *
+     * @param string $query the query text, as logged and stored in the cache row
+     *
+     * @return PhraseaEngine
+     */
+    private function executeQuery($query)
+    {
+        $appbox = \appbox::get_instance(\bootstrap::getCore());
+        $session = $appbox->get_session();
+        $registry = $appbox->get_registry();
+
+        $dateLog = date("Y-m-d H:i:s");
+        $nbanswers = $total_time = 0;
+        $sort = '';
+
+        if ($this->options->sortBy()) {
+            switch ($this->options->sortOrder()) {
+                case SearchEngineOptions::SORT_MODE_ASC:
+                    $sort = '+';
+                    break;
+                case SearchEngineOptions::SORT_MODE_DESC:
+                default:
+                    $sort = '-';
+                    break;
+            }
+            $sort .= '0' . $this->options->sortBy();
+        }
+
+        foreach ($this->queries as $sbas_id => $qry) {
+            $BF = array();
+
+            foreach ($this->options->businessFieldsOn() as $collection) {
+                $BF[] = $collection->get_base_id();
+            }
+
+            $results = phrasea_query2(
+                $session->get_ses_id()
+                , $sbas_id
+                , $this->colls[$sbas_id]
+                , $this->arrayq[$sbas_id]
+                , $registry->get('GV_sit')
+                , (string) $session->get_usr_id()
+                , false
+                , $this->options->searchType() == SearchEngineOptions::RECORD_GROUPING ? PHRASEA_MULTIDOC_REGONLY : PHRASEA_MULTIDOC_DOCONLY
+                , $sort
+                , $BF
+            );
+
+            // Answer count for this sbas_id only; stays 0 when phrasea_query2 returned a falsy value,
+            // so the log row below never reads an index on a non-array.
+            $sbasAnswers = 0;
+
+            if ($results) {
+                $total_time += $results['time_all'];
+                $sbasAnswers = $results["nbanswers"];
+                $nbanswers += $sbasAnswers;
+            }
+
+            $logger = $session->get_logger($appbox->get_databox($sbas_id));
+
+            $conn2 = \connection::getPDOConnection($sbas_id);
+
+            $sql3 = "INSERT INTO log_search
+               (id, log_id, date, search, results, coll_id )
+               VALUES
+               (null, :log_id, :date, :query, :nbresults, :colls)";
+
+            $params = array(
+                ':log_id'    => $logger->get_id()
+                , ':date'      => $dateLog
+                , ':query'     => $query
+                , ':nbresults' => $sbasAnswers
+                , ':colls'     => implode(',', $this->colls[$sbas_id])
+            );
+
+            $stmt = $conn2->prepare($sql3);
+            $stmt->execute($params);
+            $stmt->closeCursor();
+        }
+
+        $sql = 'UPDATE cache
+                SET query = :query, query_time = NOW(), duration = :duration, total = :total
+                WHERE session_id = :ses_id';
+
+        // every placeholder carries its leading colon, like the other statements of this class
+        $params = array(
+            ':query'    => $query,
+            ':ses_id'   => $session->get_ses_id(),
+            ':duration' => $total_time,
+            ':total'    => $nbanswers,
+        );
+
+        $stmt = $appbox->get_connection()->prepare($sql);
+        $stmt->execute($params);
+        $stmt->closeCursor();
+
+        \User_Adapter::saveQuery($query);
+
+        return $this;
+    }
+
+    /**
+     * {@inheritdoc}
+     */
+    public function autocomplete($query)
+    {
+        return new ArrayCollection();
+    }
+
+    /**
+     * {@inheritdoc}
+     */
+    public function excerpt($query, $fields, \record_adapter $record)
+    {
+        $ret = array();
+
+        $appbox = \appbox::get_instance(\bootstrap::getCore());
+        $session = $appbox->get_session();
+        $res = phrasea_fetch_results(
+            $session->get_ses_id(), ($record->get_number() + 1), 1, true, "[[em]]", "[[/em]]"
+        );
+
+        if ( ! isset($res['results']) || ! is_array($res['results'])) {
+            return array();
+        }
+
+        $rs = $res['results'];
+        $res = array_shift($rs);
+        if ( ! 
isset($res['xml'])) { + return array(); + } + + $sxe = simplexml_load_string($res['xml']); + + foreach ($fields as $name => $field) { + if ($sxe && $sxe->description && $sxe->description->$name) { + $val = array(); + foreach ($sxe->description->$name as $value) { + $val[] = str_replace(array('[[em]]', '[[/em]]'), array('', ''), (string) $value); + } + $separator = $field['separator'] ? $field['separator'][0] : ''; + $val = implode(' ' . $separator . ' ', $val); + } else { + $val = $field['value']; + } + + $ret[] = $val; + } + + return $ret; + } + + /** + * {@inheritdoc} + */ + public function resetCache() + { + $this->resetCacheNextQuery = true; + $this->queries = $this->arrayq = $this->colls = $this->qp = $this->needthesaurus = array(); + + return $this; + } + + private function addQuery($query) + { + foreach ($this->options->databoxes() as $databox) { + $this->queries[$databox->get_sbas_id()] = $query; + } + + $status = $this->options->getStatus(); + + foreach ($this->queries as $sbas => $qs) { + if ($status) { + $requestStat = 'xxxx'; + + for ($i = 4; ($i <= 64); $i ++ ) { + if ( ! isset($status[$i])) { + $requestStat = 'x' . $requestStat; + continue; + } + $val = 'x'; + if (isset($status[$i][$sbas])) { + if ($status[$i][$sbas] == '0') { + $val = '0'; + } elseif ($status[$i][$sbas] == '1') { + $val = '1'; + } + } + $requestStat = $val . $requestStat; + } + + $requestStat = ltrim($requestStat, 'x'); + + if ($requestStat !== '') { + $this->queries[$sbas] .= ' AND (recordstatus=' . $requestStat . ')'; + } + } + if ($this->options->fields()) { + $this->queries[$sbas] .= ' IN (' . implode(' OR ', $this->options->fields()) . ')'; + } + if (($this->options->getMinDate() || $this->options->getMaxDate()) && $this->options->getDateFields()) { + if ($this->options->getMinDate()) { + $this->queries[$sbas] .= ' AND ( ' . implode(' >= ' . $this->options->getMinDate()->format('Y-m-d') . ' OR ', $this->options->getDateFields()) . ' >= ' . 
$this->options->getMinDate()->format('Y-m-d') . ' ) '; + } + if ($this->options->getMaxDate()) { + $this->queries[$sbas] .= ' AND ( ' . implode(' <= ' . $this->options->getMaxDate()->format('Y-m-d') . ' OR ', $this->options->getDateFields()) . ' <= ' . $this->options->getMaxDate()->format('Y-m-d') . ' ) '; + } + } + } + + $this->singleParse('main', $query); + + foreach ($this->queries as $sbas => $db_query) { + $this->singleParse($sbas, $query); + } + + $base_ids = array_map(function(\collection $collection) { + return $collection->get_base_id(); + }, $this->options->collections()); + + foreach ($this->options->databoxes() as $databox) { + $sbas_id = $databox->get_sbas_id(); + + $this->colls[$sbas_id] = array(); + + foreach ($databox->get_collections() as $collection) { + if (in_array($collection->get_base_id(), $base_ids)) { + $this->colls[$sbas_id][] = $collection->get_base_id(); + } + } + + if (sizeof($this->colls[$sbas_id]) <= 0) { + continue; + } + + if ($this->needthesaurus[$sbas_id]) { + if ($databox->get_dom_thesaurus()) { + $this->qp[$sbas_id]->thesaurus2($this->indep_treeq[$sbas_id], $sbas_id, $databox->get_dbname(), $databox->get_dom_thesaurus(), true); + $this->qp['main']->thesaurus2($this->indep_treeq['main'], $sbas_id, $databox->get_dbname(), $databox->get_dom_thesaurus(), true); + } + } + + $emptyw = false; + + $this->qp[$sbas_id]->set_default($this->indep_treeq[$sbas_id], $emptyw); + $this->qp[$sbas_id]->distrib_in($this->indep_treeq[$sbas_id]); + $this->qp[$sbas_id]->factor_or($this->indep_treeq[$sbas_id]); + $this->qp[$sbas_id]->setNumValue($this->indep_treeq[$sbas_id], $databox->get_sxml_structure()); + $this->qp[$sbas_id]->thesaurus2_apply($this->indep_treeq[$sbas_id], $sbas_id); + $this->arrayq[$sbas_id] = $this->qp[$sbas_id]->makequery($this->indep_treeq[$sbas_id]); + } + + return $this; + } + + private function singleParse($sbas, $query) + { + $this->qp[$sbas] = new PhraseaEngineQueryParser($this->options->getLocale()); + 
$this->qp[$sbas]->debug = false; + + if ($sbas == 'main') { + $simple_treeq = $this->qp[$sbas]->parsequery($query); + } else { + $simple_treeq = $this->qp[$sbas]->parsequery($this->queries[$sbas]); + } + + $this->qp[$sbas]->priority_opk($simple_treeq); + $this->qp[$sbas]->distrib_opk($simple_treeq); + $this->needthesaurus[$sbas] = false; + + $this->indep_treeq[$sbas] = $this->qp[$sbas]->extendThesaurusOnTerms($simple_treeq, true, true, false); + $this->needthesaurus[$sbas] = $this->qp[$sbas]->containsColonOperator($this->indep_treeq[$sbas]); + + return $this; + } +} + diff --git a/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngineQueryParser.php b/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngineQueryParser.php new file mode 100644 index 0000000000..49b96bc8ab --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/PhraseaEngineQueryParser.php @@ -0,0 +1,1954 @@ + array("NODETYPE" => PHRASEA_OP_AND, "CANNUM" => false), + "and" => array("NODETYPE" => PHRASEA_OP_AND, "CANNUM" => false), + "ou" => array("NODETYPE" => PHRASEA_OP_OR, "CANNUM" => false), + "or" => array("NODETYPE" => PHRASEA_OP_OR, "CANNUM" => false), + "sauf" => array("NODETYPE" => PHRASEA_OP_EXCEPT, "CANNUM" => false), + "except" => array("NODETYPE" => PHRASEA_OP_EXCEPT, "CANNUM" => false), + "pres" => array("NODETYPE" => PHRASEA_OP_NEAR, "CANNUM" => true), + "near" => array("NODETYPE" => PHRASEA_OP_NEAR, "CANNUM" => true), + "avant" => array("NODETYPE" => PHRASEA_OP_BEFORE, "CANNUM" => true), + "before" => array("NODETYPE" => PHRASEA_OP_BEFORE, "CANNUM" => true), + "apres" => array("NODETYPE" => PHRASEA_OP_AFTER, "CANNUM" => true), + "after" => array("NODETYPE" => PHRASEA_OP_AFTER, "CANNUM" => true), + "dans" => array("NODETYPE" => PHRASEA_OP_IN, "CANNUM" => false), + "in" => array("NODETYPE" => PHRASEA_OP_IN, "CANNUM" => false) + ); + public $opk = array( + "<" => array("NODETYPE" => PHRASEA_OP_LT, "CANNUM" => false), + ">" => array("NODETYPE" => PHRASEA_OP_GT, "CANNUM" => false), + "<=" => array("NODETYPE" => 
PHRASEA_OP_LEQT, "CANNUM" => false), + ">=" => array("NODETYPE" => PHRASEA_OP_GEQT, "CANNUM" => false), + "<>" => array("NODETYPE" => PHRASEA_OP_NOTEQU, "CANNUM" => false), + "=" => array("NODETYPE" => PHRASEA_OP_EQUAL, "CANNUM" => false), + ":" => array("NODETYPE" => PHRASEA_OP_COLON, "CANNUM" => false) + ); + public $spw = array( + "all" => array( + "CLASS" => "PHRASEA_KW_ALL", "NODETYPE" => PHRASEA_KW_ALL, "CANNUM" => false + ), + "last" => array( + "CLASS" => "PHRASEA_KW_LAST", "NODETYPE" => PHRASEA_KW_LAST, "CANNUM" => true + ), + // "first" => array("CLASS"=>PHRASEA_KW_FIRST, "CANNUM"=>true), + // "premiers" => array("CLASS"=>PHRASEA_KW_FIRST, "CANNUM"=>true), + "tout" => array( + "CLASS" => "PHRASEA_KW_ALL", "NODETYPE" => PHRASEA_KW_ALL, "CANNUM" => false + ), + "derniers" => array( + "CLASS" => "PHRASEA_KW_LAST", "NODETYPE" => PHRASEA_KW_LAST, "CANNUM" => true + ) + ); + public $quoted_defaultop = array( + "VALUE" => "default_avant", "NODETYPE" => PHRASEA_OP_BEFORE, "PNUM" => 0 + ); + public $defaultop = array( + "VALUE" => "and", "NODETYPE" => PHRASEA_OP_AND, "PNUM" => NULL + ); + public $defaultlast = 12; + public $phq; + public $errmsg = ""; + + /** + * + * @var boolean + */ + public $debug = false; + + /** + * un tableau qui contiendra des propositions de thesaurus + * pour les termes de l'arbre simple + * + * @var array + */ + public $proposals = Array("QRY" => "", "BASES" => array(), "QUERIES" => array()); + + /** + * Current language for thesaurus + * @var + */ + public $lng = null; + protected $unicode; + + public function __construct($lng = "???") + { + $this->lng = $lng; + $this->unicode = new \unicode(); + + return $this; + } + + public function mb_trim($s, $encoding) + { + return(trim($s)); + } + + public function mb_ltrim($s, $encoding) + { + return(ltrim($s)); + } + + public function parsequery($phq) + { + if ($this->debug) { + for ($i = 0; $i < mb_strlen($phq, 'UTF-8'); $i ++ ) { + $c = mb_substr($phq, $i, 1, 'UTF-8'); + printf("// %s : '%s' 
(%d octets)\n", $i, $c, strlen($c)); + } + } + + $this->proposals = Array("QRY" => "", "BASES" => array(), "QUERIES" => array()); + $this->phq = $this->mb_trim($phq, 'UTF-8'); + if ($this->phq != "") { + return($this->maketree(0)); + } else { + + if ($this->errmsg != "") { + $this->errmsg .= sprintf("\\n"); + } + + $this->errmsg .= _('qparser::la question est vide'); + + return(null); + } + } + + public function astext($tree) + { + switch ($tree["CLASS"]) { + case "SIMPLE": + if (is_array($tree["VALUE"])) { + return(implode(" ", $tree["VALUE"])); + } else { + + return($tree["VALUE"]); + } + break; + case "QSIMPLE": + if (is_array($tree["VALUE"])) { + return("\"" . implode(" ", $tree["VALUE"]) . "\""); + } else { + return("\"" . $tree["VALUE"] . "\""); + } + break; + case "PHRASEA_KW_ALL": + return($tree["VALUE"][0]); + break; + case "PHRASEA_KW_LAST": + if ($tree["PNUM"] !== null) { + return("" . $tree["VALUE"][0] . "[" . $tree["PNUM"] . "]"); + } else { + return($tree["VALUE"][0]); + } + break; + case "OPS": + case "OPK": + if (isset($tree["PNUM"])) { + return("(" . $this->astext($tree["LB"]) . " " . $tree["VALUE"] . "[" . $tree["PNUM"] . "] " . $this->astext($tree["RB"]) . ")"); + } else { + return("(" . $this->astext($tree["LB"]) . " " . $tree["VALUE"] . " " . $this->astext($tree["RB"]) . ")"); + } + break; + } + } + + public function astable(&$tree) + { + $this->calc_complexity($tree); + $txt = ""; + $this->astable2($txt, $tree); + $txt = "\n\n" . $txt . "\n
\n"; + + return($txt); + } + + public function calc_complexity(&$tree) + { + if ($tree) { + if ($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") { + return($tree["COMPLEXITY"] = $this->calc_complexity($tree["LB"]) + $this->calc_complexity($tree["RB"])); + } else { + return($tree["COMPLEXITY"] = 1); + } + } + } + + public function astable2(&$out, &$tree, $depth = 0) + { + switch ($tree["CLASS"]) { + case "SIMPLE": + if (is_array($tree["VALUE"])) + $txt = implode(" ", $tree["VALUE"]); + else + $txt = $tree["VALUE"]; + $out .= "\t" . $txt . "\n"; + break; + case "QSIMPLE": + if (is_array($tree["VALUE"])) + $txt = implode(" ", $tree["VALUE"]); + else + $txt = $tree["VALUE"]; + $out .= "\t"" . $txt . ""\n"; + break; + case "PHRASEA_KW_ALL": + $out .= "\t" . $tree["VALUE"][0] . "\n"; + break; + case "PHRASEA_KW_LAST": + if ($tree["PNUM"] !== null) + $out .= "\t" . $tree["VALUE"][0] . "[" . $tree["PNUM"] . "]" . "\n"; + else + $out .= "\t" . $tree["VALUE"][0] . "\n"; + break; + case "OPS": + case "OPK": + $op = $tree["VALUE"]; + if (isset($tree["PNUM"])) + $op .= "[" . $tree["PNUM"] . "]"; + $out .= "\t$op\n"; + $this->astable2($out, $tree["LB"], $depth + 1); + $this->astable2($out, $tree["RB"], $depth + 1); + $out .= "\n\n"; + break; + } + } + + public function dumpDiv(&$tree) + { + print("
\n"); + $this->dumpDiv2($tree); + print("
\n");
+    }
+
+    /**
+     * Print an indented debug dump of a query tree node to standard output.
+     *
+     * Leaf nodes (SIMPLE, QSIMPLE, PHRASEA_KW_ALL, PHRASEA_KW_LAST) are printed
+     * on a single line; operator nodes (OPS, OPK) print their operator and
+     * recurse into the "LB" and "RB" branches with $depth + 1.
+     *
+     * @param array   $tree  query tree node, read through its "CLASS", "VALUE", "PNUM", "LB" and "RB" keys
+     * @param integer $depth nesting level, used as the number of leading tabs
+     */
+    public function dumpDiv2(&$tree, $depth = 0)
+    {
+        switch ($tree["CLASS"]) {
+            case "SIMPLE":
+                if (is_array($tree["VALUE"]))
+                    $s = implode(" , ", $tree["VALUE"]);
+                else
+                    $s = $tree["VALUE"];
+                print(str_repeat("\t", $depth) . "" . $s . "\n");
+                // a SIMPLE term used to fall through here and was printed a second time as QSIMPLE
+                break;
+            case "QSIMPLE":
+                $s = "";
+                if (is_array($tree["VALUE"]))
+                    $s = implode(" , ", $tree["VALUE"]);
+                else
+                    $s = $tree["VALUE"];
+                print(str_repeat("\t", $depth) . """ . $s . ""\n");
+                break;
+            case "PHRASEA_KW_ALL":
+                printf(str_repeat("\t", $depth) . "%s\n", $tree["VALUE"][0]);
+                break;
+            case "PHRASEA_KW_LAST":
+                if ($tree["PNUM"] !== null)
+                    printf(str_repeat("\t", $depth) . "%s %s\n", $tree["VALUE"][0], $tree["PNUM"]);
+                else
+                    printf(str_repeat("\t", $depth) . "%s\n", $tree["VALUE"][0]);
+                break;
+            //   case PHRASEA_KW_FIRST:
+            //    if($tree["PNUM"]!==null)
+            //     printf("%s %s", $tree["VALUE"], $tree["PNUM"]);
+            //    else
+            //     printf("%s", $tree["VALUE"]);
+            //    break;
+            case "OPS":
+            case "OPK":
+                print(str_repeat("\t", $depth) . "
\n"); + $this->dumpDiv2($tree["LB"], $depth + 1); + print(str_repeat("\t", $depth) . "
\n"); + print(str_repeat("\t", $depth) . "
\n"); + if (isset($tree["PNUM"])) + printf(str_repeat("\t", $depth + 1) . " %s[%s]\n", $tree["VALUE"], $tree["PNUM"]); + else + printf(str_repeat("\t", $depth + 1) . " %s\n", $tree["VALUE"]); + print(str_repeat("\t", $depth) . "
\n"); + print(str_repeat("\t", $depth) . "
\n"); + $this->dumpDiv2($tree["RB"], $depth + 1); + print(str_repeat("\t", $depth) . "
\n"); + + break; + } + } + + public function dump($tree) + { + switch ($tree["CLASS"]) { + case "SIMPLE": + if (is_array($tree["VALUE"])) + $s = implode("
, ", $tree["VALUE"]); + else + $s = $tree["VALUE"]; + print("" . $s . ""); + break; + case "QSIMPLE": + if (is_array($tree["VALUE"])) + $s = implode(" , ", $tree["VALUE"]); + else + $s = $tree["VALUE"]; + print(""" . $s . """); + break; + case "PHRASEA_KW_ALL": + printf("%s", $tree["VALUE"][0]); + break; + case "PHRASEA_KW_LAST": + if ($tree["PNUM"] !== null) + printf("%s %s", $tree["VALUE"][0], $tree["PNUM"]); + else + printf("%s", $tree["VALUE"][0]); + break; + // case PHRASEA_KW_FIRST: + // if($tree["PNUM"]!==null) + // printf("%s %s", $tree["VALUE"], $tree["PNUM"]); + // else + // printf("%s", $tree["VALUE"]); + // break; + case "OPS": + case "OPK": + print(""); + print(""); + print(""); + print(""); + print(""); + print(""); + print(""); + print(""); + print("
"); + if (isset($tree["PNUM"])) + printf(" %s[%s] ", $tree["VALUE"], $tree["PNUM"]); + else + printf(" %s ", $tree["VALUE"]); + print("
"); + print($this->dump($tree["LB"])); + print(""); + print($this->dump($tree["RB"])); + print("
"); + break; + } + } + + public function priority_opk(&$tree, $depth = 0) + { + if ( ! $tree) { + return; + } + + if ($tree["CLASS"] == "OPK" && ($tree["LB"]["CLASS"] == "OPS" || $tree["LB"]["CLASS"] == "OPK")) { + // on a un truc du genre ((a ou b) < 5), on le transforme en (a ou (b < 5)) + $t = $tree["LB"]; + $tree["LB"] = $t["RB"]; + $t["RB"] = $tree; + $tree = $t; + } + if (isset($tree["LB"])) { + $this->priority_opk($tree["LB"], $depth + 1); + }if (isset($tree["RB"])) { + $this->priority_opk($tree["RB"], $depth + 1); + } + } + + public function distrib_opk(&$tree, $depth = 0) + { + if ( ! $tree) { + return; + } + + if ($tree["CLASS"] == "OPK" && ($tree["RB"]["CLASS"] == "OPS")) { + // on a un truc du genre (a = (5 ou 6)), on le transforme en ((a = 5) ou (a = 6)) + $tmp = array("CLASS" => $tree["CLASS"], + "NODETYPE" => $tree["NODETYPE"], + "VALUE" => $tree["VALUE"], + "PNUM" => $tree["PNUM"], + "LB" => $tree["LB"], + "RB" => $tree["RB"]["RB"], + "DEPTH" => $tree["LB"]["DEPTH"]); + $t = $tree["RB"]; + $tree["RB"] = $t["LB"]; + $t["LB"] = $tree; + $t["RB"] = $tmp; + $tree = $t; + } + if (isset($tree["LB"])) + $this->distrib_opk($tree["LB"], $depth + 1); + if (isset($tree["RB"])) + $this->distrib_opk($tree["RB"], $depth + 1); + } + + public function thesaurus2_apply(&$tree, $bid) + { + if ( ! $tree) { + return; + } + + if (($tree["CLASS"] == "SIMPLE" || $tree["CLASS"] == "QSIMPLE") && isset($tree["SREF"]) && isset($tree["SREF"]["TIDS"])) { + $tids = array(); + foreach ($tree["SREF"]["TIDS"] as $tid) { + if ($tid["bid"] == $bid) + $tids[] = $tid["pid"]; + } + if (count($tids) >= 1) { + /* + if (count($tids)==1) { + // on cherche un id simple, on utilisera la syntaxe sql 'like' (l'extension repérera elle méme la syntaxe car la value finit par '%') + $val = str_replace(".", "d", $tids[0]) . 
"d%";
+                    $tree["VALUE"] = array($val);
+                } else {
+                    // on cherche plusieurs id's, on utilisera la syntaxe 'regexp' (l'extension repérera elle méme la syntaxe car la value finit par '$'
+                    $val = "";
+                    foreach($tids as $tid)
+                        $val .= ($val?"|":"") . "(" . str_replace(".", "d", $tid) . "d.*)";
+                    $tree["VALUE"] = array("^" . $val);
+                }
+                */
+                $tree["VALUE"] = array();
+                foreach ($tids as $tid)
+                    $tree["VALUE"][] = str_replace(".", "d", $tid) . "d%";
+            } else {
+                // the term is not in the thesaurus
+            }
+            /*
+             */
+        }
+        if ($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") {
+            $this->thesaurus2_apply($tree["LB"], $bid);
+            $this->thesaurus2_apply($tree["RB"], $bid);
+        }
+    }
+
+    /**
+     * Extend (or replace) the search on simple terms with a thesaurus search.
+     *
+     * Example for "(a and b)":
+     *   full-text only         ==> (a and b)
+     *   thesaurus only         ==> ((th:a) and (th:b))
+     *   full-text + thesaurus  ==> ((a or (th:a)) and (b or (th:b)))
+     *
+     * The extended tree is built in a copy and returned. The tree passed in is
+     * still annotated in place ("PATH" / "CONTEXT" keys) by the helpers, and
+     * $this->proposals["QRY"] is refreshed from it.
+     *
+     * @param array   $tree         parsed query tree
+     * @param boolean $useFullText  keep the full-text branch next to the thesaurus branch
+     * @param boolean $useThesaurus when false the tree is returned unchanged
+     * @param boolean $keepfuzzy    when false, terms without context get the "*" context
+     *
+     * @return array the extended tree
+     */
+    public function extendThesaurusOnTerms(&$tree, $useFullText, $useThesaurus, $keepfuzzy)
+    {
+        $copy = $tree;
+        $this->_extendThesaurusOnTerms($tree, $copy, $useFullText, $useThesaurus, $keepfuzzy, 0, "");
+
+        $this->proposals["QRY"] = "" . $this->_queryAsHTML($tree) . "";
+
+        return($copy);
+    }
+
+    /**
+     * Recursive worker of extendThesaurusOnTerms().
+     *
+     * - SIMPLE / QSIMPLE node: $copy is replaced by the result of _extendToThesaurus_Simple();
+     *   a term that already has a "CONTEXT" never keeps its full-text branch.
+     * - OPK node with the colon operator ("field:value"): only the right branch is annotated
+     *   ("PATH", "CONTEXT", and a "SREF" reference back to the original node).
+     * - OPS node with AND / OR / EXCEPT: both branches are visited, left first.
+     * - any other node is left untouched.
+     *
+     * @param array   $tree         node of the original tree (annotated in place)
+     * @param array   $copy         matching node of the copy being built
+     * @param boolean $useFullText
+     * @param boolean $useThesaurus
+     * @param boolean $keepfuzzy
+     * @param integer $depth        recursion depth, 0 for the root
+     * @param string  $path         path from the root, made of "L" and "R" steps
+     */
+    public function _extendThesaurusOnTerms(&$tree, &$copy, $useFullText, $useThesaurus, $keepfuzzy, $depth, $path)
+    {
+        if ( ! $useThesaurus) {
+            return; // full-text only: unchanged
+        }
+
+        if (($tree["CLASS"] == "SIMPLE" || $tree["CLASS"] == "QSIMPLE")) {
+            if (isset($tree["CONTEXT"]))
+                $copy = $this->_extendToThesaurus_Simple($tree, false, $keepfuzzy, $path);
+            else
+                $copy = $this->_extendToThesaurus_Simple($tree, $useFullText, $keepfuzzy, $path);
+        } else {
+            if ($tree["CLASS"] == "OPK" && $tree["NODETYPE"] == PHRASEA_OP_COLON) {
+                // 'field:value': only 'value' is processed
+                $tree["RB"]["PATH"] = $copy["RB"]["PATH"] = $path . "R";
+                if (isset($tree["RB"]["CONTEXT"]))
+                    $copy["CONTEXT"] = $tree["CONTEXT"] = $tree["RB"]["CONTEXT"];
+                else
+                    if ( ! $keepfuzzy)
+                        $copy["CONTEXT"] = $tree["CONTEXT"] = "*";
+
+                $copy["RB"]["SREF"] = &$tree["RB"];
+            } else {
+                // branches on both sides of 'AND', 'OR', 'EXCEPT' are visited, left before right
+                if ($tree["CLASS"] == "OPS" && ($tree["NODETYPE"] == PHRASEA_OP_AND || $tree["NODETYPE"] == PHRASEA_OP_OR || $tree["NODETYPE"] == PHRASEA_OP_EXCEPT)) {
+                    $this->_extendThesaurusOnTerms($tree["LB"], $copy["LB"], $useFullText, $useThesaurus, $keepfuzzy, $depth + 1, $path . "L");
+                    $this->_extendThesaurusOnTerms($tree["RB"], $copy["RB"], $useFullText, $useThesaurus, $keepfuzzy, $depth + 1, $path . "R");
+                }
+            }
+        }
+    }
+
+    // extends (or replaces) a term searched in 'full-text' into a thesaurus search (no field given, whole thesaurus = '*')
+    // the context, if any, is carried over to the ':' operator
+    // ex : a[k] ==> (a or (TH :[k] a))
+    public function _extendToThesaurus_Simple(&$simple, $keepFullText, $keepfuzzy, $path)
+    {
+        $simple["PATH"] = $path;
+        $context = null;
+        if (isset($simple["CONTEXT"])) {
+            $context = $simple["CONTEXT"];
+            // unset($simple["CONTEXT"]);
+        }
+        if ($keepFullText) {
+            // OR the full-text search with a thesaurus search
+            $tmp = array("CLASS"    => "OPS",
+                "NODETYPE" => PHRASEA_OP_OR,
+                "VALUE"    => "OR",
+                "PNUM"     => null,
+                "DEPTH"    => $simple["DEPTH"],
+                "LB"       => $simple,
+                "RB"       => array("CLASS"    => "OPK",
+                    "NODETYPE" => PHRASEA_OP_COLON,
+                    "VALUE"    => ":",
+                    // "CONTEXT"=>$context,
+                    "PNUM"     => null,
+                    "DEPTH"    => $simple["DEPTH"] + 1,
+                    "LB"       => array("CLASS"    => "SIMPLE",
+                        "NODETYPE" => PHRASEA_KEYLIST,
+                        "VALUE"    => array("*"),
+                        "DEPTH"    => 
$simple["DEPTH"] + 2 + ), + "RB" => $simple + ) + ); + // on vire le contexte du coté fulltext + unset($tmp["LB"]["CONTEXT"]); + // ajoute le contexte si nécéssaire + if ($context !== null) + $tmp["RB"]["CONTEXT"] = $context; + else + if ( ! $keepfuzzy) + $tmp["RB"]["CONTEXT"] = "*"; + // corrige les profondeurs des 2 copies du 'simple' d'origine + $tmp["LB"]["DEPTH"] += 1; + $tmp["RB"]["RB"]["DEPTH"] += 2; + // note une référence vers le terme d'origine + $tmp["RB"]["RB"]["SREF"] = &$simple; + $tmp["RB"]["RB"]["PATH"] = $path; + } else { + // on remplace le ft par du th + $tmp = array("CLASS" => "OPK", + "NODETYPE" => PHRASEA_OP_COLON, + "VALUE" => ":", + // "CONTEXT"=>$context, + "PNUM" => null, + "DEPTH" => $simple["DEPTH"] + 1, + "LB" => array("CLASS" => "SIMPLE", + "NODETYPE" => PHRASEA_KEYLIST, + "VALUE" => array("*"), + "DEPTH" => $simple["DEPTH"] + 1 + ), + "RB" => $simple + ); + // ajoute le contexte si nécéssaire + if ($context !== null) + $tmp["CONTEXT"] = $context; + else + if ( ! $keepfuzzy) + $tmp["CONTEXT"] = "*"; + // corrige la profondeur de la copie du 'simple' d'origine + $tmp["RB"]["DEPTH"] += 1; + // note une référence vers le terme d'origine + $tmp["RB"]["SREF"] = &$simple; + $tmp["RB"]["PATH"] = $path; + } + + return($tmp); + } + + public function thesaurus2(&$tree, $bid, $name, &$domthe, $searchsynonyms = true, $depth = 0) + { + if ($this->debug) + print("thesaurus2:\n\$tree=" . var_export($tree, true) . "\n"); + + if ($depth == 0) + $this->proposals["BASES"]["b$bid"] = array("BID" => $bid, "NAME" => $name, "TERMS" => array()); + + if ( ! 
$tree) { + return(0); + } + + $ambigus = 0; + if ($tree["CLASS"] == "OPK" && $tree["NODETYPE"] == PHRASEA_OP_COLON) { +// $ambigus = $this->setTids($tree, $tree["RB"], $bid, $domthe, $searchsynonyms); + $ambigus = $this->setTids($tree, $bid, $domthe, $searchsynonyms); + } elseif ($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") { + $ambigus += $this->thesaurus2($tree["LB"], $bid, $name, $domthe, $searchsynonyms, $depth + 1); + $ambigus += $this->thesaurus2($tree["RB"], $bid, $name, $domthe, $searchsynonyms, $depth + 1); + } + + return($ambigus); + } + + public function propAsHTML(&$node, &$html, $path, $depth = 0) + { + global $parm; + if ($depth > 0) { + $tsy = array(); + $lngfound = "?"; + for ($n = $node->firstChild; $n; $n = $n->nextSibling) { + if ($n->nodeName == "sy") { + $lng = $n->getAttribute("lng"); + if ( ! array_key_exists($lng, $tsy)) + $tsy[$lng] = array(); + $zsy = array("v" => $n->getAttribute("v"), "w" => $n->getAttribute("w"), "k" => $n->getAttribute("k")); + + if ($lngfound == "?" || ($lng == $this->lng && $lngfound != $lng)) { + $lngfound = $lng; + $syfound = $zsy; + } else { + + } + $tsy[$lng][] = $zsy; + } + } + $alt = ""; + foreach ($tsy as $lng => $tsy2) { + foreach ($tsy2 as $sy) { + $alt .= $alt ? "\n" : ""; + $alt .= "" . $lng . ": " . p4string::MakeString($sy["v"], "js"); + } + } + + $this->proposals['QUERIES'][$syfound["w"]] = $syfound["w"]; + + $thtml = $syfound["v"]; + $kjs = $syfound["k"] ? ("'" . p4string::MakeString($syfound["k"], "js") . "'") : "null"; + $wjs = "'" . p4string::MakeString($syfound["w"], "js") . "'"; + + if ($node->getAttribute("term")) { + $thtml = "" . $thtml . ""; + $node->removeAttribute("term"); + } + + $tab = str_repeat("\t", $depth); + $html .= $tab . "
\n"; + $html .= $tab . "\t" . $thtml . "\n"; + } + + $tsort = array(); + for ($n = $node->firstChild; $n; $n = $n->nextSibling) { + if ($n->nodeType == XML_ELEMENT_NODE && $n->getAttribute("marked")) { // only 'te' marked + $lngfound = '?'; + $syfound = '?'; + for ($n2 = $n->firstChild; $n2; $n2 = $n2->nextSibling) { + if ($n2->nodeName == 'sy') { + $lng = $n2->getAttribute('lng'); + if ($lngfound == "?" || ($lng == $this->lng && $lngfound != $lng)) { + $lngfound = $lng; + $syfound = $n2->getAttribute('w'); + } + } + } + $n->removeAttribute("marked"); + for ($i = 0; array_key_exists($syfound . $i, $tsort) && $i < 9999; $i ++ ) + ; + $tsort[$syfound . $i] = $n; + } + } + ksort($tsort); + + foreach ($tsort as $n) { + $this->propAsHTML($n, $html, $path, $depth + 1); + } + + if ($depth > 0) + $html .= $tab . "
/**
 * Resolves the right-hand term of a thesaurus ("colon") node against a base
 * thesaurus and records every matching synonym through addtoTIDS().
 *
 * The node context selects the XPath predicate:
 *  - no CONTEXT key, or CONTEXT "*" : any <sy> whose @w equals the term;
 *  - empty CONTEXT                  : only <sy> without a @k attribute;
 *  - any other CONTEXT              : only <sy> whose @k equals the context.
 *
 * When at least one synonym matches, the proposals HTML of the term is rebuilt
 * with propAsHTML() and stored under the term path.
 *
 * @param array        $tree           colon node, by reference; $tree["RB"] is the searched term
 * @param mixed        $bid            base id
 * @param \DOMDocument $domthe         thesaurus document of the base
 * @param bool         $searchsynonyms not used by this method, kept for the callers
 *
 * @return int number of matches counted as ambiguous (several matches and no context given)
 */
public function setTids(&$tree, $bid, &$domthe, $searchsynonyms)
{
    if ($this->debug) {
        print("============================ setTids:\n\$tree=" . var_export($tree, true) . "\n");
    }

    // Quote a value as an XPath 1.0 string literal. XPath has no escape
    // character, so a value holding both quote kinds is assembled with concat().
    // Values without a double quote produce exactly the "..." form used before.
    $xpathLiteral = function ($value) {
        $value = (string) $value;
        if (strpos($value, '"') === false) {
            return '"' . $value . '"';
        }
        if (strpos($value, "'") === false) {
            return "'" . $value . "'";
        }

        return 'concat("' . implode('", \'"\', "', explode('"', $value)) . '")';
    };

    // the term is either a list of words or an already joined string
    $w = $tree["RB"]["VALUE"];
    if (is_array($w)) {
        $w = implode(" ", $w);
    }

    $x0 = "@w=" . $xpathLiteral($w);
    if (isset($tree["CONTEXT"])) {
        if ( ! $tree["CONTEXT"]) {
            $x0 .= " and not(@k)";
        } elseif ($tree["CONTEXT"] != "*") {
            $x0 .= " and @k=" . $xpathLiteral($tree["CONTEXT"]);
        }
    }

    $x = "/thesaurus//sy[" . $x0 . "]";

    if ($this->debug) {
        // the line break inside this literal is part of the message as found in the file
        printf("searching thesaurus with xpath='%s'
\n", $x);
    }

    // root-qualified like the other global classes referenced in this file
    $dxp = new \DOMXPath($domthe);
    $nodes = $dxp->query($x);

    if ( ! isset($tree["RB"]["SREF"]["TIDS"])) {
        $tree["RB"]["SREF"]["TIDS"] = array();
    }

    // query() yields false on a malformed expression: treat it as "term not in the thesaurus"
    if ($nodes === false || $nodes->length < 1) {
        return 0;
    }

    $ambigus = 0;
    $several = $nodes->length > 1;
    foreach ($nodes as $node) {
        // several candidates and no context to choose between them: ambiguous
        if ($several && ! isset($tree["CONTEXT"])) {
            $ambigus ++;
        }
        $this->addtoTIDS($tree["RB"], $bid, $node);
    }

    $path = $tree["RB"]["SREF"]["PATH"];
    $prophtml = "";
    $this->propAsHTML($domthe->documentElement, $prophtml, $path);
    $this->proposals["BASES"]["b$bid"]["TERMS"][$path]["HTML"] = $prophtml;

    return $ambigus;
}
\n", $x); + + $dxp = new DOMXPath($domthe); + $nodes = $dxp->query($x); + + if(!isset($tree["RB"]["SREF"]["TIDS"])) + $tree["RB"]["SREF"]["TIDS"] = array(); + if ($nodes->length >= 1) { + if ($nodes->length == 1) { + // on cherche un id simple, on utilisera la syntaxe sql 'like' (l'extension repérera elle méme la syntaxe car la value finira par '%') + $this->addtoTIDS($tree["RB"], $bid, $nodes->item(0)); + // $this->thesaurusDOMNodes[] = $nodes->item(0); + } else { + // on cherche plusieurs id's, on utilisera la syntaxe 'regexp' (l'extension repérera elle meme la syntaxe car la value finira par '$') + $val = ""; + foreach ($nodes as $node) { + if(!isset($tree["CONTEXT"])) + $ambigus++; + $this->addtoTIDS($tree["RB"], $bid, $node); + } + } + $path = $tree["RB"]["SREF"]["PATH"]; + $prophtml = ""; + $this->propAsHTML($domthe->documentElement, $prophtml, $path); + $this->proposals["TERMS"][$path]["HTML"] = $prophtml; + } else { + // le mot n'est pas dans le thesaurus + } + + return($ambigus); + } + */ + + public function containsColonOperator(&$tree) + { + if ( ! 
/**
 * Records a thesaurus synonym as a candidate term id ("tid") of a query term,
 * avoiding duplicates, and flags the thesaurus branch it belongs to.
 *
 * Side effects:
 *  - appends an entry to $extendednode["SREF"]["TIDS"] unless one with the same
 *    base id and the same parent id is already there;
 *  - creates $this->proposals["BASES"]["b<bid>"]["TERMS"][<path>] when missing;
 *  - sets term="1" on the parent of the synonym, marked="1" on the <te> children
 *    of that parent and on the parent and its ancestors, stopping at the first
 *    ancestor that has no id attribute.
 *
 * @param array       $extendednode query term node, by reference (reads SREF, VALUE, CONTEXT)
 * @param mixed       $bid          base id
 * @param \DOMElement $DOMnode      matched <sy> element of the thesaurus
 *
 * @return void
 */
public function addtoTIDS(&$extendednode, $bid, $DOMnode)
{
    $parent = $DOMnode->parentNode;

    $id = $DOMnode->getAttribute("id");
    $pid = $parent->getAttribute("id");
    $lng = $DOMnode->getAttribute("lng");
    $w = $DOMnode->getAttribute("w");
    $k = $DOMnode->getAttribute("k");
    // broader (parent) term of the matched one: useful to resolve ambiguities
    $p = $parent->getAttribute("v");

    $path = $extendednode["SREF"]["PATH"];
    if ($this->debug) {
        printf("found node id='%s', v='%s' w='%s', k='%s', p='%s' for node-path=%s \n", $id, $DOMnode->getAttribute("v"), $w, $k, $p, $path);
    }

    if ( ! $k) {
        $k = null;
    }

    // callers normally initialise the list; do not iterate an undefined entry otherwise
    if ( ! isset($extendednode["SREF"]["TIDS"])) {
        $extendednode["SREF"]["TIDS"] = array();
    }

    // same base and same parent term: already recorded
    $found = false;
    foreach ($extendednode["SREF"]["TIDS"] as $ztid) {
        if ($ztid["bid"] == $bid && $ztid["pid"] == $pid) {
            $found = true;
            break;
        }
    }
    if ( ! $found) {
        $extendednode["SREF"]["TIDS"][] = array("bid" => $bid, "pid" => $pid, "id" => $id, "w" => $w, "k" => $k, "lng" => $lng, "p" => $p);
    }

    // list the thesaurus proposals for this node (in the simple tree)
    if ( ! isset($this->proposals["BASES"]["b$bid"]["TERMS"][$path])) {
        // VALUE is a word list, or a single string once set_default() has joined it
        $value = $extendednode["VALUE"];
        $term = is_array($value) ? implode(" ", $value) : (string) $value;
        if (isset($extendednode["CONTEXT"]) && $extendednode["CONTEXT"]) {
            $term .= " (" . $extendednode["CONTEXT"] . ")";
        }
        $this->proposals["BASES"]["b$bid"]["TERMS"][$path] = array("TERM" => $term);
    }

    // flag the main term
    $parent->setAttribute("term", "1");

    // flag the direct children first; reminder: $DOMnode is a <sy>, its parent holds the <te> children
    for ($node = $parent->firstChild; $node; $node = $node->nextSibling) {
        if ($node->nodeName == "te") {
            $node->setAttribute("marked", "1");
        }
    }

    // then climb towards the root
    for ($node = $parent; $node && $node->nodeType == XML_ELEMENT_NODE && $node->parentNode; $node = $node->parentNode) {
        if ( ! $node->getAttribute("id")) {
            break; // went past the root of the thesaurus
        }
        $node->setAttribute("marked", "1");
    }
}
/**
 * Normalises a parsed tree in place.
 *
 *  - OPS nodes: both branches are normalised recursively.
 *  - OPK nodes: the left branch must be a term holding exactly one word (a
 *    field name) and the right branch must be a term; the words of the right
 *    branch are joined with spaces into a single string value. Branches of an
 *    OPK node are not recursed into.
 *  - SIMPLE / QSIMPLE nodes holding a word list: empty words are dropped and the
 *    remaining words are chained with the default operator
 *    ($this->quoted_defaultop for QSIMPLE, $this->defaultop otherwise). A term
 *    made only of empty words becomes null.
 *
 * @param array|null $tree   (sub)tree, by reference; may be replaced entirely
 * @param array      $emptyw empty words, looked up by key
 * @param int        $depth  recursion depth, stored as DEPTH of the operators created here
 *
 * @return bool false when an OPK node is malformed (a message is appended to $this->errmsg), true otherwise
 */
public function set_default(&$tree, &$emptyw, $depth = 0)
{
    if ( ! $tree) {
        return(true);
    }

    if ($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") {
        if ($tree["CLASS"] == "OPS") {
            if ( ! $this->set_default($tree["LB"], $emptyw, $depth + 1)) {
                return(false);
            }
            if ( ! $this->set_default($tree["RB"], $emptyw, $depth + 1)) {
                return(false);
            }
        } else { // OPK
            // jy 20041223: do not apply the default operator behind an arithmetic operator.
            // e.g. "d < 1/2/2003": group the list "1","2","2004" into one single "word"
            if ( ! $tree["LB"] || ($tree["LB"]["CLASS"] != "SIMPLE" && $tree["LB"]["CLASS"] != "QSIMPLE") || (is_array($tree["LB"]["VALUE"]) && count($tree["LB"]["VALUE"]) != 1)) {
                // an arithmetic operator must be preceded by exactly one field name
                if ($this->errmsg != "")
                    $this->errmsg .= sprintf("\\n");
                $this->errmsg .= sprintf(_('qparser::Formulation incorrecte, un nom de champs est attendu avant l operateur %s'), $tree["VALUE"]);

                return(false);
            }
            if ( ! $tree["RB"] || ($tree["RB"]["CLASS"] != "SIMPLE" && $tree["RB"]["CLASS"] != "QSIMPLE")) {
                // an arithmetic operator must be followed by a value
                if ($this->errmsg != "")
                    $this->errmsg .= sprintf("\\n");
                $this->errmsg .= sprintf(_('qparser::Formulation incorrecte, une valeur est attendue apres l operateur %s'), $tree["VALUE"]);

                return(false);
            }
            // join the words of the right-hand value into one space separated string
            if (is_array($tree["RB"]["VALUE"])) {
                $lw = "";
                foreach ($tree["RB"]["VALUE"] as $w)
                    $lw .= ( $lw == "" ? "" : " ") . $w;
                $tree["RB"]["VALUE"] = $lw;
            }
        }

        // Handling of null branches: an operator that lost one branch is replaced by
        // its other branch. To be reviewed: this raises no error, it silently fixes the tree.
        if ( ! isset($tree["RB"]))
            $tree = $tree["LB"];
        else
            if ( ! isset($tree["LB"]))
                $tree = $tree["RB"];
    } else {
        if (($tree["CLASS"] == "SIMPLE" || $tree["CLASS"] == "QSIMPLE")) {
            if (is_array($tree["VALUE"])) {
                $treetmp = null;
                // number of empty words skipped since the last word that was kept
                $pnum = 0;
                for ($i = 0; $i < count($tree["VALUE"]); $i ++ ) {
                    // empty word handling
                    if (isset($emptyw[$tree["VALUE"][$i]]) || $tree["VALUE"][$i] == "?" || $tree["VALUE"][$i] == "*") {
                        // isolated '?' or '*' are forced to behave like empty words
                        $pnum ++;
                    } else {
                        if ($treetmp == null) {
                            // first kept word: a plain term node
                            $treetmp = array("CLASS" => $tree["CLASS"],
                                "NODETYPE" => $tree["NODETYPE"],
                                "VALUE" => $tree["VALUE"][$i],
                                "PNUM" => $tree["PNUM"],
                                "DEPTH" => $tree["DEPTH"]);
                            $pnum = 0;
                        } else {
                            // following words: chain them on the right of what was built so far
                            $dop = $tree["CLASS"] == "QSIMPLE" ? $this->quoted_defaultop : $this->defaultop;
                            $treetmp = array("CLASS" => "OPS",
                                "VALUE" => $dop["VALUE"],
                                "NODETYPE" => $dop["NODETYPE"],
                                "PNUM" => $pnum, // may be overwritten by the default operator just below
                                "DEPTH" => $depth,
                                "LB" => $treetmp,
                                "RB" => array("CLASS" => $tree["CLASS"],
                                    "NODETYPE" => $tree["NODETYPE"],
                                    "VALUE" => $tree["VALUE"][$i],
                                    "PNUM" => $tree["PNUM"],
                                    "DEPTH" => $tree["DEPTH"])
                            );
                            if (array_key_exists("PNUM", $dop))
                                $treetmp["PNUM"] = $dop["PNUM"];
                            $pnum = 0;
                        }
                    }
                }
                $tree = $treetmp;
            }
        }
    }

    return(true);
}
/**
 * Walks the tree and, for every comparison (OPK) between two terms, looks the
 * left-hand field name up in the record structure to convert the right-hand
 * value according to the "type" attribute of that field.
 *
 * An "=" comparison is handed to changeNodeEquals(), any other comparison to
 * setNumValue2(). Both branches are then visited recursively.
 *
 * @param array             $tree        (sub)tree, by reference
 * @param \SimpleXMLElement $sxml_struct structure document; fields are read from /record/description
 * @param int               $depth       recursion depth
 *
 * @return void
 */
public function setNumValue(&$tree, \SimpleXMLElement $sxml_struct, $depth = 0)
{
    $termClasses = array("SIMPLE", "QSIMPLE");

    if ($tree["CLASS"] == "OPK"
        && isset($tree["RB"])
        && in_array($tree["RB"]["CLASS"], $termClasses)
        && in_array($tree["LB"]["CLASS"], $termClasses)
    ) {
        $descriptions = $sxml_struct->xpath('/record/description');
        if ($descriptions && is_array($descriptions)) {
            foreach ($descriptions[0] as $fieldName => $fieldNode) {
                // read again on every turn: changeNodeEquals() may have replaced $tree meanwhile
                $champ = is_array($tree["LB"]["VALUE"]) ? $tree["LB"]["VALUE"][0] : $tree["LB"]["VALUE"];

                if ( ! $champ || strtoupper($fieldName) != strtoupper($champ)) {
                    continue;
                }

                foreach ($fieldNode->attributes() as $attrName => $attrValue) {
                    if (strtoupper($attrName) != strtoupper("type")) {
                        continue;
                    }
                    if ($tree["NODETYPE"] == PHRASEA_OP_EQUAL) {
                        // special case of "=" applied to a date
                        $this->changeNodeEquals($tree, $attrValue);
                    } else {
                        $this->setNumValue2($tree["RB"], $attrValue);
                    }
                }
            }
        }
    }

    // left branch first, then right branch
    foreach (array("LB", "RB") as $side) {
        if (isset($tree[$side])) {
            $this->setNumValue($tree[$side], $sxml_struct, $depth + 1);
        }
    }
}
/**
 * Converts a loosely written date into a 14 character "YYYYMMDDHHMMSS" stamp
 * suitable for string comparison.
 *
 * Accepted inputs (as a string, or as an array of already split parts):
 *  - three parts: "YYYY MM DD", "DD MM YYYY" or "DD MM YY";
 *  - two parts  : "YYYY MM", "MM YYYY" or "MM YY";
 *  - one part   : "YYYYMMDDHHMMSS", "YYYYMMDD", "YYYYMM", "YYYY" or "YY".
 * In a string, parts may be separated by spaces, slashes or dashes.
 *
 * Components missing from the input are filled with "00", or with "99" when
 * $max is true, so that the result is the lower / upper bound of the period.
 * Two digit years below 20 are read as 20YY, the others as 19YY. Anything that
 * matches none of the shapes above gives 1900-01-01 with the filler as time.
 *
 * Day, month and short year are left-padded with zeros so that the stamp keeps
 * a fixed width ("1 2 2003" gives "20030201000000").
 *
 * @param string|array $onedate date to convert
 * @param bool         $max     true to build the upper bound of the period
 *
 * @return string
 */
public function isoDate($onedate, $max = false)
{
    // deliberately out of calendar range: these are comparison bounds, not real dates
    $filler = $max ? "99" : "00";

    $year = "1900";
    $month = "01";
    $day = "01";
    $hour = $minute = $second = $filler;

    $pad2 = function ($part) {
        return str_pad((string) $part, 2, "0", STR_PAD_LEFT);
    };
    $expandYear = function ($shortYear) use ($pad2) {
        return ($shortYear < 20 ? "20" : "19") . $pad2($shortYear);
    };

    if (is_array($onedate)) {
        $parts = array_values($onedate);
    } else {
        $parts = preg_split('/[\s\/-]+/', (string) $onedate, -1, PREG_SPLIT_NO_EMPTY);
    }

    switch (count($parts)) {
        case 3:
            if (strlen($parts[0]) == 4) {
                // year first: YYYY MM DD
                list($year, $month, $day) = $parts;
            } elseif (strlen($parts[2]) == 4) {
                // year last: DD MM YYYY
                list($day, $month, $year) = $parts;
            } else {
                // short year: DD MM YY
                list($day, $month) = $parts;
                $year = $expandYear($parts[2]);
            }
            $month = $pad2($month);
            $day = $pad2($day);
            break;

        case 2:
            // month and year only
            if (strlen($parts[0]) == 4) {
                list($year, $month) = $parts;
            } elseif (strlen($parts[1]) == 4) {
                list($month, $year) = $parts;
            } else {
                $month = $parts[0];
                $year = $expandYear($parts[1]);
            }
            $month = $pad2($month);
            $day = $filler;
            break;

        case 1:
            // a single run of digits: its length tells which components are present
            $digits = (string) $parts[0];
            switch (strlen($digits)) {
                case 14:
                    $year = substr($digits, 0, 4);
                    $month = substr($digits, 4, 2);
                    $day = substr($digits, 6, 2);
                    $hour = substr($digits, 8, 2);
                    $minute = substr($digits, 10, 2);
                    $second = substr($digits, 12, 2);
                    break;
                case 8:
                    $year = substr($digits, 0, 4);
                    $month = substr($digits, 4, 2);
                    $day = substr($digits, 6, 2);
                    break;
                case 6:
                    $year = substr($digits, 0, 4);
                    $month = substr($digits, 4, 2);
                    $day = $filler;
                    break;
                case 4:
                    $year = $digits;
                    $month = $day = $filler;
                    break;
                case 2:
                    $year = $expandYear($digits);
                    $month = $day = $filler;
                    break;
            }
            break;
    }

    return "" . $year . $month . $day . $hour . $minute . $second;
}
/**
 * Flattens a parsed tree into the nested array form: the node type first,
 * followed by the node payload.
 *
 *  - PHRASEA_KW_LAST : [type] or [type, PNUM] when PNUM is not null
 *  - PHRASEA_KW_ALL  : [type]
 *  - SIMPLE / QSIMPLE: [type, word, word, ...]
 *  - OPK             : [type, <left>, <right>]
 *  - OPS             : [type, (int) PNUM when not null, <left>, <right>]
 * Null branches are left out; any other class gives [type] alone.
 *
 * @param array $tree (sub)tree to convert
 *
 * @return array
 */
public function makequery($tree)
{
    $query = array($tree["NODETYPE"]);
    $class = $tree["CLASS"];

    if ($class == "PHRASEA_KW_LAST") {
        if ($tree["PNUM"] !== null) {
            $query[] = $tree["PNUM"];
        }

        return $query;
    }

    if ($class == "PHRASEA_KW_ALL") {
        return $query;
    }

    if ($class == "SIMPLE" || $class == "QSIMPLE") {
        // no tid here, this is a regular term
        $words = is_array($tree["VALUE"]) ? $tree["VALUE"] : array($tree["VALUE"]);
        foreach ($words as $word) {
            $query[] = $word;
        }

        return $query;
    }

    if ($class == "OPK" || $class == "OPS") {
        // only OPS operators carry their numeric parameter into the query
        if ($class != "OPK" && $tree["PNUM"] !== null) {
            $query[] = intval($tree["PNUM"]);
        }
        foreach (array("LB", "RB") as $side) {
            if ($tree[$side] !== null) {
                $query[] = $this->makequery($tree[$side]);
            }
        }
    }

    return $query;
}
$tree) { + return(null); + } + } else { // '(' : appel récursif + if ( ! $tree) + $tree = $this->maketree($depth + 1); + else { + if (($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") && $tree["RB"] == null) { + $tree["RB"] = $this->maketree($depth + 1); + if ( ! $tree["RB"]) + $tree = null; + } else { + // ici on applique l'opérateur par défaut + $tree = array("CLASS" => "OPS", + "VALUE" => $this->defaultop["VALUE"], + "NODETYPE" => $this->defaultop["NODETYPE"], + "PNUM" => $this->defaultop["PNUM"], + "DEPTH" => $depth, + "LB" => $tree, + "RB" => $this->maketree($depth + 1)); + } + } + if ( ! $tree) { + return(null); + } + } + break; + case "TOK_VOID": + // ce token est entre guillemets : on le saute + break; + case "TOK_QUOTE": + // une expr entre guillemets est 'comme entre parenthéses', + // sinon "a b" OU "x y" -> (((a B0 b) OU x) B0 y) au lieu de + // "a b" OU "x y" -> ((a B0 b) OU (x B0 y)) + if ($inquote) { + if ($this->debug) { + print("CLOSING QUOTE!\n"); + } + // fermeture des guillemets -> retour de récursivité + if ($depth <= 0) { // ')' : retour de récursivité + print("\nguillemets fermants en trop
"); + + return(null); + } + + return($tree); + } else { + if ($this->debug) { + print("OPENING QUOTE!
/**
 * Adds one token to the tree being built and returns the new tree.
 *
 *  - TOK_CONTEXT "[xxx]": becomes the CONTEXT of the current term (or of the
 *    term in the right branch of the current operator), or the PNUM of an
 *    operator that has no right branch yet.
 *  - TOK_CMP: creates an OPK node with the current tree as left branch.
 *  - TOK_WORD: an operator word (outside quotes) creates an OPS node; any other
 *    word is handed to addsimple(), as a special keyword when it is listed in
 *    $this->spw and not quoted, as a SIMPLE / QSIMPLE term otherwise.
 * Operators and keywords flagged CANNUM read one more token; it is used as
 * PNUM when numeric and pushed back with ungettoken() otherwise.
 *
 * Tokens of any other class match no case: the method then ends without a
 * return statement.
 *
 * @param array|null $tree    tree built so far
 * @param array      $t       token, with CLASS and VALUE keys
 * @param int        $depth   current nesting depth, stored as DEPTH of the created nodes
 * @param bool       $inquote true while parsing inside quotes
 *
 * @return array|null new tree, or null on error (a message is appended to $this->errmsg)
 */
public function addtotree($tree, $t, $depth, $inquote)
{
    if ($this->debug) {
        printf("addtotree({tree}, \$t[CLASS]='%s', \$t[VALUE]='%s', \$depth=%d, inquote=%s)\n", $t["CLASS"], $t["VALUE"], $depth, $inquote ? "true" : "false");
        print("---- avant addtotree ----\n");
        var_dump($tree);
        print("-------------------------\n");
    }

    if ( ! $t) {
        return($tree);
    }

    switch ($t["CLASS"]) {
        case "TOK_CONTEXT":
            if ($tree["CLASS"] == "SIMPLE" || $tree["CLASS"] == "QSIMPLE") {
                // a [xxx] following a term introduces a context
                $tree["CONTEXT"] = $t["VALUE"];
            } elseif ($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") {
                if ( ! isset($tree["RB"]) || ! $tree["RB"]) {
                    // a [xxx] may follow an operator: it is a (normally numeric) parameter
                    $tree["PNUM"] = $t["VALUE"];
                } else {
                    // [xxx] follows a term already sitting in the right branch (e.g. a ou b[k])
                    if ($tree["RB"]["CLASS"] == "SIMPLE" || $tree["RB"]["CLASS"] == "QSIMPLE")
                        $tree["RB"]["CONTEXT"] = $t["VALUE"];
                    else {
                        if ($this->errmsg != "")
                            $this->errmsg .= "\\n";
                        // the line break that ends this literal is part of the message as found in the file
                        $this->errmsg .= sprintf("le contexte [%s] ne peut suivre qu'un terme ou un opérateur
", $t["VALUE"]);

                        return(null);
                    }
                }
            } else {
                if ($this->errmsg != "")
                    $this->errmsg .= "\\n";
                // the line break that ends this literal is part of the message as found in the file
                $this->errmsg .= sprintf("le contexte [%s] ne peut suivre qu'un terme ou un opérateur
", $t["VALUE"]);

                return(null);
            }

            return($tree);
            break;
        case "TOK_CMP":
            // < > <= >= <> = are comparison operators
            if ( ! $tree) {
                // a query cannot start with a comparison operator
                if ($this->errmsg != "")
                    $this->errmsg .= "\\n";
                $this->errmsg .= sprintf(_('qparser::erreur : une question ne peut commencer par %s'), $t["VALUE"]);

                return(null);
            }
            if (($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") && $tree["RB"] == null) {
                // a comparison operator cannot follow another operator
                if ($this->errmsg != "")
                    $this->errmsg .= "\\n";
                $this->errmsg .= sprintf(_('qparser::Formulation incorrecte, ne peut suivre un operateur : %s'), $t["VALUE"]);

                return(null);
            }

            return(array("CLASS" => "OPK", "VALUE" => $t["VALUE"], "NODETYPE" => $this->opk[$t["VALUE"]]["NODETYPE"], "PNUM" => null, "DEPTH" => $depth, "LB" => $tree, "RB" => null));
            break;
        case "TOK_WORD":
            if ($t["CLASS"] == "TOK_WORD" && isset($this->ops[$t["VALUE"]]) && ! $inquote) {
                // this word is a phrasea operator
                if ( ! $tree) {
                    // a query cannot start with an operator
                    if ($this->errmsg != "")
                        $this->errmsg .= "\\n";
                    $this->errmsg .= sprintf(_('qparser::erreur : une question ne peut commencer par %s'), $t["VALUE"]);

                    return(null);
                }
                if (($tree["CLASS"] == "OPS" || $tree["CLASS"] == "OPK") && $tree["RB"] == null) {
                    // an operator cannot follow another operator
                    if ($this->errmsg != "")
                        $this->errmsg .= "\\n";
                    $this->errmsg .= sprintf(_('qparser::Formulation incorrecte, %s ne peut suivre un operateur'), $t["VALUE"]);

                    return(null);
                }
                $pnum = null;
                if ($this->ops[$t["VALUE"]]["CANNUM"]) {
                    // this operator may be followed by a number ('near', 'before', 'after')
                    if ($tn = $this->nexttoken()) {
                        if ($tn["CLASS"] == "TOK_WORD" && is_numeric($tn["VALUE"]))
                            $pnum = (int) $tn["VALUE"];
                        else
                            $this->ungettoken($tn["VALUE"]);
                    }
                }

                return(array("CLASS" => "OPS", "VALUE" => $t["VALUE"], "NODETYPE" => $this->ops[$t["VALUE"]]["NODETYPE"], "PNUM" => $pnum, "DEPTH" => $depth, "LB" => $tree, "RB" => null));
            } else {
                // this word is not an operator
                $pnum = null;
                $nodetype = PHRASEA_KEYLIST;
                if ($t["CLASS"] == "TOK_WORD" && isset($this->spw[$t["VALUE"]]) && ! $inquote) {
                    // but it is a 'special' phrasea word ('last', 'all')
                    $type = $this->spw[$t["VALUE"]]["CLASS"];
                    $nodetype = $this->spw[$t["VALUE"]]["NODETYPE"];
                    if ($this->spw[$t["VALUE"]]["CANNUM"]) {
                        // 'last' may be followed by a number
                        if ($tn = $this->nexttoken()) {
                            if ($tn["CLASS"] == "TOK_WORD" && is_numeric($tn["VALUE"]))
                                $pnum = (int) $tn["VALUE"];
                            else
                                $this->ungettoken($tn["VALUE"]);
                        }
                    }
                } else {
                    // plain word: quoted or not
                    $type = $inquote ? "QSIMPLE" : "SIMPLE";
                }

                return($this->addsimple($t, $type, $nodetype, $pnum, $tree, $depth));
            }
            break;
    }
}
" . $registry->get('GV_min_letters_truncation'); + + return(null); + } + // $nok = 0; + } else + $nok ++; + } + } + if ( ! $tree) { + return(array("CLASS" => $type, "NODETYPE" => $nodetype, "VALUE" => array($t["VALUE"]), "PNUM" => $pnum, "DEPTH" => $depth)); + } + switch ($tree["CLASS"]) { + case "SIMPLE": + case "QSIMPLE": + if ($type == "SIMPLE" || $type == "QSIMPLE") + $tree["VALUE"][] = $t["VALUE"]; + else { + $tree = array("CLASS" => "OPS", + "VALUE" => "et", + "NODETYPE" => PHRASEA_OP_AND, + "PNUM" => null, + "DEPTH" => $depth, + "LB" => $tree, + "RB" => array("CLASS" => $type, + "NODETYPE" => $nodetype, + "VALUE" => array($t["VALUE"]), + "PNUM" => $pnum, + "DEPTH" => $depth)); + } + + return($tree); + case "OPS": + case "OPK": + if ($tree["RB"] == null) { + $tree["RB"] = array("CLASS" => $type, "NODETYPE" => $nodetype, "VALUE" => array($t["VALUE"]), "PNUM" => $pnum, "DEPTH" => $depth); + + return($tree); + } else { + if (($tree["RB"]["CLASS"] == "SIMPLE" || $tree["RB"]["CLASS"] == "QSIMPLE") && $tree["RB"]["DEPTH"] == $depth) { + $tree["RB"]["VALUE"][] = $t["VALUE"]; + + return($tree); + } + if (($tree["RB"]["CLASS"] == "PHRASEA_KW_LAST" || $tree["RB"]["CLASS"] == "PHRASEA_KW_ALL") && $tree["RB"]["DEPTH"] == $depth) { + $tree["RB"] = array("CLASS" => "OPS", + "VALUE" => "et", + "NODETYPE" => PHRASEA_OP_AND, + "PNUM" => null, + "DEPTH" => $depth, + "LB" => $tree["RB"], + "RB" => array("CLASS" => $type, + "NODETYPE" => $nodetype, + "VALUE" => array($t["VALUE"]), + "PNUM" => $pnum, + "DEPTH" => $depth)); + + return($tree); + } + + return(array("CLASS" => "OPS", + "VALUE" => $this->defaultop["VALUE"], + "NODETYPE" => $this->defaultop["NODETYPE"], + "PNUM" => $this->defaultop["PNUM"], + "DEPTH" => $depth, + "LB" => $tree, + "RB" => array("CLASS" => $type, "NODETYPE" => $nodetype, "VALUE" => array($t["VALUE"]), "PNUM" => $pnum, "DEPTH" => $depth) + )); + } + case "PHRASEA_KW_LAST": + case "PHRASEA_KW_ALL": + return(array("CLASS" => "OPS", + "VALUE" => "et", + 
"NODETYPE" => PHRASEA_OP_AND, + "PNUM" => null, + "DEPTH" => $depth, + "LB" => $tree, + "RB" => array("CLASS" => $type, + "NODETYPE" => $nodetype, + "VALUE" => array($t["VALUE"]), + "PNUM" => $pnum, + "DEPTH" => $depth))); + } + } + + public function ungettoken($s) + { + $this->phq = $s . " " . $this->phq; + } + + public function nexttoken($inquote = false) + { + if ($this->phq == "") { + return(null); + } + + switch ($c = substr($this->phq, 0, 1)) { + case "<": + case ">": + if ($inquote) { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_VOID", "VALUE" => $c)); + } + $c2 = $c . substr($this->phq, 1, 1); + if ($c2 == "<=" || $c2 == ">=" || $c2 == "<>") { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 2, 99999, 'UTF-8'), 'UTF-8'); + $c = $c2; + } else { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + } + + return(array("CLASS" => "TOK_CMP", "VALUE" => $c)); + break; + case "=": + if ($inquote) { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_VOID", "VALUE" => $c)); + } + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_CMP", "VALUE" => "=")); + break; + case ":": + if ($inquote) { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_VOID", "VALUE" => $c)); + } + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_CMP", "VALUE" => ":")); + break; + case "(": + if ($inquote) { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_VOID", "VALUE" => $c)); + } + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_LP", "VALUE" => "(")); + break; + case ")": + if ($inquote) { + $this->phq 
= $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_VOID", "VALUE" => $c)); + } + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_RP", "VALUE" => ")")); + break; + case "[": + // if($inquote) + // { + // $this->phq = ltrim(substr($this->phq, 1)); + // return(array("CLASS"=>"TOK_VOID", "VALUE"=>$c)); + // } + // un '[' introduit un contexte qu'on lit jusqu'au ']' + $closeb = mb_strpos($this->phq, "]", 1, 'UTF-8'); + if ($closeb !== false) { + $context = $this->mb_trim(mb_substr($this->phq, 1, $closeb - 1, 'UTF-8'), 'UTF-8'); + $this->phq = $this->mb_ltrim(mb_substr($this->phq, $closeb + 1, 99999, 'UTF-8'), 'UTF-8'); + } else { + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + $this->phq = ""; + } + $context = $this->unicode->remove_indexer_chars($context); + + return(array("CLASS" => "TOK_CONTEXT", "VALUE" => $context)); + break; + /* + case "]": + // if($inquote) + // { + // $this->phq = ltrim(substr($this->phq, 1)); + // return(array("CLASS"=>"TOK_VOID", "VALUE"=>$c)); + // } + $this->phq = ltrim(substr($this->phq, 1)); + + return(array("CLASS"=>"TOK_RB", "VALUE"=>"]")); + break; + */ + case "\"": + $this->phq = $this->mb_ltrim(mb_substr($this->phq, 1, 99999, 'UTF-8'), 'UTF-8'); + + return(array("CLASS" => "TOK_QUOTE", "VALUE" => "\"")); + break; + default: + $l = mb_strlen($this->phq, 'UTF-8'); + $t = ""; + $c_utf8 = ""; + for ($i = 0; $i < $l; $i ++ ) { + if ( ! $this->unicode->has_indexer_bad_char(($c_utf8 = mb_substr($this->phq, $i, 1, 'UTF-8')))) { + // $c = mb_strtolower($c); + // $t .= isset($this->noaccent[$c]) ? 
$this->noaccent[$c] : $c; + $t .= $this->unicode->remove_diacritics(mb_strtolower($c_utf8)); + } else + break; + } +// if ($c_utf8 == "(" || $c_utf8 == ")" || $c_utf8 == "[" || $c_utf8 == "]" || $c_utf8 == "=" || $c_utf8 == ":" || $c_utf8 == "<" || $c_utf8 == ">" || $c_utf8 == "\"") + if (in_array($c_utf8, array("(", ")", "[", "]", "=", ":", "<", ">", "\""))) { + // ces caractéres sont des délimiteurs avec un sens, il faut les garder + $this->phq = $this->mb_ltrim(mb_substr($this->phq, $i, 99999, 'UTF-8'), 'UTF-8'); + } else { + // le délimiteur était une simple ponctuation, on le saute + $this->phq = $this->mb_ltrim(mb_substr($this->phq, $i + 1, 99999, 'UTF-8'), 'UTF-8'); + } + if ($t != "") { + return(array("CLASS" => "TOK_WORD", "VALUE" => $t)); + } else { + return(array("CLASS" => "TOK_VOID", "VALUE" => $t)); + } + break; + } + } +} + diff --git a/lib/Alchemy/Phrasea/SearchEngine/SearchEngineInterface.php b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineInterface.php new file mode 100644 index 0000000000..085b8e66b2 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineInterface.php @@ -0,0 +1,155 @@ +i18n = $locale; + } + + /** + * + * @return string + */ + public function getLocale() + { + return $this->i18n; + } + + /** + * + * @param const $sort_by + * @param const $sort_ord + * @return searchEngine_options + */ + public function setSort($sort_by, $sort_ord = self::SORT_MODE_DESC) + { + $this->sort_by = $sort_by; + $this->sort_ord = $sort_ord; + + return $this; + } + + public function allowBusinessFieldsOn(Array $collection) + { + $this->business_fields = $collection; + + return $this; + } + + public function disallowBusinessFields() + { + $this->business_fields = array(); + + return $this; + } + + public function businessFieldsOn() + { + return $this->business_fields; + } + + /** + * + * @return string + */ + public function sortBy() + { + return $this->sort_by; + } + + /** + * + * @return string + */ + public function sortOrder() + { + return 
$this->sort_ord; + } + + /** + * + * @param boolean $boolean + * @return searchEngine_options + */ + public function useStemming($boolean) + { + $this->stemming = ! ! $boolean; + + return $this; + } + + /** + * + * @return boolean + */ + public function stemmed() + { + return $this->stemming; + } + + /** + * + * @param int $search_type + * @return searchEngine_options + */ + public function setSearchType($search_type) + { + switch ($search_type) { + case self::RECORD_RECORD: + default: + $this->search_type = self::RECORD_RECORD; + break; + case self::RECORD_GROUPING: + $this->search_type = self::RECORD_GROUPING; + break; + } + + return $this; + } + + /** + * + * @return int + */ + public function searchType() + { + return $this->search_type; + } + + public function onCollections(Array $collections) + { + $this->collections = $collections; + + return $this; + } + + /** + * + * @return array + */ + public function collections() + { + return $this->collections; + } + + public function databoxes() + { + $databoxes = array(); + + foreach ($this->collections as $collection) { + $databoxes[$collection->get_databox()->get_sbas_id()] = $collection->get_databox(); + } + + return array_values($databoxes); + } + + /** + * + * @param array $fields An array of Databox fields + */ + public function setFields(Array $fields) + { + $this->fields = $fields; + + return $this; + } + + /** + * + * @return array + */ + public function fields() + { + return $this->fields; + } + + /** + * + * @param array $status + * @return searchEngine_options + */ + public function setStatus(Array $status) + { + $tmp = array(); + foreach ($status as $n => $options) { + if (count($options) > 1) + continue; + if (isset($options['on'])) { + foreach ($options['on'] as $sbas_id) + $tmp[$n][$sbas_id] = 1; + } + if (isset($options['off'])) { + foreach ($options['off'] as $sbas_id) + $tmp[$n][$sbas_id] = 0; + } + } + + $this->status = $tmp; + + return $this; + } + + /** + * + * @return array + */ + public 
function getStatus() + { + return $this->status; + } + + /** + * + * @param string $record_type + * @return searchEngine_options + */ + public function setRecordType($record_type) + { + switch ($record_type) { + case self::TYPE_ALL: + default: + $this->record_type = self::TYPE_ALL; + break; + case self::TYPE_AUDIO: + $this->record_type = self::TYPE_AUDIO; + break; + case self::TYPE_VIDEO: + $this->record_type = self::TYPE_VIDEO; + break; + case self::TYPE_DOCUMENT: + $this->record_type = self::TYPE_DOCUMENT; + break; + case self::TYPE_FLASH: + $this->record_type = self::TYPE_FLASH; + break; + case self::TYPE_IMAGE: + $this->record_type = self::TYPE_IMAGE; + break; + } + + return $this; + } + + /** + * + * @return string + */ + public function getRecordType() + { + return $this->record_type; + } + + /** + * + * @param string $min_date + * @return searchEngine_options + */ + public function setMinDate($min_date) + { + if ( ! is_null($min_date) && trim($min_date) !== '') { + $this->date_min = DateTime::createFromFormat('Y/m/d H:i:s', $min_date . ' 00:00:00'); + } + + return $this; + } + + /** + * + * @return DateTime + */ + public function getMinDate() + { + return $this->date_min; + } + + /** + * + * @param string $max_date + * @return searchEngine_options + */ + public function setMaxDate($max_date) + { + if ( ! is_null($max_date) && trim($max_date) !== '') { + $this->date_max = DateTime::createFromFormat('Y/m/d H:i:s', $max_date . 
' 23:59:59'); + } + + return $this; + } + + /** + * + * @return DateTime + */ + public function getMaxDate() + { + return $this->date_max; + } + + /** + * + * @param array $fields + * @return searchEngine_options + */ + public function setDateFields(Array $fields) + { + $this->date_fields = $fields; + + return $this; + } + + /** + * + * @return array + */ + public function getDateFields() + { + return $this->date_fields; + } + + /** + * + * @return string + */ + public function serialize() + { + $ret = array(); + foreach ($this as $key => $value) { + if ($value instanceof DateTime) { + $value = $value->format('d-m-Y h:i:s'); + } + if (in_array($key, array('collections', 'business_fields'))) { + $value = array_map(function($collection) { + return $collection->get_base_id(); + }, $value); + } + + $ret[$key] = $value; + } + + return \p4string::jsonencode($ret); + } + + /** + * + * @param string $serialized + * @return searchEngine_options + */ + public function unserialize($serialized) + { + $serialized = json_decode($serialized); + + foreach ($serialized as $key => $value) { + if (is_null($value)) { + $value = null; + } elseif (in_array($key, array('date_min', 'date_max'))) { + $value = new DateTime($value); + } elseif ($value instanceof stdClass) { + $tmpvalue = (array) $value; + $value = array(); + + foreach ($tmpvalue as $k => $data) { + $k = ctype_digit($k) ? 
(int) $k : $k; + $value[$k] = $data; + } + } elseif (in_array($key, array('collections', 'business_fields'))) { + $value = array_map(function($base_id) { + return \collection::get_from_base_id($base_id); + }, $value); + } + + $this->$key = $value; + } + + return $this; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/SearchEngineResult.php b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineResult.php new file mode 100644 index 0000000000..f370ce5443 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineResult.php @@ -0,0 +1,108 @@ +results = $results; + $this->query = $query; + $this->duration = (float) $duration; + $this->offsetStart = (int) $offsetStart; + $this->available = (int)$available; + $this->total = (int)$total; + $this->error = $error; + $this->warning = $warning; + $this->suggestions = $suggestions; + $this->propositions = $propositions; + $this->indexes = $indexes; + + return $this; + } + + public function results() + { + return $this->results; + } + + + public function query() + { + return $this->query; + } + + public function duration() + { + return $this->duration; + } + + public function totalPages($amountPerPage) + { + return ceil($this->available / $amountPerPage); + } + + public function currentPage($amountPerPage) + { + return ceil($this->offsetStart / $amountPerPage); + } + + public function available() + { + return $this->available; + } + + public function total() + { + return $this->total; + } + + public function error() + { + return $this->error; + } + + public function warning() + { + return $this->warning; + } + + public function suggestions() + { + return $this->suggestions; + } + + public function proposals() + { + return $this->propositions; + } + + public function indexes() + { + return $this->indexes; + } +} + diff --git a/lib/Alchemy/Phrasea/SearchEngine/SearchEngineSuggestion.php b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineSuggestion.php new file mode 100644 index 0000000000..3d6e282a1d --- /dev/null +++ 
b/lib/Alchemy/Phrasea/SearchEngine/SearchEngineSuggestion.php @@ -0,0 +1,41 @@ +query = $query; + $this->suggestion = $suggestion; + $this->hits = (int) $hits; + } + + public function query() + { + return $this->query; + } + + public function suggestion() + { + return $this->suggestion; + } + + public function hits() + { + return $this->hits; + } +} diff --git a/lib/Alchemy/Phrasea/SearchEngine/SphinxSearch.php b/lib/Alchemy/Phrasea/SearchEngine/SphinxSearch.php new file mode 100644 index 0000000000..c9d42bc8c6 --- /dev/null +++ b/lib/Alchemy/Phrasea/SearchEngine/SphinxSearch.php @@ -0,0 +1,689 @@ +options = new SearchEngineOptions(); + + $this->sphinx = new \SphinxClient(); + + $this->sphinx->SetServer($host, $port); + $this->sphinx->SetArrayResult(true); + $this->sphinx->SetConnectTimeout(1); + + try { + $this->rt_conn = @new \PDO(sprintf('mysql:host=%s;port=%s;', $rt_host, $rt_port)); + } catch (\PDOException $e) { + $this->rt_conn = null; + } + + return $this; + } + + public function status() + { + $status = $this->sphinx->Status(); + + if (false === $status) { + throw new Exception(_('Sphinx server is offline')); + } + + if (null === $this->rt_conn) { + throw new RuntimeException('Unable to connect to sphinx rt'); + } + + return $status; + } + + public function availableTypes() + { + return array(self::GEM_TYPE_RECORD, self::GEM_TYPE_STORY); + } + + public function addRecord(\record_adapter $record) + { + $all_datas = array(); + + foreach ($record->get_caption()->get_fields(null, true) as $field) { + if ( ! $field->is_indexable()) { + continue; + } + + $all_datas[] = $field->get_serialized_values(); + + foreach ($field->get_values() as $value) { + + $this->rt_conn->exec("REPLACE INTO " + . "metas_realtime" . $this->CRCdatabox($record->get_databox()) . " VALUES ( + '" . $value->getId() . "' + ,'" . str_replace("'", "\'", $value->getValue()) . "' + ,'" . $value->getDatabox_field()->get_id() . "' + ," . $record->get_record_id() . " + ," . $record->get_sbas_id() . 
" + ," . $record->get_collection()->get_coll_id() . " + ," . (int) $record->is_grouping() . " + ," . crc32($record->get_sbas_id() . '_' . $value->getDatabox_field()->get_id()) . " + ," . crc32($record->get_sbas_id() . '_' . $record->get_collection()->get_coll_id()) . " + ," . crc32($record->get_sbas_id() . '_' . $record->get_record_id()) . " + ," . crc32($record->get_type()) . " + ,0 + ," . (int) $value->getDatabox_field()->isBusiness() . " + ," . crc32($record->get_collection()->get_coll_id() . '_' . (int) $value->getDatabox_field()->isBusiness()) . " + ," . $record->get_creation_date()->format('U') . " )"); + } + } + + $this->rt_conn->exec("REPLACE INTO " + . "docs_realtime" . $this->CRCdatabox($record->get_databox()) . " VALUES ( + '" . $record->get_record_id() . "' + ,'" . str_replace("'", "\'", implode(' ', $all_datas)) . "' + ," . $record->get_record_id() . " + ," . $record->get_sbas_id() . " + ," . $record->get_collection()->get_coll_id() . " + ," . (int) $record->is_grouping() . " + ," . crc32($record->get_sbas_id() . '_' . $record->get_collection()->get_coll_id()) . " + ," . crc32($record->get_sbas_id() . '_' . $record->get_record_id()) . " + ," . crc32($record->get_type()) . " + ,0 + ," . $record->get_creation_date()->format('U') . " )"); + } + + public function removeRecord(\record_adapter $record) + { + $CRCdatabox = $this->CRCdatabox($record->get_databox()); + $indexes = array( + "metadatas" . $CRCdatabox, + "metadatas" . $CRCdatabox . "_stemmed_en", + "metadatas" . $CRCdatabox . "_stemmed_fr", + ); + + foreach ($record->get_caption()->get_fields(null, true) as $field) { + + foreach ($field->get_values() as $value) { + + foreach ($indexes as $index) { + $this->sphinx->UpdateAttributes($index, array("deleted"), array($value->getId() => array(1))); + } + + $this->rt_conn->exec("DELETE FROM metas_realtime" . $CRCdatabox . " WHERE id = " . $value->getId()); + } + } + + $indexes = array( + "documents" . $CRCdatabox, + "documents" . $CRCdatabox . 
"_stemmed_fr", + "documents" . $CRCdatabox . "_stemmed_en" + ); + + foreach ($indexes as $index) { + $this->sphinx->UpdateAttributes($index, array("deleted"), array($record->get_record_id() => array(1))); + } + + $this->rt_conn->exec("DELETE FROM docs_realtime" . $CRCdatabox . " WHERE id = " . $record->get_record_id()); + } + + public function updateRecord(\record_adapter $record) + { + $this->removeRecord($record); + $this->addRecord($record); + } + + public function addStory(\record_adapter $record) + { + return $this->addRecord($record); + } + + public function removeStory(\record_adapter $record) + { + return $this->removeRecord($record); + } + + public function updateStory(\record_adapter $record) + { + return $this->updateRecord($record); + } + + public function addFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Sphinx Search Engine'); + } + + public function removeFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Sphinx Search Engine'); + } + + public function updateFeedEntry(\Feed_Entry_Adapter $entry) + { + throw new RuntimeException('Feed Entry indexing not supported by Sphinx Search Engine'); + } + + public function setOptions(SearchEngineOptions $options) + { + $this->options = $options; + $this->applyOptions($options); + } + + public function resetOptions() + { + $this->options = new SearchEngineOptions(); + $this->resetSphinx(); + } + + private function resetSphinx() + { + $this->sphinx->ResetGroupBy(); + $this->sphinx->ResetFilters(); + $this->sphinx->ResetOverrides(); + } + + public function query($query, $offset, $perPage) + { + assert(is_int($offset)); + assert($offset >= 0); + assert(is_int($perPage)); + + $query = $this->parseQuery($query); + + $preg = preg_match('/\s?recordid\s?=\s?([0-9]+)/i', $query, $matches, 0, 0); + + if ($preg > 0) { + $this->sphinx->SetFilter('record_id', array($matches[1])); + $query = ''; + } + + 
$this->sphinx->SetLimits($offset, $perPage); + $this->sphinx->SetMatchMode(SPH_MATCH_EXTENDED2); + + $index = $this->getQueryIndex($query); + $res = $this->sphinx->Query($query, $index); + + $results = new ArrayCollection(); + + if ($res === false) { + if ($this->sphinx->IsConnectError() === true) { + $error = _('Sphinx server is offline'); + } else { + $error = $this->sphinx->GetLastError(); + } + $warning = $this->sphinx->GetLastWarning(); + + $total = $available = $duration = 0; + $suggestions = $propositions = array(); + } else { + $error = $res['error']; + $warning = $res['warning']; + + $duration = $res['time']; + $total = $res['total_found']; + $available = $res['total']; + + $resultOffset = $offset; + + if (isset($res['matches'])) { + foreach ($res['matches'] as $record_id => $match) { + try { + $record = + new \record_adapter( + $match['attrs']['sbas_id'] + , $match['attrs']['record_id'] + , $resultOffset + ); + + $results->add($record); + } catch (Exception $e) { + + } + $resultOffset ++; + } + } + + $suggestions = $this->getSuggestions($query); + $propositions = array(); + } + + return new SearchEngineResult($results, $query, $duration, $offset, $available, $total, $error, $warning, $suggestions, $propositions, $index); + } + + public function autocomplete($query) + { + $words = explode(" ", $this->cleanupQuery($query)); + + return $this->getSuggestions(array_pop($words)); + } + + public function excerpt($query, $fields, \record_adapter $record) + { + $index = ''; + // in this case search is done on metas + if ($this->options->fields() || $this->options->businessFieldsOn()) { + if ($this->options->stemmed() && $this->options->getLocale()) { + $index = 'metadatas' . $this->CRCdatabox($record->get_databox()) . '_stemmed_' . $this->options->getLocale(); + } else { + $index = 'metadatas' . $this->CRCdatabox($record->get_databox()); + } + } else { + if ($this->options->stemmed()) { + $index = 'documents' . $this->CRCdatabox($record->get_databox()) . 
'_stemmed_' . $this->options->getLocale(); + } else { + $index = 'documents' . $this->CRCdatabox($record->get_databox()); + } + } + + $opts = array( + 'before_match' => "", + 'after_match' => "", + ); + + $fields_to_send = array(); + + foreach ($fields as $k => $f) { + $fields_to_send[$k] = $f['value']; + } + + return $this->sphinx->BuildExcerpts($fields_to_send, $index, $query, $opts); + } + + public function resetCache() + { + return $this; + } + + /** + * Reset sphinx client and apply the options + * + * Only apply filters and group by + * + * @param SearchEngineOptions $options + * @return SphinxSearch + */ + protected function applyOptions(SearchEngineOptions $options) + { + $this->resetSphinx(); + + $filters = array(); + + foreach ($options->collections() as $collection) { + $filters[] = crc32($collection->get_databox()->get_sbas_id() . '_' . $collection->get_coll_id()); + } + + $this->sphinx->SetFilter('crc_sbas_coll', $filters); + + $this->sphinx->SetFilter('deleted', array(0)); + $this->sphinx->SetFilter('parent_record_id', array($options->searchType())); + + + if ($options->fields()) { + + $filters = array(); + foreach ($options->fields() as $field) { + $filters[] = crc32($field->get_databox()->get_sbas_id() . '_' . $field->get_id()); + } + + $this->sphinx->SetFilter('crc_struct_id', $filters); + } + + if ($options->businessFieldsOn()) { + + $crc_coll_business = array(); + + foreach ($options->businessFieldsOn() as $collection) { + $crc_coll_business[] = crc32($collection->get_coll_id() . '_1'); + $crc_coll_business[] = crc32($collection->get_coll_id() . '_0'); + } + + $non_business = array(); + + foreach ($options->collections() as $collection) { + foreach ($options->businessFieldsOn() as $BFcollection) { + if ($collection->get_base_id() == $BFcollection->get_base_id()) { + continue 2; + } + } + $non_business[] = $collection; + } + + foreach ($non_business as $collection) { + $crc_coll_business[] = crc32($collection->get_coll_id() . 
'_0'); + } + + $this->sphinx->SetFilter('crc_coll_business', $crc_coll_business); + } elseif ($options->fields()) { + $this->sphinx->SetFilter('business', array(0)); + } + + /** + * @todo : enhance : check status in a better way + */ + foreach ($options->databoxes() as $databox) { + $status_opts = $options->getStatus(); + foreach ($databox->get_statusbits() as $n => $status) { + if ( ! array_key_exists($n, $status_opts)) + continue; + if ( ! array_key_exists($databox->get_sbas_id(), $status_opts[$n])) + continue; + $crc = crc32($databox->get_sbas_id() . '_' . $n); + $this->sphinx->SetFilter('status', array($crc), ($status_opts[$n][$databox->get_sbas_id()] == '0')); + } + } + + + if ($options->getRecordType()) { + $this->sphinx->SetFilter('crc_type', array(crc32($options->getRecordType()))); + } + + + $order = ''; + switch ($options->sortOrder()) { + case SearchEngineOptions::SORT_MODE_ASC: + $order = 'ASC'; + break; + case SearchEngineOptions::SORT_MODE_DESC: + default: + $order = 'DESC'; + break; + } + + switch ($options->sortBy()) { + case SearchEngineOptions::SORT_RANDOM: + $sort = '@random'; + break; + case SearchEngineOptions::SORT_RELEVANCE: + default: + $sort = '@relevance ' . $order . ', created_on ' . $order; + break; + case SearchEngineOptions::SORT_CREATED_ON: + $sort = 'created_on ' . 
$order; + break; + } + + $this->sphinx->SetGroupBy('crc_sbas_record', SPH_GROUPBY_ATTR, $sort); + + return $this; + } + + /** + * Return unique integer key for a databox + * + * @param \databox $databox + * @return int + */ + private function CRCdatabox(\databox $databox) + { + return crc32( + str_replace( + array('.', '%') + , '_' + , sprintf('%s_%s_%s_%s', $databox->get_host(), $databox->get_port(), $databox->get_user(), $databox->get_dbname()) + ) + ); + } + + /** + * Remove all keywords, operators, quotes from a query string + * + * @param string $query + * @return string + */ + private function cleanupQuery($query) + { + return str_replace(array("all", "last", "et", "ou", "sauf", "and", "or", "except", "in", "dans", "'", '"', "(", ")", "_", "-", "+"), ' ', $query); + } + + /** + * Return a collection of suggestion corresponding a query + * + * @param string $query + * @return ArrayCollection An array collection of SearchEngineSuggestion + */ + private function getSuggestions($query) + { + // First we split the query into simple words + $words = explode(" ", $this->cleanupQuery(mb_strtolower($query))); + + $tmpWords = array(); + + foreach ($words as $word) { + if (trim($word) === '') { + continue; + } + $tmpWords[] = $word; + } + + $words = array_unique($tmpWords); + + $altVersions = array(); + + // As we got words, we look for alternate word for each of them + if (function_exists('enchant_broker_init') && $this->options->getLocale()) { + $broker = enchant_broker_init(); + if (enchant_broker_dict_exists($broker, $this->options->getLocale())) { + $dictionnary = enchant_broker_request_dict($broker, $this->options->getLocale()); + + foreach ($words as $word) { + + if (enchant_dict_check($dictionnary, $word) == false) { + $suggs = array_merge(array($word), enchant_dict_suggest($dictionnary, $word)); + } + + $altVersions[$word] = array_unique($suggs); + } + enchant_broker_free_dict($dictionnary); + } + enchant_broker_free($broker); + } + + /** + * @todo enhance the 
trigramm query, as it could be sent in one batch + */ + foreach ($altVersions as $word => $versions) { + $altVersions[$word] = array_unique(array_merge($versions, $this->get_sugg_trigrams($word))); + } + + // We now build an array of all possibilities based on the original query + $queries = array($query); + + foreach ($altVersions as $word => $versions) { + $tmp_queries = array(); + foreach ($versions as $version) { + foreach ($queries as $alt_query) { + $tmp_queries[] = $alt_query; + $tmp_queries[] = str_replace($word, $version, $alt_query); + } + $tmp_queries[] = str_replace($word, $version, $query); + } + $queries = array_unique(array_merge($queries, $tmp_queries)); + } + + $suggestions = array(); + $max_results = 0; + + foreach ($queries as $alt_query) { + $results = $this->sphinx->Query($alt_query, $this->getQueryIndex($alt_query)); + + if ($results !== false && isset($results['total_found'])) { + if ($results['total_found'] > 0) { + + $max_results = max($max_results, (int) $results['total_found']); + $suggestions[] = new SearchEngineSuggestion($query, $alt_query, (int) $results['total_found']); + } + } + } + + usort($suggestions, array('self', 'suggestionsHitSorter')); + + $tmpSuggestions = new ArrayCollection(); + foreach ($suggestions as $key => $suggestion) { + if ($suggestion->hits() < ($max_results / 100)) { + continue; + } + $tmpSuggestions->add($suggestion); + } + + return $tmpSuggestions; + } + + private static function suggestionsHitSorter(SearchEngineSuggestion $a, SearchEngineSuggestion $b) + { + if ($a->hits() == $b->hits()) { + return 0; + } + + return ($a->hits() > $b->hits()) ? -1 : 1; + } + + private function BuildTrigrams($keyword) + { + $t = "__" . $keyword . "__"; + + $trigrams = ""; + for ($i = 0; $i < strlen($t) - 2; $i ++ ) { + $trigrams .= substr($t, $i, 3) . 
" "; + } + + return $trigrams; + } + + private function get_sugg_trigrams($word) + { + $trigrams = $this->BuildTrigrams($word); + $query = "\"$trigrams\"/1"; + $len = strlen($word); + + $this->resetSphinx(); + + $this->sphinx->SetMatchMode(SPH_MATCH_EXTENDED2); + $this->sphinx->SetRankingMode(SPH_RANK_WORDCOUNT); + $this->sphinx->SetFilterRange("len", $len - 2, $len + 4); + + $this->sphinx->SetSortMode(SPH_SORT_EXTENDED, "@weight DESC"); + $this->sphinx->SetLimits(0, 10); + + $indexes = array(); + + foreach ($this->options->databoxes() as $databox) { + $indexes[] = 'suggest' . $this->CRCdatabox($databox); + } + + $index = implode(',', $indexes); + + $res = $this->sphinx->Query($query, $index); + + if ($this->sphinx->Status() === false) { + return array(); + } + + if ( ! $res || ! isset($res["matches"])) { + return array(); + } + + $this->sphinx->ResetGroupBy(); + $this->sphinx->ResetFilters(); + + $words = array(); + foreach ($res["matches"] as $match) { + $words[] = $match['attrs']['keyword']; + } + + $this->applyOptions($this->options); + + return $words; + } + + private function getQueryIndex($query) + { + $index = '*'; + + $index_keys = array(); + + foreach ($this->options->databoxes() as $databox) { + $index_keys[] = $this->CRCdatabox($databox); + } + + if (count($index_keys) > 0) { + if ($this->options->fields() || $this->options->businessFieldsOn()) { + if ($query !== '' && $this->options->stemmed() && $this->options->getLocale()) { + $index = ', metadatas' . implode('_stemmed_' . $this->options->getLocale() . ', metadatas', $index_keys) . '_stemmed_' . $this->options->getLocale(); + } else { + $index = 'metadatas' . implode(',metadatas', $index_keys); + } + $index .= ', metas_realtime' . implode(', metas_realtime', $index_keys); + } else { + if ($query !== '' && $this->options->stemmed() && $this->options->getLocale()) { + $index .= ', documents' . implode('_stemmed_' . $this->options->getLocale() . ', documents', $index_keys) . '_stemmed_' . 
$this->options->getLocale(); + } else { + $index .= 'documents' . implode(', documents', $index_keys); + } + $index .= ', docs_realtime' . implode(', docs_realtime', $index_keys); + } + } + + return $index; + } + + private function parseQuery($query) + { + $query = trim($query); + + while (substr($query, 0, 1) === '(' && substr($query, -1) === ')') { + $query = substr($query, 1, (mb_strlen($query) - 2)); + } + + if ($query == 'all') { + $query = ''; + } + + while (mb_strpos($query, ' ') !== false) { + $query = str_replace(' ', ' ', $query); + } + + + $offset = 0; + while (false !== $pos = mb_strpos($query, '-', $offset)) { + $offset = $pos + 1; + if ($pos === 0) { + continue; + } + if (mb_substr($query, ($pos - 1), 1) !== ' ') { + $query = mb_substr($query, 0, ($pos)) . ' ' . mb_substr($query, $pos + 1); + } + } + + $query = str_ireplace(array(' ou ', ' or '), '|', $query); + $query = str_ireplace(array(' sauf ', ' except '), ' -', $query); + $query = str_ireplace(array(' and ', ' et '), ' +', $query); + + return $query; + } +} +