[SearchEngine] Add refactored search engines

This commit is contained in:
Romain Neutron
2012-08-24 11:27:02 +02:00
parent a71080b2b3
commit a9ee52c6c8
10 changed files with 4059 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\Core\Provider;
use Silex\Application;
use Silex\ServiceProviderInterface;
/**
 * Silex service provider that exposes the configured search engine service
 * under the `phraseanet.SE` container key.
 *
 * NOTE(review): the class name is spelled "SearchENgine" (capital N). It is
 * kept untouched here because renaming may break autoloading — confirm
 * against the file name before fixing.
 */
class SearchENgineServiceProvider implements ServiceProviderInterface
{
    /**
     * Registers `phraseanet.SE` as a shared service resolved from
     * `$app['phraseanet.core']['SearchEngine']`.
     *
     * @param Application $app
     */
    public function register(Application $app)
    {
        $factory = function($container) {
            $core = $container['phraseanet.core'];

            return $core['SearchEngine'];
        };

        $app['phraseanet.SE'] = $app->share($factory);
    }

    /**
     * Nothing has to be done at boot time.
     *
     * @param Application $app
     */
    public function boot(Application $app)
    {
    }
}

View File

@@ -0,0 +1,42 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\Core\Service\SearchEngine;
use Alchemy\Phrasea\SearchEngine\PhraseaEngine as PhraseaSearchEngine;
use Alchemy\Phrasea\Core\Service\ServiceAbstract;
/**
 * Core service wrapping the Phrasea search engine driver.
 */
class PhraseaEngine extends ServiceAbstract
{
    /**
     * Driver instance created by init().
     *
     * @var PhraseaSearchEngine
     */
    protected $searchEngine;

    /**
     * Instantiates the Phrasea search engine driver.
     *
     * @return PhraseaEngine
     */
    protected function init()
    {
        $engine = new PhraseaSearchEngine();
        $this->searchEngine = $engine;

        return $this;
    }

    /**
     * @return PhraseaSearchEngine The driver created by init()
     */
    public function getDriver()
    {
        $driver = $this->searchEngine;

        return $driver;
    }

    /**
     * @return string The service type identifier
     */
    public function getType()
    {
        $type = 'phrasea';

        return $type;
    }

    /**
     * @return array This service requires no option
     */
    public function getMandatoryOptions()
    {
        $mandatory = array();

        return $mandatory;
    }
}

View File

@@ -0,0 +1,44 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\Core\Service\SearchEngine;
use Alchemy\Phrasea\SearchEngine\SphinxSearch as SphinxSearchEngine;
use Alchemy\Phrasea\Core\Service\ServiceAbstract;
/**
 * Core service wrapping the Sphinx search engine driver.
 */
class SphinxSearch extends ServiceAbstract
{
    /**
     * Driver instance created by init().
     *
     * @var SphinxSearchEngine
     */
    protected $searchEngine;

    /**
     * Instantiates the Sphinx driver from the 'host', 'port', 'rt_host' and
     * 'rt_port' service options.
     *
     * @return SphinxSearch
     */
    protected function init()
    {
        $settings = $this->getOptions();

        $this->searchEngine = new SphinxSearchEngine(
            $settings['host'],
            $settings['port'],
            $settings['rt_host'],
            $settings['rt_port']
        );

        return $this;
    }

    /**
     * @return SphinxSearchEngine The driver created by init()
     */
    public function getDriver()
    {
        $driver = $this->searchEngine;

        return $driver;
    }

    /**
     * @return string The service type identifier
     */
    public function getType()
    {
        $type = 'sphinx-search';

        return $type;
    }

    /**
     * @return array Names of the options init() reads
     */
    public function getMandatoryOptions()
    {
        $mandatory = array('host', 'port', 'rt_host', 'rt_port');

        return $mandatory;
    }
}

View File

@@ -0,0 +1,517 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
use Alchemy\Phrasea\Exception\RuntimeException;
use Doctrine\Common\Collections\ArrayCollection;
/**
 * Search engine implementation backed by the phrasea PHP extension
 * (phrasea_query2 / phrasea_fetch_results / phrasea_clear_cache).
 */
class PhraseaEngine implements SearchEngineInterface
{
    /**
     *
     * @var SearchEngineOptions
     */
    protected $options;

    /**
     * Query string per databox, keyed by sbas_id.
     *
     * @var array
     */
    protected $queries = array();

    /**
     * Output of the parser's makequery() per databox, keyed by sbas_id.
     *
     * @var array
     */
    protected $arrayq = array();

    /**
     * Searched base_ids per databox, keyed by sbas_id.
     *
     * @var array
     */
    protected $colls = array();

    /**
     * Query parsers, keyed by sbas_id plus the special 'main' key.
     *
     * @var array
     */
    protected $qp = array();

    /**
     * Per parser key: whether the parsed tree contains a colon operator.
     *
     * @var array
     */
    protected $needthesaurus = array();

    /**
     * Parsed query trees, keyed like $qp.
     *
     * @var array
     */
    protected $indep_treeq = array();

    /**
     * When true, query() clears the extension cache and re-executes the query.
     *
     * @var boolean
     */
    protected $resetCacheNextQuery = false;

    /**
     * Legacy counters only written by query(). They used to be created as
     * dynamic (hence public) properties, so they stay public.
     *
     * @todo clean this in DB
     */
    public $total_available;
    public $total_results;

    /**
     * {@inheritdoc}
     */
    public function __construct()
    {
        $this->options = new SearchEngineOptions();
    }

    /**
     * {@inheritdoc}
     */
    public function status()
    {
        return true;
    }

    /**
     * {@inheritdoc}
     */
    public function availableTypes()
    {
        return array(self::GEM_TYPE_RECORD, self::GEM_TYPE_STORY);
    }

    /**
     * {@inheritdoc}
     */
    public function addRecord(\record_adapter $record)
    {
        return $this->updateRecord($record);
    }

    /**
     * {@inheritdoc}
     */
    public function removeRecord(\record_adapter $record)
    {
        $connbas = $record->get_databox()->get_connection();
        $params = array(':record_id' => $record->get_record_id());

        // the record is removed from the three index tables, in this order
        foreach (array('prop', 'idx', 'thit') as $table) {
            $stmt = $connbas->prepare('DELETE FROM ' . $table . ' WHERE record_id = :record_id');
            $stmt->execute($params);
            $stmt->closeCursor();
        }

        return $this;
    }

    /**
     * {@inheritdoc}
     */
    public function updateRecord(\record_adapter $record)
    {
        // (status & ~7) | 4 : clears the three lowest status bits, then sets
        // the bit of value 4. NOTE(review): presumably flags the record for
        // reindexing by an external indexer — confirm.
        $record->set_binary_status(\databox_status::dec2bin(bindec($record->get_status()) & ~7 | 4));

        return $this;
    }

    /**
     * {@inheritdoc}
     */
    public function addStory(\record_adapter $record)
    {
        return $this->updateRecord($record);
    }

    /**
     * {@inheritdoc}
     */
    public function removeStory(\record_adapter $record)
    {
        return $this->removeRecord($record);
    }

    /**
     * {@inheritdoc}
     */
    public function updateStory(\record_adapter $record)
    {
        return $this->updateRecord($record);
    }

    /**
     * {@inheritdoc}
     */
    public function addFeedEntry(\Feed_Entry_Adapter $entry)
    {
        throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine');
    }

    /**
     * {@inheritdoc}
     */
    public function removeFeedEntry(\Feed_Entry_Adapter $entry)
    {
        throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine');
    }

    /**
     * {@inheritdoc}
     */
    public function updateFeedEntry(\Feed_Entry_Adapter $entry)
    {
        throw new RuntimeException('Feed Entry indexing not supported by Phrasea Engine');
    }

    /**
     * {@inheritdoc}
     */
    public function setOptions(SearchEngineOptions $options)
    {
        $this->options = $options;
    }

    /**
     * {@inheritdoc}
     */
    public function resetOptions()
    {
        $this->options = new SearchEngineOptions();
    }

    /**
     * {@inheritdoc}
     */
    public function query($query, $offset, $perPage)
    {
        assert(is_int($offset));
        assert($offset >= 0);
        assert(is_int($perPage));

        if (trim($query) === '') {
            $query = "all";
        }
        if ($this->options->getRecordType()) {
            $query .= ' AND recordtype=' . $this->options->getRecordType();
        }

        $appbox = \appbox::get_instance(\bootstrap::getCore());
        $session = $appbox->get_session();
        $sesId = $session->get_ses_id();

        $row = $this->fetchCacheRow($appbox, $sesId);

        // The cached answer is reused only when a cache row exists, was built
        // for the very same query and is less than ten minutes old.
        if ( ! $row || $query != $row['query']) {
            $this->resetCacheNextQuery = true;
        } elseif (new \DateTime('-10 min') > new \DateTime($row['query_time'])) {
            $this->resetCacheNextQuery = true;
        }

        // NOTE(review): the flag is never switched back to false, so an
        // instance that has reset once re-executes every following query.
        if ($this->resetCacheNextQuery === true) {
            phrasea_clear_cache($sesId);
            $this->addQuery($query);
            $this->executeQuery($query);

            $row = $this->fetchCacheRow($appbox, $sesId);
        } else {
            /**
             * @todo clean this in DB
             */
            $this->total_available = $this->total_results = $session->get_session_prefs('phrasea_engine_n_results');
        }

        $res = phrasea_fetch_results($sesId, $offset + 1, $perPage, false);

        $rs = array();
        $error = _('Unable to execute query');

        if (isset($res['results']) && is_array($res['results'])) {
            $rs = $res['results'];
            $error = '';
        }

        $resultNumber = $offset;
        $records = new ArrayCollection();

        foreach ($rs as $data) {
            try {
                $records->add(new \record_adapter(
                    \phrasea::sbasFromBas($data['base_id']),
                    $data['record_id'],
                    $resultNumber
                ));
            } catch (\Exception $e) {
                // Best effort: a record that can not be loaded is left out.
                // The exception class must be fully qualified, otherwise it
                // resolves inside this namespace and nothing is ever caught.
            }
            $resultNumber ++;
        }

        // no cache row at all: report an empty timing / count
        $duration = $row ? $row['duration'] : 0;
        $total = $row ? $row['total'] : 0;

        return new SearchEngineResult($records, $query, $duration, $offset, $total, $total, $error, '', new ArrayCollection(), new ArrayCollection(), '');
    }

    /**
     * Reads the cache row attached to a session.
     *
     * @param  \appbox     $appbox
     * @param  mixed       $sesId  The session id
     * @return array|false The row (query, query_time, duration, total) or false when there is none
     */
    private function fetchCacheRow($appbox, $sesId)
    {
        $sql = 'SELECT query, query_time, duration, total FROM cache WHERE session_id = :ses_id';
        $stmt = $appbox->get_connection()->prepare($sql);
        $stmt->execute(array(':ses_id' => $sesId));
        $row = $stmt->fetch(\PDO::FETCH_ASSOC);
        $stmt->closeCursor();

        return $row;
    }

    /**
     * Runs the prepared queries (see addQuery()) on every databox through
     * phrasea_query2, logs each search in the databox `log_search` table,
     * then stores the query, its duration and its total in the `cache` table.
     *
     * @param  string        $query The query as typed by the user
     * @return PhraseaEngine
     */
    private function executeQuery($query)
    {
        $appbox = \appbox::get_instance(\bootstrap::getCore());
        $session = $appbox->get_session();
        $registry = $appbox->get_registry();

        $dateLog = date("Y-m-d H:i:s");
        $nbanswers = $total_time = 0;

        $sort = '';
        if ($this->options->sortBy()) {
            // anything but an explicit ascending order sorts descending
            $sort = $this->options->sortOrder() == SearchEngineOptions::SORT_MODE_ASC ? '+' : '-';
            $sort .= '0' . $this->options->sortBy();
        }

        // base_ids on which business fields are searchable; same for every databox
        $BF = array();
        foreach ($this->options->businessFieldsOn() as $collection) {
            $BF[] = $collection->get_base_id();
        }

        $multidoc = $this->options->searchType() == SearchEngineOptions::RECORD_GROUPING ? PHRASEA_MULTIDOC_REGONLY : PHRASEA_MULTIDOC_DOCONLY;

        foreach ($this->queries as $sbas_id => $qry) {
            $results = phrasea_query2(
                $session->get_ses_id()
                , $sbas_id
                , $this->colls[$sbas_id]
                , $this->arrayq[$sbas_id]
                , $registry->get('GV_sit')
                , (string) $session->get_usr_id()
                , false
                , $multidoc
                , $sort
                , $BF
            );

            if ($results) {
                $total_time += $results['time_all'];
                $nbanswers += $results["nbanswers"];
            }

            $logger = $session->get_logger($appbox->get_databox($sbas_id));
            $conn2 = \connection::getPDOConnection($sbas_id);

            $sql3 = 'INSERT INTO log_search'
                . ' (id, log_id, date, search, results, coll_id )'
                . ' VALUES'
                . ' (null, :log_id, :date, :query, :nbresults, :colls)';

            $params = array(
                ':log_id' => $logger->get_id()
                , ':date' => $dateLog
                , ':query' => $query
                // a failed query has no answer count to log
                , ':nbresults' => $results ? $results["nbanswers"] : null
                , ':colls' => implode(',', $this->colls[$sbas_id])
            );

            $stmt = $conn2->prepare($sql3);
            $stmt->execute($params);
            $stmt->closeCursor();
        }

        $sql = 'UPDATE cache'
            . ' SET query = :query, query_time = NOW(), duration = :duration, total = :total'
            . ' WHERE session_id = :ses_id';

        $params = array(
            ':query' => $query,
            ':ses_id' => $session->get_ses_id(),
            ':duration' => $total_time,
            ':total' => $nbanswers,
        );

        $stmt = $appbox->get_connection()->prepare($sql);
        $stmt->execute($params);
        $stmt->closeCursor();

        \User_Adapter::saveQuery($query);

        return $this;
    }

    /**
     * {@inheritdoc}
     */
    public function autocomplete($query)
    {
        return new ArrayCollection();
    }

    /**
     * {@inheritdoc}
     */
    public function excerpt($query, $fields, \record_adapter $record)
    {
        $ret = array();

        $appbox = \appbox::get_instance(\bootstrap::getCore());
        $session = $appbox->get_session();

        // fetch the single result matching the record position, with hits
        // wrapped in [[em]]...[[/em]] markers
        $res = phrasea_fetch_results(
            $session->get_ses_id(), ($record->get_number() + 1), 1, true, "[[em]]", "[[/em]]"
        );

        if ( ! isset($res['results']) || ! is_array($res['results'])) {
            return array();
        }

        $rs = $res['results'];
        $res = array_shift($rs);

        if ( ! isset($res['xml'])) {
            return array();
        }

        $sxe = simplexml_load_string($res['xml']);

        foreach ($fields as $name => $field) {
            if ($sxe && $sxe->description && $sxe->description->$name) {
                $val = array();
                foreach ($sxe->description->$name as $value) {
                    $val[] = str_replace(array('[[em]]', '[[/em]]'), array('<em>', '</em>'), (string) $value);
                }
                // only the first character of the separator is used
                $separator = $field['separator'] ? $field['separator'][0] : '';
                $val = implode(' ' . $separator . ' ', $val);
            } else {
                $val = $field['value'];
            }

            $ret[] = $val;
        }

        return $ret;
    }

    /**
     * {@inheritdoc}
     */
    public function resetCache()
    {
        $this->resetCacheNextQuery = true;
        $this->queries = $this->arrayq = $this->colls = $this->qp = $this->needthesaurus = $this->indep_treeq = array();

        return $this;
    }

    /**
     * Builds, for every databox targeted by the options, the query string
     * (status, field and date restrictions appended), parses it and stores
     * the resulting structures in $queries, $colls, $qp and $arrayq.
     *
     * @param  string        $query
     * @return PhraseaEngine
     */
    private function addQuery($query)
    {
        foreach ($this->options->databoxes() as $databox) {
            $this->queries[$databox->get_sbas_id()] = $query;
        }

        $status = $this->options->getStatus();
        $fields = $this->options->fields();
        $dateFields = $this->options->getDateFields();
        $minDate = $this->options->getMinDate();
        $maxDate = $this->options->getMaxDate();

        foreach (array_keys($this->queries) as $sbas) {
            if ($status) {
                // Status mask built from bit 4 up to bit 64, highest bit
                // first; 'x' stands for "no constraint on this bit".
                $requestStat = 'xxxx';
                for ($i = 4; ($i <= 64); $i ++ ) {
                    if ( ! isset($status[$i])) {
                        $requestStat = 'x' . $requestStat;
                        continue;
                    }
                    $val = 'x';
                    if (isset($status[$i][$sbas])) {
                        if ($status[$i][$sbas] == '0') {
                            $val = '0';
                        } elseif ($status[$i][$sbas] == '1') {
                            $val = '1';
                        }
                    }
                    $requestStat = $val . $requestStat;
                }

                $requestStat = ltrim($requestStat, 'x');

                if ($requestStat !== '') {
                    $this->queries[$sbas] .= ' AND (recordstatus=' . $requestStat . ')';
                }
            }

            if ($fields) {
                $this->queries[$sbas] .= ' IN (' . implode(' OR ', $fields) . ')';
            }

            if (($minDate || $maxDate) && $dateFields) {
                if ($minDate) {
                    $bound = $minDate->format('Y-m-d');
                    $this->queries[$sbas] .= ' AND ( ' . implode(' >= ' . $bound . ' OR ', $dateFields) . ' >= ' . $bound . ' ) ';
                }
                if ($maxDate) {
                    $bound = $maxDate->format('Y-m-d');
                    $this->queries[$sbas] .= ' AND ( ' . implode(' <= ' . $bound . ' OR ', $dateFields) . ' <= ' . $bound . ' ) ';
                }
            }
        }

        $this->singleParse('main', $query);

        foreach (array_keys($this->queries) as $sbas) {
            $this->singleParse($sbas, $query);
        }

        $base_ids = array_map(function(\collection $collection) {
            return $collection->get_base_id();
        }, $this->options->collections());

        foreach ($this->options->databoxes() as $databox) {
            $sbas_id = $databox->get_sbas_id();

            $this->colls[$sbas_id] = array();

            foreach ($databox->get_collections() as $collection) {
                if (in_array($collection->get_base_id(), $base_ids)) {
                    $this->colls[$sbas_id][] = $collection->get_base_id();
                }
            }

            // nothing to search on this databox
            if (count($this->colls[$sbas_id]) <= 0) {
                continue;
            }

            if ($this->needthesaurus[$sbas_id]) {
                if ($databox->get_dom_thesaurus()) {
                    $this->qp[$sbas_id]->thesaurus2($this->indep_treeq[$sbas_id], $sbas_id, $databox->get_dbname(), $databox->get_dom_thesaurus(), true);
                    $this->qp['main']->thesaurus2($this->indep_treeq['main'], $sbas_id, $databox->get_dbname(), $databox->get_dom_thesaurus(), true);
                }
            }

            $emptyw = false;

            $this->qp[$sbas_id]->set_default($this->indep_treeq[$sbas_id], $emptyw);
            $this->qp[$sbas_id]->distrib_in($this->indep_treeq[$sbas_id]);
            $this->qp[$sbas_id]->factor_or($this->indep_treeq[$sbas_id]);
            $this->qp[$sbas_id]->setNumValue($this->indep_treeq[$sbas_id], $databox->get_sxml_structure());
            $this->qp[$sbas_id]->thesaurus2_apply($this->indep_treeq[$sbas_id], $sbas_id);

            $this->arrayq[$sbas_id] = $this->qp[$sbas_id]->makequery($this->indep_treeq[$sbas_id]);
        }

        return $this;
    }

    /**
     * Creates the parser stored under $sbas and parses either the raw query
     * (for the 'main' key) or the databox specific query string.
     *
     * @param  mixed         $sbas  'main' or a sbas_id
     * @param  string        $query The raw query, only used for 'main'
     * @return PhraseaEngine
     */
    private function singleParse($sbas, $query)
    {
        $parser = new PhraseaEngineQueryParser($this->options->getLocale());
        $parser->debug = false;
        $this->qp[$sbas] = $parser;

        if ($sbas == 'main') {
            $simple_treeq = $parser->parsequery($query);
        } else {
            $simple_treeq = $parser->parsequery($this->queries[$sbas]);
        }

        $parser->priority_opk($simple_treeq);
        $parser->distrib_opk($simple_treeq);

        $this->needthesaurus[$sbas] = false;

        $this->indep_treeq[$sbas] = $parser->extendThesaurusOnTerms($simple_treeq, true, true, false);
        $this->needthesaurus[$sbas] = $parser->containsColonOperator($this->indep_treeq[$sbas]);

        return $this;
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,155 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
use Alchemy\Phrasea\SearchEngine\SearchEngineOptions;
use Alchemy\Phrasea\SearchEngine\SearchEngineResult;
use Alchemy\Phrasea\Exception\RuntimeException;
use Doctrine\Common\Collections\ArrayCollection;
/**
 * Contract shared by every search engine: indexation of records, stories
 * and feed entries, option handling, querying, suggestions and excerpts.
 */
interface SearchEngineInterface
{
const GEM_TYPE_RECORD = 'record';
const GEM_TYPE_STORY = 'story';
const GEM_TYPE_ENTRY = 'entry';
/**
* Check the status of the search engine
*
* @return mixed Engine-specific status information
* @throws RuntimeException if something is wrong
*/
public function status();
/**
* Return the types of objects the engine is able to index
*
* @return array An array of self::GEM_TYPE_* values
*/
public function availableTypes();
/**
* Add a record to index
*
* @param \record_adapter $record
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function addRecord(\record_adapter $record);
/**
* Remove a record from index
*
* @param \record_adapter $record
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function removeRecord(\record_adapter $record);
/**
* Update a record in index
*
* @param \record_adapter $record
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function updateRecord(\record_adapter $record);
/**
* Add a story to index
*
* @param \record_adapter $story
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function addStory(\record_adapter $story);
/**
* Remove a story from index
*
* @param \record_adapter $story
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function removeStory(\record_adapter $story);
/**
* Update a story in index
*
* @param \record_adapter $story
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function updateStory(\record_adapter $story);
/**
* Add an entry to index
*
* @param \Feed_Entry_Adapter $entry
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function addFeedEntry(\Feed_Entry_Adapter $entry);
/**
* Remove an entry from index
*
* @param \Feed_Entry_Adapter $entry
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function removeFeedEntry(\Feed_Entry_Adapter $entry);
/**
* Update an entry in index
*
* @param \Feed_Entry_Adapter $entry
* @return SearchEngineInterface
* @throws RuntimeException
*/
public function updateFeedEntry(\Feed_Entry_Adapter $entry);
/**
* Set the options used by the next queries
*
* @param SearchEngineOptions $options
*/
public function setOptions(SearchEngineOptions $options);
/**
* Replace the current options with a fresh SearchEngineOptions
*/
public function resetOptions();
/**
* Run a query and return one page of results
*
* @param string $query
* @param integer $offset  Zero-based position of the first result
* @param integer $perPage Maximum number of results to return
*
* @return SearchEngineResult
*/
public function query($query, $offset, $perPage);
/**
* Return an array of suggestions corresponding to the last word of the
* query
*
* @param string $query
*
* @return ArrayCollection A collection of SearchEngineSuggestion
*/
public function autocomplete($query);
/**
* Highlight the fields of a record
*
* @param string $query
* @param array $fields Fields to highlight, keyed by field name
*                      NOTE(review): implementations read the 'separator'
*                      and 'value' keys of each entry — confirm with callers
* @param \record_adapter $record
*
* @return array The array of highlighted fields
*/
public function excerpt($query, $fields, \record_adapter $record);
/**
* Reset the cache of the SE (if applicable)
*
* @return SearchEngineInterface
*/
public function resetCache();
}

View File

@@ -0,0 +1,478 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
/**
 * Value holder for every option that can be applied to a search: searched
 * collections and fields, record type, status filters, date range, sort,
 * locale and stemming. Instances can be serialized to / restored from JSON.
 */
class SearchEngineOptions implements \Serializable
{
    const RECORD_RECORD = 0;
    const RECORD_GROUPING = 1;
    const TYPE_IMAGE = 'image';
    const TYPE_VIDEO = 'video';
    const TYPE_AUDIO = 'audio';
    const TYPE_DOCUMENT = 'document';
    const TYPE_FLASH = 'flash';
    const TYPE_ALL = '';
    const SORT_RELEVANCE = 'relevance';
    const SORT_CREATED_ON = 'created_on';
    const SORT_RANDOM = 'random';
    const SORT_MODE_ASC = 'asc';
    const SORT_MODE_DESC = 'desc';

    /**
     * One of the self::TYPE_* constants
     *
     * @var string
     */
    protected $record_type;

    /**
     * self::RECORD_RECORD or self::RECORD_GROUPING
     *
     * @var int
     */
    protected $search_type = 0;

    /**
     * Collections to search on
     *
     * @var array
     */
    protected $collections = array();

    /**
     *
     * @var array
     */
    protected $fields = array();

    /**
     * Status filters: bit number => array(sbas_id => 0|1)
     *
     * @var array
     */
    protected $status = array();

    /**
     *
     * @var \DateTime
     */
    protected $date_min;

    /**
     *
     * @var \DateTime
     */
    protected $date_max;

    /**
     *
     * @var array
     */
    protected $date_fields = array();

    /**
     * Locale, see setLocale()
     *
     * @var string
     */
    protected $i18n;

    /**
     *
     * @var boolean
     */
    protected $stemming = true;

    /**
     *
     * @var string
     */
    protected $sort_by;

    /**
     *
     * @var string
     */
    protected $sort_ord = self::SORT_MODE_DESC;

    /**
     * Collections on which business fields may be searched
     *
     * @var array
     */
    protected $business_fields = array();

    /**
     * Constructor
     */
    public function __construct()
    {
    }

    /**
     *
     * @param string $locale
     */
    public function setLocale($locale)
    {
        $this->i18n = $locale;
    }

    /**
     *
     * @return string
     */
    public function getLocale()
    {
        return $this->i18n;
    }

    /**
     *
     * @param  string              $sort_by  One of the self::SORT_* criteria
     * @param  string              $sort_ord self::SORT_MODE_ASC or self::SORT_MODE_DESC
     * @return SearchEngineOptions
     */
    public function setSort($sort_by, $sort_ord = self::SORT_MODE_DESC)
    {
        $this->sort_by = $sort_by;
        $this->sort_ord = $sort_ord;

        return $this;
    }

    /**
     *
     * @param  array               $collection An array of collections
     * @return SearchEngineOptions
     */
    public function allowBusinessFieldsOn(array $collection)
    {
        $this->business_fields = $collection;

        return $this;
    }

    /**
     *
     * @return SearchEngineOptions
     */
    public function disallowBusinessFields()
    {
        $this->business_fields = array();

        return $this;
    }

    /**
     *
     * @return array The collections on which business fields are allowed
     */
    public function businessFieldsOn()
    {
        return $this->business_fields;
    }

    /**
     *
     * @return string
     */
    public function sortBy()
    {
        return $this->sort_by;
    }

    /**
     *
     * @return string
     */
    public function sortOrder()
    {
        return $this->sort_ord;
    }

    /**
     *
     * @param  boolean             $boolean
     * @return SearchEngineOptions
     */
    public function useStemming($boolean)
    {
        $this->stemming = (Boolean) $boolean;

        return $this;
    }

    /**
     *
     * @return boolean
     */
    public function stemmed()
    {
        return $this->stemming;
    }

    /**
     * Any value that is not self::RECORD_GROUPING selects self::RECORD_RECORD.
     *
     * @param  int                 $search_type
     * @return SearchEngineOptions
     */
    public function setSearchType($search_type)
    {
        // loose comparison on purpose: numeric strings such as '1' are accepted
        switch ($search_type) {
            case self::RECORD_RECORD:
            default:
                $this->search_type = self::RECORD_RECORD;
                break;
            case self::RECORD_GROUPING:
                $this->search_type = self::RECORD_GROUPING;
                break;
        }

        return $this;
    }

    /**
     *
     * @return int
     */
    public function searchType()
    {
        return $this->search_type;
    }

    /**
     *
     * @param  array               $collections An array of collections
     * @return SearchEngineOptions
     */
    public function onCollections(array $collections)
    {
        $this->collections = $collections;

        return $this;
    }

    /**
     *
     * @return array
     */
    public function collections()
    {
        return $this->collections;
    }

    /**
     * Returns the distinct databoxes owning the searched collections.
     *
     * @return array
     */
    public function databoxes()
    {
        $databoxes = array();

        foreach ($this->collections as $collection) {
            $databox = $collection->get_databox();
            // keyed by sbas_id so every databox appears only once
            $databoxes[$databox->get_sbas_id()] = $databox;
        }

        return array_values($databoxes);
    }

    /**
     *
     * @param  array               $fields An array of Databox fields
     * @return SearchEngineOptions
     */
    public function setFields(array $fields)
    {
        $this->fields = $fields;

        return $this;
    }

    /**
     *
     * @return array
     */
    public function fields()
    {
        return $this->fields;
    }

    /**
     * Stores the status filters. Every entry of $status is keyed by status
     * bit and holds an 'on' or an 'off' list of sbas_ids; an entry carrying
     * more than one key (for instance both 'on' and 'off') is skipped.
     *
     * @param  array               $status
     * @return SearchEngineOptions
     */
    public function setStatus(array $status)
    {
        $tmp = array();

        foreach ($status as $n => $options) {
            if (count($options) > 1) {
                continue;
            }
            if (isset($options['on'])) {
                foreach ($options['on'] as $sbas_id) {
                    $tmp[$n][$sbas_id] = 1;
                }
            }
            if (isset($options['off'])) {
                foreach ($options['off'] as $sbas_id) {
                    $tmp[$n][$sbas_id] = 0;
                }
            }
        }

        $this->status = $tmp;

        return $this;
    }

    /**
     *
     * @return array
     */
    public function getStatus()
    {
        return $this->status;
    }

    /**
     * Any value that is not one of the self::TYPE_* constants selects
     * self::TYPE_ALL.
     *
     * @param  string              $record_type
     * @return SearchEngineOptions
     */
    public function setRecordType($record_type)
    {
        $known = array(
            self::TYPE_AUDIO,
            self::TYPE_VIDEO,
            self::TYPE_DOCUMENT,
            self::TYPE_FLASH,
            self::TYPE_IMAGE,
        );

        // strict matching, so that values such as true or 0 can not be
        // mistaken for a type name
        $this->record_type = in_array($record_type, $known, true) ? $record_type : self::TYPE_ALL;

        return $this;
    }

    /**
     *
     * @return string
     */
    public function getRecordType()
    {
        return $this->record_type;
    }

    /**
     * Sets the lower bound of the date range, at 00:00:00 of the given day.
     * Empty values are ignored; a value that does not match the expected
     * format clears the bound.
     *
     * @param  string              $min_date A date formatted as Y/m/d
     * @return SearchEngineOptions
     */
    public function setMinDate($min_date)
    {
        if ( ! is_null($min_date) && trim($min_date) !== '') {
            // \DateTime must be fully qualified inside this namespace
            $date = \DateTime::createFromFormat('Y/m/d H:i:s', $min_date . ' 00:00:00');
            $this->date_min = $date ? $date : null;
        }

        return $this;
    }

    /**
     *
     * @return \DateTime
     */
    public function getMinDate()
    {
        return $this->date_min;
    }

    /**
     * Sets the upper bound of the date range, at 23:59:59 of the given day.
     * Empty values are ignored; a value that does not match the expected
     * format clears the bound.
     *
     * @param  string              $max_date A date formatted as Y/m/d
     * @return SearchEngineOptions
     */
    public function setMaxDate($max_date)
    {
        if ( ! is_null($max_date) && trim($max_date) !== '') {
            $date = \DateTime::createFromFormat('Y/m/d H:i:s', $max_date . ' 23:59:59');
            $this->date_max = $date ? $date : null;
        }

        return $this;
    }

    /**
     *
     * @return \DateTime
     */
    public function getMaxDate()
    {
        return $this->date_max;
    }

    /**
     *
     * @param  array               $fields
     * @return SearchEngineOptions
     */
    public function setDateFields(array $fields)
    {
        $this->date_fields = $fields;

        return $this;
    }

    /**
     *
     * @return array
     */
    public function getDateFields()
    {
        return $this->date_fields;
    }

    /**
     * Serializes every property as JSON. Dates are written as strings and
     * collections are replaced by their base_id.
     *
     * @return string
     */
    public function serialize()
    {
        $ret = array();

        foreach ($this as $key => $value) {
            if ($value instanceof \DateTime) {
                // 24-hour clock ("H"): a 12-hour "h" would turn 23:59:59
                // into 11:59:59 once parsed back
                $value = $value->format('d-m-Y H:i:s');
            }
            if (in_array($key, array('collections', 'business_fields'), true)) {
                $value = array_map(function($collection) {
                    return $collection->get_base_id();
                }, $value);
            }

            $ret[$key] = $value;
        }

        return \p4string::jsonencode($ret);
    }

    /**
     * Restores the properties from the JSON produced by serialize().
     * Invalid JSON and keys that are not properties of this class are
     * ignored.
     *
     * @param  string              $serialized
     * @return SearchEngineOptions
     */
    public function unserialize($serialized)
    {
        // Associative decoding turns nested objects (the status map) into
        // arrays at every depth, numeric keys being cast to integers by PHP.
        $data = json_decode($serialized, true);

        if ( ! is_array($data)) {
            return $this;
        }

        foreach ($data as $key => $value) {
            if ( ! property_exists($this, (string) $key)) {
                continue;
            }

            if (null !== $value) {
                if (in_array($key, array('date_min', 'date_max'), true)) {
                    $value = new \DateTime($value);
                } elseif (in_array($key, array('collections', 'business_fields'), true)) {
                    $value = array_map(function($base_id) {
                        return \collection::get_from_base_id($base_id);
                    }, (array) $value);
                }
            }

            $this->$key = $value;
        }

        return $this;
    }
}

View File

@@ -0,0 +1,108 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
use Doctrine\Common\Collections\ArrayCollection;
/**
 * Immutable holder for the outcome of a search engine query.
 */
class SearchEngineResult
{
    /** @var ArrayCollection The records found */
    protected $results;
    /** @var string The executed query */
    protected $query;
    /** @var float */
    protected $duration;
    /** @var int Offset of the first returned result */
    protected $offsetStart;
    /** @var int */
    protected $available;
    /** @var int */
    protected $total;
    protected $error;
    protected $warning;
    protected $suggestions;
    protected $propositions;
    protected $indexes;

    /**
     * Duration is stored as a float; offset, available and total as integers.
     * Every other argument is stored untouched.
     */
    public function __construct(ArrayCollection $results, $query, $duration, $offsetStart, $available, $total, $error, $warning, $suggestions, $propositions, $indexes)
    {
        $this->results = $results;
        $this->query = $query;
        $this->duration = floatval($duration);
        $this->offsetStart = intval($offsetStart);
        $this->available = intval($available);
        $this->total = intval($total);
        $this->error = $error;
        $this->warning = $warning;
        $this->suggestions = $suggestions;
        $this->propositions = $propositions;
        $this->indexes = $indexes;

        return $this;
    }

    /**
     * @return ArrayCollection
     */
    public function results()
    {
        return $this->results;
    }

    public function query()
    {
        return $this->query;
    }

    /**
     * @return float
     */
    public function duration()
    {
        return $this->duration;
    }

    /**
     * Number of pages needed to show the available results.
     *
     * @param  int   $amountPerPage
     * @return float The page count, rounded up
     */
    public function totalPages($amountPerPage)
    {
        $pages = $this->available / $amountPerPage;

        return ceil($pages);
    }

    /**
     * Start offset divided by the page size, rounded up.
     *
     * @param  int   $amountPerPage
     * @return float
     */
    public function currentPage($amountPerPage)
    {
        $page = $this->offsetStart / $amountPerPage;

        return ceil($page);
    }

    /**
     * @return int
     */
    public function available()
    {
        return $this->available;
    }

    /**
     * @return int
     */
    public function total()
    {
        return $this->total;
    }

    public function error()
    {
        return $this->error;
    }

    public function warning()
    {
        return $this->warning;
    }

    public function suggestions()
    {
        return $this->suggestions;
    }

    /**
     * Returns the value given as $propositions to the constructor.
     */
    public function proposals()
    {
        return $this->propositions;
    }

    public function indexes()
    {
        return $this->indexes;
    }
}

View File

@@ -0,0 +1,41 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
/**
 * A suggestion proposed by a search engine for a given query.
 */
class SearchEngineSuggestion
{
    private $query;
    private $suggestion;
    /** @var int */
    private $hits;

    /**
     * @param mixed $query      The query the suggestion was built for
     * @param mixed $suggestion The suggested text
     * @param mixed $hits       Stored as an integer
     */
    public function __construct($query, $suggestion, $hits)
    {
        $this->hits = intval($hits);
        $this->suggestion = $suggestion;
        $this->query = $query;
    }

    public function query()
    {
        return $this->query;
    }

    public function suggestion()
    {
        return $this->suggestion;
    }

    /**
     * @return int
     */
    public function hits()
    {
        return $this->hits;
    }
}

View File

@@ -0,0 +1,689 @@
<?php
/*
* This file is part of Phraseanet
*
* (c) 2005-2012 Alchemy
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace Alchemy\Phrasea\SearchEngine;
use Alchemy\Phrasea\Exception\RuntimeException;
use Doctrine\Common\Collections\ArrayCollection;
require_once __DIR__ . '/../../../vendor/sphinx/sphinxapi.php';
class SphinxSearch implements SearchEngineInterface
{
/**
* Client used to query the Sphinx daemon
*
* @var \SphinxClient
*/
protected $sphinx;
/**
* Connection to the Sphinx real-time indexes; null when the connection
* could not be opened by the constructor
*
* @var \PDO
*/
protected $rt_conn;
/**
* Options applied to the queries
*
* @var SearchEngineOptions
*/
protected $options;
/**
 * Configures the Sphinx client (array results, 1 second connect timeout)
 * and tries to open the real-time connection; on failure the connection
 * is left null.
 *
 * @param string $host
 * @param mixed  $port
 * @param string $rt_host
 * @param mixed  $rt_port
 */
public function __construct($host, $port, $rt_host, $rt_port)
{
    $this->options = new SearchEngineOptions();

    $client = new \SphinxClient();
    $client->SetServer($host, $port);
    $client->SetArrayResult(true);
    $client->SetConnectTimeout(1);
    $this->sphinx = $client;

    $dsn = sprintf('mysql:host=%s;port=%s;', $rt_host, $rt_port);

    try {
        $this->rt_conn = @new \PDO($dsn);
    } catch (\PDOException $e) {
        $this->rt_conn = null;
    }

    return $this;
}
/**
 * Returns the status reported by the Sphinx client.
 *
 * @return mixed The value returned by SphinxClient::Status()
 * @throws RuntimeException When the daemon does not answer or when the
 *                          real-time connection is not available
 */
public function status()
{
    $status = $this->sphinx->Status();

    if (false === $status) {
        // a bare "Exception" would resolve inside this namespace, where no
        // such class exists
        throw new RuntimeException(_('Sphinx server is offline'));
    }

    if (null === $this->rt_conn) {
        throw new RuntimeException('Unable to connect to sphinx rt');
    }

    return $status;
}
/**
 * @return array The record and story types
 */
public function availableTypes()
{
    $types = array();
    $types[] = self::GEM_TYPE_RECORD;
    $types[] = self::GEM_TYPE_STORY;

    return $types;
}
/**
 * Writes a record in the real-time indexes: one row per indexable field
 * value in metas_realtime<crc>, then one row for the whole record in
 * docs_realtime<crc>.
 *
 * @param  \record_adapter  $record
 * @return SphinxSearch
 * @throws RuntimeException When the real-time connection is not available
 */
public function addRecord(\record_adapter $record)
{
    if (null === $this->rt_conn) {
        throw new RuntimeException('Unable to connect to sphinx rt');
    }

    $conn = $this->rt_conn;
    $crcDatabox = $this->CRCdatabox($record->get_databox());

    $sbasId = (int) $record->get_sbas_id();
    $recordId = (int) $record->get_record_id();
    $collId = (int) $record->get_collection()->get_coll_id();
    $grouping = (int) $record->is_grouping();
    $crcColl = crc32($sbasId . '_' . $collId);
    $crcRecord = crc32($sbasId . '_' . $recordId);
    $crcType = crc32($record->get_type());
    $createdOn = $record->get_creation_date()->format('U');

    $all_datas = array();

    foreach ($record->get_caption()->get_fields(null, true) as $field) {
        if ( ! $field->is_indexable()) {
            continue;
        }

        $all_datas[] = $field->get_serialized_values();

        foreach ($field->get_values() as $value) {
            $fieldId = $value->getDatabox_field()->get_id();
            $business = (int) $value->getDatabox_field()->isBusiness();

            // Text goes through PDO::quote(): replacing ' by \' alone left
            // backslashes unescaped and allowed breaking out of the literal.
            $columns = array(
                $conn->quote((string) $value->getId()),
                $conn->quote((string) $value->getValue()),
                $conn->quote((string) $fieldId),
                $recordId,
                $sbasId,
                $collId,
                $grouping,
                crc32($sbasId . '_' . $fieldId),
                $crcColl,
                $crcRecord,
                $crcType,
                0,
                $business,
                crc32($collId . '_' . $business),
                $createdOn,
            );

            $conn->exec('REPLACE INTO metas_realtime' . $crcDatabox . ' VALUES (' . implode(', ', $columns) . ')');
        }
    }

    $columns = array(
        $conn->quote((string) $recordId),
        $conn->quote(implode(' ', $all_datas)),
        $recordId,
        $sbasId,
        $collId,
        $grouping,
        $crcColl,
        $crcRecord,
        $crcType,
        0,
        $createdOn,
    );

    $conn->exec('REPLACE INTO docs_realtime' . $crcDatabox . ' VALUES (' . implode(', ', $columns) . ')');

    return $this;
}
/**
 * Removes a record from the indexes: every field value and the record
 * itself are flagged "deleted" in the plain and stemmed indexes, and the
 * matching rows are deleted from the real-time indexes.
 *
 * @param  \record_adapter  $record
 * @return SphinxSearch
 * @throws RuntimeException When the real-time connection is not available
 */
public function removeRecord(\record_adapter $record)
{
    if (null === $this->rt_conn) {
        throw new RuntimeException('Unable to connect to sphinx rt');
    }

    $CRCdatabox = $this->CRCdatabox($record->get_databox());
    $recordId = (int) $record->get_record_id();

    $metaIndexes = array(
        "metadatas" . $CRCdatabox,
        "metadatas" . $CRCdatabox . "_stemmed_en",
        "metadatas" . $CRCdatabox . "_stemmed_fr",
    );

    foreach ($record->get_caption()->get_fields(null, true) as $field) {
        foreach ($field->get_values() as $value) {
            $valueId = (int) $value->getId();

            foreach ($metaIndexes as $index) {
                $this->sphinx->UpdateAttributes($index, array("deleted"), array($valueId => array(1)));
            }

            $this->rt_conn->exec("DELETE FROM metas_realtime" . $CRCdatabox . " WHERE id = " . $valueId);
        }
    }

    $docIndexes = array(
        "documents" . $CRCdatabox,
        "documents" . $CRCdatabox . "_stemmed_fr",
        "documents" . $CRCdatabox . "_stemmed_en"
    );

    foreach ($docIndexes as $index) {
        $this->sphinx->UpdateAttributes($index, array("deleted"), array($recordId => array(1)));
    }

    $this->rt_conn->exec("DELETE FROM docs_realtime" . $CRCdatabox . " WHERE id = " . $recordId);

    return $this;
}
/**
 * Updates a record by removing it from the indexes and adding it again.
 *
 * @param  \record_adapter $record
 * @return SphinxSearch
 */
public function updateRecord(\record_adapter $record)
{
    $this->removeRecord($record);
    $this->addRecord($record);

    return $this;
}
/**
 * Stories are indexed exactly like records.
 *
 * @param  \record_adapter $record
 * @return SphinxSearch
 */
public function addStory(\record_adapter $record)
{
    $this->addRecord($record);

    return $this;
}
/**
 * Stories are removed exactly like records.
 *
 * @param  \record_adapter $record
 * @return SphinxSearch
 */
public function removeStory(\record_adapter $record)
{
    $this->removeRecord($record);

    return $this;
}
/**
 * Stories are updated exactly like records.
 *
 * @param  \record_adapter $record
 * @return SphinxSearch
 */
public function updateStory(\record_adapter $record)
{
    $this->updateRecord($record);

    return $this;
}
/**
 * Feed entries can not be indexed by this engine.
 *
 * @param  \Feed_Entry_Adapter $entry
 * @throws RuntimeException    Always
 */
public function addFeedEntry(\Feed_Entry_Adapter $entry)
{
    $message = 'Feed Entry indexing not supported by Sphinx Search Engine';

    throw new RuntimeException($message);
}
/**
 * Feed entries can not be indexed by this engine.
 *
 * @param  \Feed_Entry_Adapter $entry
 * @throws RuntimeException    Always
 */
public function removeFeedEntry(\Feed_Entry_Adapter $entry)
{
    $message = 'Feed Entry indexing not supported by Sphinx Search Engine';

    throw new RuntimeException($message);
}
/**
 * Feed entries can not be indexed by this engine.
 *
 * @param  \Feed_Entry_Adapter $entry
 * @throws RuntimeException    Always
 */
public function updateFeedEntry(\Feed_Entry_Adapter $entry)
{
    $message = 'Feed Entry indexing not supported by Sphinx Search Engine';

    throw new RuntimeException($message);
}
/**
* Stores the options, then hands them to applyOptions().
*
* @param SearchEngineOptions $options
*/
public function setOptions(SearchEngineOptions $options)
{
$this->options = $options;
// NOTE(review): applyOptions() presumably translates the options into
// Sphinx client filters — confirm against its implementation
$this->applyOptions($options);
}
/**
 * Replaces the options with fresh ones, then resets the Sphinx client.
 */
public function resetOptions()
{
    $fresh = new SearchEngineOptions();
    $this->options = $fresh;

    $this->resetSphinx();
}
/**
 * Clear the per-query state of the Sphinx client: group-by, filters and
 * overrides, in that order.
 */
private function resetSphinx()
{
    $client = $this->sphinx;

    $client->ResetGroupBy();
    $client->ResetFilters();
    $client->ResetOverrides();
}
/**
 * Run a search against Sphinx and wrap the outcome in a SearchEngineResult.
 *
 * The query is first normalised by parseQuery(). A "recordid=N" expression
 * is turned into a filter on the "record_id" attribute and the text query
 * is then emptied.
 *
 * @param string $query   The user query
 * @param int    $offset  Offset of the first result, must be >= 0
 * @param int    $perPage Number of results requested
 *
 * @return SearchEngineResult
 */
public function query($query, $offset, $perPage)
{
    assert(is_int($offset));
    assert($offset >= 0);
    assert(is_int($perPage));

    $query = $this->parseQuery($query);

    $preg = preg_match('/\s?recordid\s?=\s?([0-9]+)/i', $query, $matches, 0, 0);

    if ($preg > 0) {
        $this->sphinx->SetFilter('record_id', array($matches[1]));
        $query = '';
    }

    $this->sphinx->SetLimits($offset, $perPage);
    $this->sphinx->SetMatchMode(SPH_MATCH_EXTENDED2);

    $index = $this->getQueryIndex($query);
    $res = $this->sphinx->Query($query, $index);

    $results = new ArrayCollection();

    if ($res === false) {
        if ($this->sphinx->IsConnectError() === true) {
            $error = _('Sphinx server is offline');
        } else {
            $error = $this->sphinx->GetLastError();
        }
        $warning = $this->sphinx->GetLastWarning();

        $total = $available = $duration = 0;
        // Same type as in the success branch, where getSuggestions()
        // returns an ArrayCollection: consumers of the result always get
        // a collection, whether the search succeeded or not.
        $suggestions = new ArrayCollection();
        $propositions = array();
    } else {
        $error = $res['error'];
        $warning = $res['warning'];

        $duration = $res['time'];
        $total = $res['total_found'];
        $available = $res['total'];

        $resultOffset = $offset;

        if (isset($res['matches'])) {
            foreach ($res['matches'] as $record_id => $match) {
                try {
                    $record =
                        new \record_adapter(
                            $match['attrs']['sbas_id']
                            , $match['attrs']['record_id']
                            , $resultOffset
                    );

                    $results->add($record);
                } catch (Exception $e) {
                    // Best effort: a match whose record cannot be
                    // instantiated is skipped.
                }
                // The offset advances even for skipped matches, so each
                // record keeps the position Sphinx gave it.
                $resultOffset ++;
            }
        }

        $suggestions = $this->getSuggestions($query);
        $propositions = array();
    }

    return new SearchEngineResult($results, $query, $duration, $offset, $available, $total, $error, $warning, $suggestions, $propositions, $index);
}
/**
 * Return suggestions for the last space-separated token of a query, once
 * the query has been through cleanupQuery().
 *
 * @param string $query
 * @return ArrayCollection An array collection of SearchEngineSuggestion
 */
public function autocomplete($query)
{
    $cleaned = $this->cleanupQuery($query);
    $tokens = explode(" ", $cleaned);

    // explode() always yields at least one element, so there is a last one.
    $lastToken = end($tokens);

    return $this->getSuggestions($lastToken);
}
/**
 * Build highlighted excerpts of the given fields for a query.
 *
 * The index used for highlighting is picked like the search index: the
 * "metadatas" index of the record databox when the options restrict the
 * search to fields or enable business fields, its "documents" index
 * otherwise. The stemmed variant is used only when stemming is enabled
 * AND a locale is set, so that an index name ending with "_stemmed_" and
 * no locale is never produced.
 *
 * @param string          $query  The query whose terms are highlighted
 * @param array           $fields Map of key => array with a 'value' entry
 * @param \record_adapter $record The record the fields belong to
 *
 * @return mixed The result of the Sphinx BuildExcerpts call
 */
public function excerpt($query, $fields, \record_adapter $record)
{
    $searchOnMetas = $this->options->fields() || $this->options->businessFieldsOn();

    $index = ($searchOnMetas ? 'metadatas' : 'documents') . $this->CRCdatabox($record->get_databox());

    if ($this->options->stemmed() && $this->options->getLocale()) {
        $index .= '_stemmed_' . $this->options->getLocale();
    }

    $opts = array(
        'before_match' => "<em>",
        'after_match'  => "</em>",
    );

    // Only the raw values are sent to Sphinx; keys are preserved.
    $fields_to_send = array();

    foreach ($fields as $k => $f) {
        $fields_to_send[$k] = $f['value'];
    }

    return $this->sphinx->BuildExcerpts($fields_to_send, $index, $query, $opts);
}
/**
 * No-op for this engine: nothing is reset.
 *
 * @return SphinxSearch The engine itself
 */
public function resetCache()
{
    // Nothing to reset here.
    return $this;
}
/**
 * Reset the Sphinx client and translate the options into Sphinx settings.
 *
 * Only filters and group-by are applied: the client group-by, filters and
 * overrides are cleared first (see resetSphinx()), then one filter is set
 * per option and the results are grouped on "crc_sbas_record" with a sort
 * clause derived from the sort options.
 *
 * @param SearchEngineOptions $options
 * @return SphinxSearch
 */
protected function applyOptions(SearchEngineOptions $options)
{
$this->resetSphinx();
// Restrict to the selected collections; each one is identified by the
// crc32 of "<sbas_id>_<coll_id>".
$filters = array();
foreach ($options->collections() as $collection) {
$filters[] = crc32($collection->get_databox()->get_sbas_id() . '_' . $collection->get_coll_id());
}
$this->sphinx->SetFilter('crc_sbas_coll', $filters);
// Skip entries flagged as deleted (see removeRecord()).
$this->sphinx->SetFilter('deleted', array(0));
// NOTE(review): the search type is matched against "parent_record_id";
// presumably this separates records from stories - confirm.
$this->sphinx->SetFilter('parent_record_id', array($options->searchType()));
// Restrict to the selected fields, identified by the crc32 of
// "<sbas_id>_<field_id>".
if ($options->fields()) {
$filters = array();
foreach ($options->fields() as $field) {
$filters[] = crc32($field->get_databox()->get_sbas_id() . '_' . $field->get_id());
}
$this->sphinx->SetFilter('crc_struct_id', $filters);
}
if ($options->businessFieldsOn()) {
// Collections with business fields enabled accept both the "_1" and
// the "_0" keys. NOTE(review): presumably "_1" marks business field
// values and "_0" regular ones - confirm against the indexer.
$crc_coll_business = array();
foreach ($options->businessFieldsOn() as $collection) {
$crc_coll_business[] = crc32($collection->get_coll_id() . '_1');
$crc_coll_business[] = crc32($collection->get_coll_id() . '_0');
}
// Every other searched collection only accepts the "_0" key.
$non_business = array();
foreach ($options->collections() as $collection) {
foreach ($options->businessFieldsOn() as $BFcollection) {
if ($collection->get_base_id() == $BFcollection->get_base_id()) {
// Already handled above: skip to the next searched collection.
continue 2;
}
}
$non_business[] = $collection;
}
foreach ($non_business as $collection) {
$crc_coll_business[] = crc32($collection->get_coll_id() . '_0');
}
$this->sphinx->SetFilter('crc_coll_business', $crc_coll_business);
} elseif ($options->fields()) {
// Field search without business fields: keep "business" = 0 only.
$this->sphinx->SetFilter('business', array(0));
}
/**
 * @todo : enhance : check status in a better way
 */
foreach ($options->databoxes() as $databox) {
$status_opts = $options->getStatus();
foreach ($databox->get_statusbits() as $n => $status) {
// Only status bits that have an option for this databox are filtered.
if ( ! array_key_exists($n, $status_opts))
continue;
if ( ! array_key_exists($databox->get_sbas_id(), $status_opts[$n]))
continue;
$crc = crc32($databox->get_sbas_id() . '_' . $n);
// The third argument of SetFilter is the "exclude" flag: when the option
// is '0' the entries carrying this status key are excluded, otherwise
// they are required.
$this->sphinx->SetFilter('status', array($crc), ($status_opts[$n][$databox->get_sbas_id()] == '0'));
}
}
if ($options->getRecordType()) {
$this->sphinx->SetFilter('crc_type', array(crc32($options->getRecordType())));
}
// Sort direction: anything but SORT_MODE_ASC means descending.
$order = '';
switch ($options->sortOrder()) {
case SearchEngineOptions::SORT_MODE_ASC:
$order = 'ASC';
break;
case SearchEngineOptions::SORT_MODE_DESC:
default:
$order = 'DESC';
break;
}
// Sort criterion: relevance (then creation date) is the fallback for any
// value that is neither SORT_RANDOM nor SORT_CREATED_ON.
switch ($options->sortBy()) {
case SearchEngineOptions::SORT_RANDOM:
$sort = '@random';
break;
case SearchEngineOptions::SORT_RELEVANCE:
default:
$sort = '@relevance ' . $order . ', created_on ' . $order;
break;
case SearchEngineOptions::SORT_CREATED_ON:
$sort = 'created_on ' . $order;
break;
}
// Group on "crc_sbas_record"; the sort clause orders the groups.
// NOTE(review): presumably this yields one result per record even when
// several field values match - confirm against the index definition.
$this->sphinx->SetGroupBy('crc_sbas_record', SPH_GROUPBY_ATTR, $sort);
return $this;
}
/**
 * Return the integer key used to name the Sphinx indexes of a databox.
 *
 * The key is the crc32 of "host_port_user_dbname", in which every dot and
 * percent sign has been replaced by an underscore.
 *
 * @param \databox $databox
 * @return int
 */
private function CRCdatabox(\databox $databox)
{
    $identity = sprintf('%s_%s_%s_%s', $databox->get_host(), $databox->get_port(), $databox->get_user(), $databox->get_dbname());

    $normalized = str_replace(array('.', '%'), '_', $identity);

    return crc32($normalized);
}
/**
 * Remove all keywords, operators, quotes from a query string
 *
 * Quotes, parentheses, underscores, hyphens and plus signs are replaced by
 * a space wherever they appear. The keywords (all, last, et, ou, sauf,
 * and, or, except, in, dans) are replaced by a space only when they stand
 * as whole words: a keyword glued to a letter or a digit is part of a
 * regular word ("portrait", "landscape", "internet") and is left alone.
 * As before, keyword matching is case sensitive.
 *
 * @param string $query
 * @return string
 */
private function cleanupQuery($query)
{
    $query = str_replace(array("'", '"', "(", ")", "_", "-", "+"), ' ', $query);

    $keywords = '(?<![\p{L}\p{N}])(?:all|last|et|ou|sauf|and|or|except|in|dans)(?![\p{L}\p{N}])';

    $cleaned = preg_replace('/' . $keywords . '/u', ' ', $query);

    if (null === $cleaned) {
        // The "u" modifier makes preg_replace fail on invalid UTF-8: retry
        // without it rather than losing the query.
        $cleaned = preg_replace('/' . $keywords . '/', ' ', $query);
    }

    return null === $cleaned ? $query : $cleaned;
}
/**
 * Return a collection of suggestion corresponding a query
 *
 * The query is split into distinct words; when the enchant extension is
 * available and a locale is set, each word gets a list of alternate
 * versions made of the word itself, the spell-checker suggestions (for
 * misspelled words only) and the trigram suggestions. Every combination
 * of replacements is then queried against Sphinx and the ones that return
 * results are kept, sorted by decreasing number of hits. Suggestions with
 * fewer hits than 1% of the best one are dropped.
 *
 * @param string $query
 * @return ArrayCollection An array collection of SearchEngineSuggestion
 */
private function getSuggestions($query)
{
    // First we split the query into simple words
    $words = array();

    foreach (explode(" ", $this->cleanupQuery(mb_strtolower($query))) as $word) {
        if (trim($word) === '') {
            continue;
        }
        $words[] = $word;
    }

    $words = array_unique($words);

    $altVersions = array();

    // As we got words, we look for alternate word for each of them
    if (function_exists('enchant_broker_init') && $this->options->getLocale()) {
        $broker = enchant_broker_init();
        if (enchant_broker_dict_exists($broker, $this->options->getLocale())) {
            $dictionnary = enchant_broker_request_dict($broker, $this->options->getLocale());

            foreach ($words as $word) {
                // Start from the word itself for every word, so that a
                // correctly spelled word never inherits the suggestions of
                // the previous one (nor reads an undefined variable).
                $suggs = array($word);

                if (enchant_dict_check($dictionnary, $word) == false) {
                    $spelling = enchant_dict_suggest($dictionnary, $word);

                    if (is_array($spelling)) {
                        $suggs = array_merge($suggs, $spelling);
                    }
                }

                $altVersions[$word] = array_unique($suggs);
            }
            enchant_broker_free_dict($dictionnary);
        }
        enchant_broker_free($broker);
    }

    /**
     * @todo enhance the trigramm query, as it could be sent in one batch
     */
    foreach ($altVersions as $word => $versions) {
        $altVersions[$word] = array_unique(array_merge($versions, $this->get_sugg_trigrams($word)));
    }

    // We now build an array of all possibilities based on the original query
    $queries = array($query);

    foreach ($altVersions as $word => $versions) {
        $tmp_queries = array();
        foreach ($versions as $version) {
            foreach ($queries as $alt_query) {
                $tmp_queries[] = $alt_query;
                $tmp_queries[] = str_replace($word, $version, $alt_query);
            }
            $tmp_queries[] = str_replace($word, $version, $query);
        }
        $queries = array_unique(array_merge($queries, $tmp_queries));
    }

    // Keep the alternatives that actually return results
    $suggestions = array();
    $max_results = 0;

    foreach ($queries as $alt_query) {
        $results = $this->sphinx->Query($alt_query, $this->getQueryIndex($alt_query));
        if ($results !== false && isset($results['total_found'])) {
            if ($results['total_found'] > 0) {
                $max_results = max($max_results, (int) $results['total_found']);
                $suggestions[] = new SearchEngineSuggestion($query, $alt_query, (int) $results['total_found']);
            }
        }
    }

    // __CLASS__ instead of the 'self' string callable, which is deprecated
    // as of PHP 8.2; both designate this class.
    usort($suggestions, array(__CLASS__, 'suggestionsHitSorter'));

    $tmpSuggestions = new ArrayCollection();
    foreach ($suggestions as $suggestion) {
        // Drop suggestions with fewer hits than 1% of the best one
        if ($suggestion->hits() < ($max_results / 100)) {
            continue;
        }
        $tmpSuggestions->add($suggestion);
    }

    return $tmpSuggestions;
}
/**
 * usort() callback ordering suggestions by decreasing number of hits.
 *
 * @param SearchEngineSuggestion $a
 * @param SearchEngineSuggestion $b
 * @return int -1 when $a has more hits, 1 when it has fewer, 0 on a tie
 */
private static function suggestionsHitSorter(SearchEngineSuggestion $a, SearchEngineSuggestion $b)
{
    $hitsA = $a->hits();
    $hitsB = $b->hits();

    if ($hitsA > $hitsB) {
        return -1;
    }

    if ($hitsA == $hitsB) {
        return 0;
    }

    return 1;
}
/**
 * Split a keyword into its space-separated trigrams.
 *
 * The keyword is padded with two underscores on each side, then every
 * run of three consecutive bytes is emitted, each followed by a space.
 *
 * @param string $keyword
 * @return string
 */
private function BuildTrigrams($keyword)
{
    $padded = "__" . $keyword . "__";
    $lastStart = strlen($padded) - 3;

    $pieces = array();
    for ($start = 0; $start <= $lastStart; $start ++ ) {
        $pieces[] = substr($padded, $start, 3);
    }

    // The padding guarantees at least two trigrams, so the trailing space
    // is always present.
    return implode(" ", $pieces) . " ";
}
/**
 * Return the keywords of the "suggest" indexes that look like a word.
 *
 * The word trigrams are sent as a quorum query ("..."/1) against the
 * suggest index of every searched databox; at most 10 keywords whose
 * "len" attribute is between the word length - 2 and + 4 are returned,
 * best weight first.
 *
 * The Sphinx client is shared with the regular searches, so the current
 * options are re-applied before returning, whatever the outcome of the
 * lookup; otherwise the "len" range filter would leak into the queries
 * that follow.
 * NOTE(review): the ranking mode, sort mode and limits set here are not
 * restored by applyOptions() - confirm this is harmless for the callers.
 *
 * @param string $word
 * @return array The suggested keywords, empty when the lookup failed
 */
private function get_sugg_trigrams($word)
{
    $trigrams = $this->BuildTrigrams($word);
    $query = "\"$trigrams\"/1";
    $len = strlen($word);

    $this->resetSphinx();

    $this->sphinx->SetMatchMode(SPH_MATCH_EXTENDED2);
    $this->sphinx->SetRankingMode(SPH_RANK_WORDCOUNT);
    $this->sphinx->SetFilterRange("len", $len - 2, $len + 4);

    $this->sphinx->SetSortMode(SPH_SORT_EXTENDED, "@weight DESC");
    $this->sphinx->SetLimits(0, 10);

    $indexes = array();

    foreach ($this->options->databoxes() as $databox) {
        $indexes[] = 'suggest' . $this->CRCdatabox($databox);
    }

    $index = implode(',', $indexes);
    $res = $this->sphinx->Query($query, $index);

    $words = array();

    if ($this->sphinx->Status() !== false && $res && isset($res["matches"])) {
        foreach ($res["matches"] as $match) {
            $words[] = $match['attrs']['keyword'];
        }
    }

    // Always hand the client back configured for regular searches;
    // applyOptions() starts by clearing group-by, filters and overrides.
    $this->applyOptions($this->options);

    return $words;
}
/**
 * Return the comma separated list of Sphinx indexes a query must run on.
 *
 * For every searched databox, the list holds its "metadatas" index (when
 * the options restrict the search to fields or enable business fields)
 * or its "documents" index (otherwise), followed by the matching realtime
 * indexes ("metas_realtime" / "docs_realtime"). The stemmed variant of
 * the main index is used when the query is not empty, stemming is
 * enabled and a locale is set. Without any databox, "*" is returned.
 *
 * The list is built from clean index names only: no leading separator
 * and no "*" glued to the first name.
 *
 * @param string $query
 * @return string
 */
private function getQueryIndex($query)
{
    $index_keys = array();

    foreach ($this->options->databoxes() as $databox) {
        $index_keys[] = $this->CRCdatabox($databox);
    }

    if (count($index_keys) === 0) {
        return '*';
    }

    if ($this->options->fields() || $this->options->businessFieldsOn()) {
        $main = 'metadatas';
        $realtime = 'metas_realtime';
    } else {
        $main = 'documents';
        $realtime = 'docs_realtime';
    }

    $suffix = '';

    if ($query !== '' && $this->options->stemmed() && $this->options->getLocale()) {
        $suffix = '_stemmed_' . $this->options->getLocale();
    }

    $indexes = array();

    foreach ($index_keys as $key) {
        $indexes[] = $main . $key . $suffix;
    }

    // Realtime indexes have no stemmed variant
    foreach ($index_keys as $key) {
        $indexes[] = $realtime . $key;
    }

    return implode(', ', $indexes);
}
/**
 * Normalise a user query before it is sent to Sphinx.
 *
 * - surrounding whitespace and enclosing parentheses are stripped;
 * - the query "all" becomes the empty query;
 * - runs of spaces are collapsed into a single space;
 * - a hyphen glued to the preceding character is replaced by a space,
 *   while a hyphen at the start or after a space is kept;
 * - " ou " / " or " become "|", " sauf " / " except " become " -" and
 *   " and " / " et " become " +" (case insensitive).
 *
 * @param string $query
 * @return string
 */
private function parseQuery($query)
{
    $query = trim($query);

    // Both parentheses are single-byte characters: cutting one byte on
    // each side is safe whatever multibyte characters lie in between.
    while (substr($query, 0, 1) === '(' && substr($query, -1) === ')') {
        $query = (string) substr($query, 1, -1);
    }

    if ($query === 'all') {
        $query = '';
    }

    // Collapse every run of two or more spaces in one pass
    $query = preg_replace('/ {2,}/', ' ', $query);

    $offset = 0;
    while (false !== $pos = mb_strpos($query, '-', $offset)) {
        $offset = $pos + 1;
        if ($pos === 0) {
            continue;
        }
        // A hyphen inside a word is not an exclusion operator
        if (mb_substr($query, ($pos - 1), 1) !== ' ') {
            $query = mb_substr($query, 0, ($pos)) . ' ' . mb_substr($query, $pos + 1);
        }
    }

    $query = str_ireplace(array(' ou ', ' or '), '|', $query);
    $query = str_ireplace(array(' sauf ', ' except '), ' -', $query);
    $query = str_ireplace(array(' and ', ' et '), ' +', $query);

    return $query;
}
}