mirror of
https://github.com/alchemy-fr/Phraseanet.git
synced 2025-10-18 07:23:13 +00:00
Merge pull request #4305 from alchemy-fr/PHRAS-3805_translator
PHRAS-3805_translator
This commit is contained in:
@@ -55,6 +55,7 @@ use Alchemy\Phrasea\Command\Task\TaskStart;
|
||||
use Alchemy\Phrasea\Command\Task\TaskState;
|
||||
use Alchemy\Phrasea\Command\Task\TaskStop;
|
||||
use Alchemy\Phrasea\Command\Thesaurus\FindConceptsCommand;
|
||||
use Alchemy\Phrasea\Command\Thesaurus\Translator\TranslateCommand;
|
||||
use Alchemy\Phrasea\Command\UpgradeDBDatas;
|
||||
use Alchemy\Phrasea\Command\User\UserApplicationsCommand;
|
||||
use Alchemy\Phrasea\Command\User\UserCreateCommand;
|
||||
@@ -172,6 +173,7 @@ $cli->command(new IndexPopulateCommand());
|
||||
$cli->command(new QueryParseCommand());
|
||||
$cli->command(new QuerySampleCommand());
|
||||
$cli->command(new FindConceptsCommand());
|
||||
$cli->command(new TranslateCommand());
|
||||
|
||||
$cli->command(new WorkerExecuteCommand());
|
||||
$cli->command(new WorkerHeartbeatCommand());
|
||||
|
@@ -0,0 +1,8 @@
|
||||
<?php
|
||||
|
||||
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
|
||||
|
||||
/**
 * Signals that the translator configuration could not be used.
 *
 * Thrown by GlobalConfiguration::create() with a
 * "missing or bad configuration (...)" message.
 */
class ConfigurationException extends \Exception
{
}
|
@@ -0,0 +1,127 @@
|
||||
<?php
|
||||
|
||||
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
|
||||
|
||||
use appbox;
|
||||
use collection;
|
||||
use databox;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Yaml\Yaml;
|
||||
use Unicode;
|
||||
|
||||
/**
 * Translator settings, loaded from "<root>/config/translator/configuration.yml".
 *
 * Holds the jobs declared under the "translator.jobs" key of the file and a
 * lookup table of every databox / collection of the appbox, reachable either
 * by id or by name.
 */
class GlobalConfiguration
{
    const CONFIG_DIR = "/config/translator/";
    const CONFIG_FILE = "configuration.yml";

    /** @var array|null the "translator" section of the configuration file */
    private $configuration = null;

    /** @var Job[] jobs, keyed by job name */
    private $jobs = [];

    /**
     * @var array databoxes keyed by sbas_id AND by dbname; each entry is
     *            ['dbox' => databox, 'collections' => collection[] keyed by coll_id AND by name]
     */
    private $databoxes = [];

    /**
     * @var bool
     */
    private $dryRun;

    /**
     * @var string
     */
    private $reportFormat;

    /**
     * Indexes the databoxes / collections of the appbox, then builds one Job per configured job.
     *
     * @param appbox          $appBox
     * @param Unicode         $unicode      passed through to every Job
     * @param array           $global_conf  the "translator" section of the configuration file
     * @param bool            $dryRun
     * @param string          $reportFormat
     * @param OutputInterface $output       passed through to every Job
     */
    private function __construct($appBox, Unicode $unicode, $global_conf, bool $dryRun, string $reportFormat, OutputInterface $output)
    {
        $this->configuration = $global_conf;
        $this->dryRun = $dryRun;
        $this->reportFormat = $reportFormat;

        // list databoxes and collections to access by id or by name.
        // Plain copies are used (no PHP references): with references, assigning a later
        // databox to a key that was already a name-alias would overwrite the aliased entry too.
        $databoxesById = [];
        $databoxesByName = [];
        foreach ($appBox->get_databoxes() as $databox) {
            $collectionsById = [];
            $collectionsByName = [];
            foreach ($databox->get_collections() as $collection) {
                $collectionsById[$collection->get_coll_id()] = $collection;
                $collectionsByName[$collection->get_name()] = $collection;
            }

            $entry = [
                'dbox'        => $databox,
                // array union: on a key collision, the id wins over the name
                'collections' => $collectionsById + $collectionsByName
            ];
            $databoxesById[$databox->get_sbas_id()] = $entry;
            $databoxesByName[$databox->get_dbname()] = $entry;
        }
        $this->databoxes = $databoxesById + $databoxesByName;

        // a missing / empty "jobs" section simply means "nothing to play"
        $jobs = (is_array($global_conf) && isset($global_conf['jobs']) && is_array($global_conf['jobs']))
            ? $global_conf['jobs']
            : [];
        foreach ($jobs as $job_name => $job_conf) {
            $this->jobs[$job_name] = new Job($this, $job_conf, $unicode, $output);
        }
    }

    /**
     * Loads the configuration file found under $root and builds the configuration.
     *
     * @param appbox          $appBox
     * @param Unicode         $unicode
     * @param string          $root          phraseanet root path (CONFIG_DIR is appended to it)
     * @param bool            $dryRun        when true, jobs only report what they would do
     * @param string          $reportFormat
     * @param OutputInterface $output
     * @return GlobalConfiguration
     * @throws ConfigurationException  if the file is missing, unreadable, not valid yaml
     *                                 or does not have the expected structure
     */
    public static function create(appbox $appBox, Unicode $unicode, string $root, bool $dryRun, string $reportFormat, OutputInterface $output): GlobalConfiguration
    {
        try {
            $global_conf = self::loadConfiguration($root);

            return new self($appBox, $unicode, $global_conf, $dryRun, $reportFormat, $output);
        }
        catch (ConfigurationException $e) {
            // already explicit, do not wrap twice
            throw $e;
        }
        catch (\Exception $e) {
            // keep the original exception as "previous" so the root cause is not lost
            throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage()), 0, $e);
        }
    }

    /**
     * Reads and validates the configuration file, returns its "translator" section.
     *
     * @param string $root
     * @return array  the "translator" section, with a "jobs" key that is always an array
     * @throws ConfigurationException
     */
    private static function loadConfiguration(string $root): array
    {
        $config_dir = $root . self::CONFIG_DIR;
        $config_file = $config_dir . self::CONFIG_FILE;

        // best effort: make sure the directory exists (a failure here is reported just below,
        // since the file will not be found)
        if (!is_dir($config_dir)) {
            @mkdir($config_dir, 0777, true);
        }

        if (!is_file($config_file) || !is_readable($config_file)) {
            throw new ConfigurationException(
                sprintf("missing or bad configuration (file \"%s\" not found or not readable)", $config_file)
            );
        }

        $yaml = file_get_contents($config_file);
        if ($yaml === false) {
            throw new ConfigurationException(
                sprintf("missing or bad configuration (can't read file \"%s\")", $config_file)
            );
        }

        $config = Yaml::parse($yaml);
        if (!is_array($config) || !isset($config['translator']) || !is_array($config['translator'])) {
            throw new ConfigurationException(
                sprintf("missing or bad configuration (no \"translator\" section in \"%s\")", $config_file)
            );
        }

        $translator = $config['translator'];
        $jobs = $translator['jobs'] ?? [];
        if (!is_array($jobs)) {
            throw new ConfigurationException("missing or bad configuration (\"jobs\" must be a list of named jobs)");
        }
        foreach ($jobs as $job_name => $job_conf) {
            if (!is_array($job_conf)) {
                throw new ConfigurationException(
                    sprintf("missing or bad configuration (job \"%s\" has no settings)", $job_name)
                );
            }
        }
        $translator['jobs'] = $jobs;

        return $translator;
    }

    /**
     * @return Job[]  jobs keyed by name, in configuration order
     */
    public function getJobs()
    {
        return $this->jobs;
    }

    /**
     * @param string|int $sbasIdOrName
     * @return databox|null  null if no databox has this id or name
     */
    public function getDatabox($sbasIdOrName)
    {
        return $this->databoxes[$sbasIdOrName]['dbox'] ?? null;
    }

    /**
     * @param string|int $sbasIdOrName
     * @param string|int $collIdOrName
     * @return collection|null  null if the databox or the collection is unknown
     */
    public function getCollection($sbasIdOrName, $collIdOrName)
    {
        return $this->databoxes[$sbasIdOrName]['collections'][$collIdOrName] ?? null;
    }

    /**
     * @return bool
     */
    public function isDryRun(): bool
    {
        return $this->dryRun;
    }

    /**
     * @return string
     */
    public function getReportFormat(): string
    {
        return $this->reportFormat;
    }
}
|
556
lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php
Normal file
556
lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php
Normal file
@@ -0,0 +1,556 @@
|
||||
<?php
|
||||
|
||||
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
|
||||
|
||||
use databox;
|
||||
use DOMElement;
|
||||
use DOMNode;
|
||||
use DOMNodeList;
|
||||
use DOMXpath;
|
||||
use PDO;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use thesaurus_xpath;
|
||||
use Unicode;
|
||||
|
||||
/**
 * One translation job of the translator configuration.
 *
 * The constructor validates the job settings (problems are collected in getErrors())
 * and prepares the sql that selects the records to work on. run() then reads, for each
 * selected record, the values of the "source_field", looks for each value in the
 * thesaurus branch(es) linked to that field, and sends the synonyms found to the
 * "destination_fields" matching their language.
 */
class Job
{
    const NEVER_CLEANUP_SOURCE = 'never';
    const ALWAYS_CLEANUP_SOURCE = 'always';
    const CLEANUP_SOURCE_IF_TRANSLATED = 'if_translated';

    /** @var bool false only when the configuration says "active: false" */
    private $active = true;

    /** @var string[] error messages while parsing conf */
    private $errors = [];

    /** @var databox|null $databox */
    private $databox = null;

    /** @var array parameters bound to $selectRecordsSql */
    private $selectRecordParams = [];

    /** @var string|null */
    private $selectRecordsSql = null;

    /** @var array list of field ids of "source_field" (unique) and "destination_fields" (many) */
    private $selectRecordFieldIds = [];

    /**
     * @var OutputInterface
     */
    private $output;

    /** @var array infos about the "source_field" (keys: id, tbranch, lng) */
    private $source_field;

    /** @var array infos about the "destination_fields" (key=lng; values: id, name) */
    private $destination_fields;

    /**
     * @var Unicode
     */
    private $unicode;

    /** @var DOMXpath|false|thesaurus_xpath */
    private $xpathTh;

    /**
     * @var DOMNodeList
     * The thesaurus branch(es) linked to the "source_field"
     */
    private $tbranches;

    /** @var bool */
    private $cleanupDestination = false;

    /** @var string one of the *_CLEANUP_SOURCE* constants */
    private $cleanupSource = self::NEVER_CLEANUP_SOURCE;

    /**
     * @var GlobalConfiguration
     */
    private $globalConfiguration;

    /**
     * @var array
     */
    private $job_conf;

    /**
     * @var \collection|null
     */
    private $setCollection = null;

    /**
     * @var string|null  format 0xx1100xx01xxxx
     */
    private $setStatus = null;

    /**
     * @var array  for condensed report (term => count)
     */
    private $notTranslated;

    /**
     * @var array  for condensed report (term => count)
     */
    private $incompletelyTranslated;

    /**
     * @var array  for condensed report (term => count)
     */
    private $fullyTranslated;

    /**
     * @var int  for condensed report
     */
    private $recordsDone;

    /**
     * Parses and checks the job settings.
     *
     * Never throws on a bad setting: the first blocking problem is stored in the
     * errors list (see getErrors() / isValid()) and the constructor returns.
     *
     * @param GlobalConfiguration $globalConfiguration
     * @param array               $job_conf  settings of this job, as read from the configuration file
     * @param Unicode             $unicode
     * @param OutputInterface     $output    receives warnings and reports
     */
    public function __construct($globalConfiguration, $job_conf, Unicode $unicode, OutputInterface $output)
    {
        $this->globalConfiguration = $globalConfiguration;
        $this->job_conf = $job_conf;
        $this->unicode = $unicode;
        $this->output = $output;

        // an explicitly inactive job is not checked at all
        if (array_key_exists('active', $job_conf) && $job_conf['active'] === false) {
            $this->active = false;

            return;
        }

        $this->errors = [];
        foreach (['active', 'databox', 'source_field', 'destination_fields'] as $mandatory) {
            if (!isset($job_conf[$mandatory])) {
                $this->errors[] = sprintf("Missing mandatory setting (%s).", $mandatory);
            }
        }
        if (!empty($this->errors)) {
            return;
        }

        if (!($this->databox = $globalConfiguration->getDatabox($job_conf['databox']))) {
            $this->errors[] = sprintf("unknown databox (%s).", $job_conf['databox']);

            return;
        }

        if (array_key_exists('set_collection', $job_conf)) {
            if (!($this->setCollection = $globalConfiguration->getCollection($this->databox->get_sbas_id(), $job_conf['set_collection']))) {
                $this->errors[] = sprintf("unknown setCollection (%s).", $job_conf['set_collection']);

                return;
            }
        }

        if (array_key_exists('set_status', $job_conf)) {
            $this->setStatus = $job_conf['set_status'];
        }

        $cnx = $this->databox->get_connection();

        // get infos about the "source_field"
        //
        $sql = "SELECT `id`, `tbranch` FROM `metadatas_structure` WHERE `name` = :name AND `tbranch` != ''";
        $stmt = $cnx->executeQuery($sql, [':name' => $job_conf['source_field']]);
        $this->source_field = $stmt->fetch(PDO::FETCH_ASSOC);
        $stmt->closeCursor();
        if (!$this->source_field) {
            $this->errors[] = sprintf("field (%s) not found or not linked to thesaurus.", $job_conf['source_field']);

            return;
        }
        $this->source_field['lng'] = array_key_exists('source_lng', $job_conf) ? $job_conf['source_lng'] : null;
        $this->selectRecordFieldIds[] = $this->source_field['id'];

        $this->xpathTh = $this->databox->get_xpath_thesaurus();
        if (!$this->xpathTh) {
            // get_xpath_thesaurus() is documented above as possibly returning false
            $this->errors[] = sprintf("thesaurus of databox (%s) is not available.", $job_conf['databox']);

            return;
        }
        $this->tbranches = $this->xpathTh->query($this->source_field['tbranch']);
        if (!$this->tbranches || $this->tbranches->length <= 0) {
            $this->errors[] = sprintf("thesaurus branch(es) (%s) not found.", $this->source_field['tbranch']);

            return;
        }

        // get infos about the "destination_fields"
        //
        $this->destination_fields = [];
        $sql = "SELECT `id`, `name` FROM `metadatas_structure` WHERE `name` = :name ";
        $stmt = $cnx->prepare($sql);
        // (array): a single "lng:field" scalar is accepted as a one-item list
        foreach ((array)$job_conf['destination_fields'] as $tf) {
            $parts = is_string($tf) ? explode(':', $tf, 2) : [];
            if (count($parts) !== 2 || trim($parts[0]) === '' || trim($parts[1]) === '') {
                $this->output->writeln(sprintf(
                    "<warning>bad destination (%s), expected \"{lng}:{field name}\" (ignored).</warning>",
                    is_string($tf) ? $tf : gettype($tf)
                ));
                continue;
            }
            $lng = trim($parts[0]);
            $fname = trim($parts[1]);

            $stmt->execute([':name' => $fname]);
            $row = $stmt->fetch(PDO::FETCH_ASSOC);
            // close the cursor in every case, also when the field is unknown
            $stmt->closeCursor();
            if (!$row) {
                $this->output->writeln(sprintf("<warning>undefined field (%s) (ignored).</warning>", $fname));
                continue;
            }
            $this->destination_fields[$lng] = $row;
            $this->selectRecordFieldIds[] = $row['id'];
        }

        if (empty($this->destination_fields)) {
            $this->errors[] = sprintf("<warning>no \"destination_field\" found.</warning>");

            return;
        }

        // misc settings
        $this->cleanupDestination = array_key_exists('cleanup_destination', $job_conf) && $job_conf['cleanup_destination'] === true;
        if (array_key_exists('cleanup_source', $job_conf)) {
            $cleanupSource = self::normalizeCleanupSource($job_conf['cleanup_source']);
            if ($cleanupSource === null) {
                // an unknown value used to be silently treated as "never": keep that, but say it
                $this->output->writeln(sprintf(
                    "<warning>unknown \"cleanup_source\" value (%s), using \"%s\".</warning>",
                    is_scalar($job_conf['cleanup_source']) ? var_export($job_conf['cleanup_source'], true) : gettype($job_conf['cleanup_source']),
                    self::NEVER_CLEANUP_SOURCE
                ));
                $cleanupSource = self::NEVER_CLEANUP_SOURCE;
            }
            $this->cleanupSource = $cleanupSource;
        }

        // build records select sql
        //
        $selectRecordClauses = [];
        $this->selectRecordParams = [];
        if (array_key_exists('if_collection', $job_conf)) {
            if (!($coll = $globalConfiguration->getCollection($job_conf['databox'], $job_conf['if_collection']))) {
                $this->errors[] = sprintf("unknown collection (%s)", $job_conf['if_collection']);

                return;
            }
            $selectRecordClauses[] = "`coll_id` = :coll_id";
            $this->selectRecordParams[':coll_id'] = $coll->get_coll_id();
        }

        if (array_key_exists('if_status', $job_conf)) {
            $selectRecordClauses[] = "`status` & b:sb_and = b:sb_equ";
            // mask: replacements are applied in order ('0' => '1', then 'x' => '0'),
            // so every tested bit (0 or 1) becomes 1 and every 'x' becomes 0
            $this->selectRecordParams[':sb_and'] = str_replace(['0', 'x'], ['1', '0'], $job_conf['if_status']);
            // expected value: 'x' bits are 0, since they are masked out
            $this->selectRecordParams[':sb_equ'] = str_replace('x', '0', $job_conf['if_status']);
        }

        $quotedFieldIds = [];
        foreach ($this->selectRecordFieldIds as $id) {
            $quotedFieldIds[] = $cnx->quote($id);
        }
        $selectRecordClauses[] = "`meta_struct_id` IN (" . join(',', $quotedFieldIds) . ")";

        $sql = "SELECT `record_id`, `meta_struct_id`, `metadatas`.`id` AS meta_id, `value` FROM";
        $sql .= " `record` INNER JOIN `metadatas` USING(`record_id`)";
        $sql .= " WHERE " . join(" AND ", $selectRecordClauses);
        $sql .= " ORDER BY `record_id` ASC";
        $this->selectRecordsSql = $sql;
    }

    /**
     * Plays the job on every selected record, then prints the condensed report if asked.
     *
     * Does nothing for an inactive or invalid job.
     */
    public function run()
    {
        if (!$this->active || !$this->isValid() || $this->selectRecordsSql === null) {
            return;
        }

        $cnx = $this->databox->get_connection();
        $stmt = $cnx->executeQuery($this->selectRecordsSql, $this->selectRecordParams);

        $this->recordsDone = 0;
        $this->notTranslated = [];
        $this->incompletelyTranslated = [];
        $this->fullyTranslated = [];

        // one (empty) list of values per field we work on
        $emptyValues = array_fill_keys($this->selectRecordFieldIds, []);
        $metas = $emptyValues;
        $currentRid = null;     // null = no row read yet

        // rows come ordered by record_id: gather the values of a record, flush when the record changes
        while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
            if ($currentRid !== null && $row['record_id'] !== $currentRid) {
                $this->doRecord($currentRid, $metas);   // flush previous record
                $metas = $emptyValues;
            }
            $currentRid = $row['record_id'];
            $metas[$row['meta_struct_id']][$row['meta_id']] = $row['value'];
        }
        if ($currentRid !== null) {
            $this->doRecord($currentRid, $metas);       // flush last record
        }

        $stmt->closeCursor();

        // condensed report
        //
        if ($this->globalConfiguration->getReportFormat() === 'condensed') {
            $this->output->writeln(sprintf("%d records done.", $this->recordsDone));
            $this->writeCondensedList("Not translated terms:", $this->notTranslated);
            $this->writeCondensedList("Incompletely translated terms:", $this->incompletelyTranslated);
            $this->writeCondensedList("Fully translated terms:", $this->fullyTranslated);
        }
    }

    /**
     * Prints one section of the condensed report (nothing if the section is empty).
     *
     * @param string $title
     * @param array  $terms  term => number of occurrences
     */
    private function writeCondensedList($title, array $terms)
    {
        if (empty($terms)) {
            return;
        }
        ksort($terms, SORT_STRING | SORT_FLAG_CASE);
        $this->output->writeln($title);
        foreach ($terms as $term => $n) {
            $this->output->writeln(sprintf(" - \"%s\" (%d times)", $term, $n));
        }
    }

    /**
     * Translates the source values of one record and applies the result (unless dry-run).
     *
     * @param int|string $record_id
     * @param array      $metas  values of the record: field id => [meta id => value]
     */
    private function doRecord($record_id, $metas)
    {
        $reportFormat = $this->globalConfiguration->getReportFormat();
        $listTranslations = in_array($reportFormat, ['all', 'translated'], true);

        if ($reportFormat !== 'condensed') {
            $this->output->writeln(sprintf("record id: %s", $record_id));
        }

        $source_field_id = $this->source_field['id'];
        $meta_to_delete = [];       // key = id, to easily keep unique
        $meta_to_add = [];          // key = destination field id, value = list of values to add

        if ($this->cleanupDestination) {
            // every existing destination value is to be deleted, unless found again as a translation (see below)
            foreach ($this->destination_fields as $destination_field) {
                foreach ($metas[$destination_field['id']] as $meta_id => $value) {
                    $meta_to_delete[$meta_id] = $value;
                }
            }
        }

        // loop on every value of the "source_field"
        //
        foreach ($metas[$source_field_id] as $source_meta_id => $source_value) {

            $q = $this->buildSynonymsQuery($source_value);

            // loop on every tbranch (one field may be linked to many branches)
            //
            $translations = [];        // ONE translation per lng (first found in th)
            /** @var DOMNode $tbranch */
            foreach ($this->tbranches as $tbranch) {
                if (!($nodes = $this->xpathTh->query($q, $tbranch))) {
                    $this->output->writeln(sprintf(" - \"%s\" <warning>xpath error on (%s), ignored.</warning>", $source_value, $q));
                    continue;
                }

                // loop on every synonym
                //
                /** @var DOMElement $node */
                foreach ($nodes as $node) {
                    $lng = $node->getAttribute('lng');

                    // ignore synonyms not in one of the "destination_field" languages,
                    // and languages for which a translation was already found
                    //
                    if (!array_key_exists($lng, $this->destination_fields) || array_key_exists($lng, $translations)) {
                        continue;
                    }

                    $translated_value = $node->getAttribute('v');
                    $destination_field_id = $this->destination_fields[$lng]['id'];

                    // strict search: a loose one would treat numeric-looking strings ("007" / "7") as equal
                    $destination_meta_id = array_search($translated_value, $metas[$destination_field_id], true);
                    if ($destination_meta_id === false) {
                        $translations[$lng] = [
                            'val' => $translated_value,
                            'id'  => null,
                            'msg' => sprintf(" --> %s", $this->destination_fields[$lng]['name'])
                        ];
                        // two source terms may share a translation: add the value only once
                        if (!in_array($translated_value, $meta_to_add[$destination_field_id] ?? [], true)) {
                            $meta_to_add[$destination_field_id][] = $translated_value;
                        }
                    }
                    else {
                        $translations[$lng] = [
                            'val' => $translated_value,
                            'id'  => $destination_meta_id,
                            'msg' => sprintf("already in %s", $this->destination_fields[$lng]['name'])
                        ];
                        // the value is still wanted: do not delete it (it may have been listed by cleanup_destination)
                        unset($meta_to_delete[$destination_meta_id]);
                    }
                }
            }

            // report
            //
            $fullyTranslated = false;
            if (empty($translations)) {
                if ($reportFormat === 'all') {
                    $this->output->writeln(sprintf(" - \"%s\" : no translation found.", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->notTranslated);
            }
            else if (count($translations) < count($this->destination_fields)) {
                if ($listTranslations) {
                    $this->output->writeln(sprintf(" - \"%s\" : incomplete translation.", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->incompletelyTranslated);
            }
            else {
                // complete translation (all target lng)
                if ($listTranslations) {
                    $this->output->writeln(sprintf(" - \"%s\" :", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->fullyTranslated);
                $fullyTranslated = true;
            }

            if ($listTranslations) {
                foreach ($translations as $lng => $translation) {
                    $this->output->writeln(sprintf(" - [%s] \"%s\" %s", $lng, $translation['val'], $translation['msg']));
                }
            }

            // cleanup source
            //
            $cleanup = $this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE
                || ($fullyTranslated && $this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED);
            if ($cleanup && !$this->isUsedAsDestination($translations, $source_meta_id)) {
                $meta_to_delete[$source_meta_id] = $source_value;
            }
        }

        $actions = $this->buildActions($meta_to_delete, $meta_to_add);

        $jsActions = json_encode($actions, JSON_PRETTY_PRINT);
        if ($jsActions === false) {
            // e.g. a value that is not valid utf-8: do not send garbage to the record
            $this->output->writeln(sprintf("<warning>can't encode actions for record %s (%s), record ignored.</warning>", $record_id, json_last_error_msg()));

            return;
        }
        if ($this->output->getVerbosity() >= OutputInterface::VERBOSITY_VERY_VERBOSE) {
            $this->output->writeln(sprintf("<info>JS : %s</info>", $jsActions));
        }

        if (!$this->globalConfiguration->isDryRun()) {
            $record = $this->getDatabox()->getRecordRepository()->find($record_id);
            // NOTE(review): guard in case find() returns null for a record deleted meanwhile - confirm against the repository
            if (!$record) {
                $this->output->writeln(sprintf("<warning>record %s not found, ignored.</warning>", $record_id));

                return;
            }
            $record->setMetadatasByActions(json_decode($jsActions));
        }

        $this->recordsDone++;
    }

    /**
     * Builds the xpath (relative to a thesaurus branch) that selects every synonym ("sy")
     * of the concept(s) owning a synonym matching the source value.
     *
     * @param string $source_value  a term, optionally followed by a context between parentheses
     * @return string
     */
    private function buildSynonymsQuery($source_value)
    {
        list($term, $context) = $this->splitTermAndContext($source_value);

        $q = '@w=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($term)) . '\'';
        // compare to '' (not a truthiness test) so a context like "0" is not lost
        if ($context !== '') {
            $q .= ' and @k=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($context)) . '\'';
        }
        if (!is_null($this->source_field['lng'])) {
            $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->source_field['lng']) . '\'';
        }

        return '//sy[' . $q . ']/../sy';
    }

    /**
     * Tells if a translation was found as "already present" on the source value itself
     * (possible if source = destination): such a source value must NOT be deleted.
     *
     * @param array      $translations    as built by doRecord()
     * @param int|string $source_meta_id
     * @return bool
     */
    private function isUsedAsDestination(array $translations, $source_meta_id)
    {
        foreach ($translations as $translation) {
            if ($translation['id'] === $source_meta_id) {
                return true;
            }
        }

        return false;
    }

    /**
     * Builds the "actions" to apply to a record: metadatas to delete / add,
     * plus the collection and status changes asked by the job settings.
     *
     * @param array $meta_to_delete  meta id => value
     * @param array $meta_to_add     field id => list of values
     * @return array
     */
    private function buildActions(array $meta_to_delete, array $meta_to_add)
    {
        $actions = [];

        $metadatas = [];
        foreach ($meta_to_delete as $id => $value) {
            $metadatas[] = [
                'action'  => "delete",
                'meta_id' => $id,
                '_value_' => $value
            ];
        }
        foreach ($meta_to_add as $struct_id => $values) {
            $metadatas[] = [
                'action'         => "add",
                'meta_struct_id' => $struct_id,
                'value'          => $values
            ];
        }
        if (!empty($metadatas)) {
            $actions['metadatas'] = $metadatas;
        }

        if (!is_null($this->setCollection)) {
            $actions['base_id'] = $this->setCollection->get_base_id();
        }

        if (!is_null($this->setStatus)) {
            $status = [];
            // the mask is written high bit first: reverse it so the array index is the bit number
            foreach (str_split(strrev((string)$this->setStatus), 1) as $bit => $v) {
                // anything else than 0 / 1 (like 'x') means "leave this bit unchanged"
                if ($v === '0' || $v === '1') {
                    $status[] = [
                        'bit'   => $bit,
                        'state' => $v === '1'
                    ];
                }
            }
            if (!empty($status)) {
                $actions['status'] = $status;
            }
        }

        return $actions;
    }

    /**
     * Counts one occurrence of a term in a section of the condensed report
     * (no-op for the other report formats).
     *
     * @param string $term
     * @param array  $where  the section (term => count), modified in place
     */
    private function addToCondensedReport($term, &$where)
    {
        if ($this->globalConfiguration->getReportFormat() !== 'condensed') {
            return;
        }
        if (!array_key_exists($term, $where)) {
            $where[$term] = 0;
        }
        $where[$term]++;
    }

    /**
     * Splits "term (context)" into its two parts.
     *
     * The context is what follows the first "(", up to the next ")" (or up to the end
     * if there is none). Both parts are trimmed; the context is '' when there is no "(".
     *
     * @param string $word
     * @return string[]  [term, context]
     */
    private function splitTermAndContext($word)
    {
        $term = trim($word);

        // '(' and ')' are single-byte, so byte offsets are safe on utf-8 text
        $po = strpos($term, '(');
        if ($po === false) {
            return [$term, ''];
        }

        $pc = strpos($term, ')', $po);
        $context = ($pc === false)
            ? substr($term, $po + 1)
            : substr($term, $po + 1, $pc - $po - 1);

        return [trim(substr($term, 0, $po)), trim($context)];
    }

    /**
     * Normalizes a "cleanup_source" setting.
     *
     * Case and surrounding spaces are ignored, and "-" is accepted for "_"
     * (so "if-translated" is the same as "if_translated").
     *
     * @param mixed $value
     * @return string|null  one of the *_CLEANUP_SOURCE* constants, or null if the value is unknown
     */
    private static function normalizeCleanupSource($value)
    {
        if (!is_string($value)) {
            return null;
        }
        $normalized = str_replace('-', '_', strtolower(trim($value)));
        $known = [
            self::NEVER_CLEANUP_SOURCE,
            self::ALWAYS_CLEANUP_SOURCE,
            self::CLEANUP_SOURCE_IF_TRANSLATED
        ];

        return in_array($normalized, $known, true) ? $normalized : null;
    }

    /**
     * @return string[]  configuration errors found by the constructor
     */
    public function getErrors(): array
    {
        return $this->errors;
    }

    /**
     * @return bool  true if the constructor found no configuration error
     */
    public function isValid(): bool
    {
        return empty($this->errors);
    }

    /**
     * @return databox|null
     */
    public function getDatabox()
    {
        return $this->databox;
    }

    /**
     * @return bool
     */
    public function isActive(): bool
    {
        return $this->active;
    }
}
|
@@ -0,0 +1,120 @@
|
||||
<?php
|
||||
|
||||
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
|
||||
|
||||
|
||||
use Alchemy\Phrasea\Border\File;
|
||||
use Alchemy\Phrasea\Border\Manager as BorderManager;
|
||||
use Alchemy\Phrasea\Command\Command as phrCommand;
|
||||
use Alchemy\Phrasea\Model\Entities\LazaretSession;
|
||||
use collection;
|
||||
use databox;
|
||||
use Doctrine\DBAL\DBALException;
|
||||
use Doctrine\ORM\EntityManager;
|
||||
use Exception;
|
||||
use Guzzle\Http\Client as Guzzle;
|
||||
use igorw;
|
||||
use MediaVorus\MediaVorus;
|
||||
use Neutron\TemporaryFilesystem\TemporaryFilesystem;
|
||||
use PDO;
|
||||
use record_adapter;
|
||||
use Symfony\Component\Console\Formatter\OutputFormatterStyle;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Yaml\Yaml;
|
||||
|
||||
|
||||
/**
 * Console command "thesaurus:translate".
 *
 * Loads the translator configuration, then plays every valid and active job in turn.
 *
 * @license     http://opensource.org/licenses/gpl-3.0 GPLv3
 * @link        www.phraseanet.com
 */
class TranslateCommand extends phrCommand
{
    /** @var InputInterface $input */
    private $input;

    /** @var OutputInterface $output */
    private $output;

    /** @var GlobalConfiguration */
    private $config;

    /**
     * Declares the command name, its description, the --report / --dry options and the help text.
     */
    public function configure()
    {
        $help = "--report:\n - all : list every term.\n - translated : list only translated terms.\n - record : list only selected record ids.\n - condensed : count terms occurences.";

        $this->setName('thesaurus:translate')
            ->setDescription('(BETA) Translate fields values using thesaurus')
            ->addOption('report', null, InputOption::VALUE_REQUIRED, "Report output format (all|condensed|translated|record)", "all")
            ->addOption('dry', null, InputOption::VALUE_NONE, "list translations but don't apply.", null)
            ->setHelp($help)
        ;
    }

    /**
     * Checks the options, loads the configuration and runs the jobs.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     * @return int  0 when the jobs were played, 1 on a bad --report value, -1 on a configuration problem
     */
    protected function doExecute(InputInterface $input, OutputInterface $output)
    {
        // register the "warning" style, used by the messages below and by the jobs
        $output->getFormatter()->setStyle('warning', new OutputFormatterStyle('black', 'yellow'));

        // sanitize args
        $reportFormat = $input->getOption('report');
        $knownFormats = ['all', 'condensed', 'translated', 'record'];
        if (!in_array($reportFormat, $knownFormats)) {
            $output->writeln(sprintf("<error>bad --report value (%s), use all|condensed|translated|record</error>", $reportFormat));

            return 1;
        }

        $this->input = $input;
        $this->output = $output;

        // config must be ok
        //
        try {
            $this->config = GlobalConfiguration::create(
                $this->container['phraseanet.appbox'],
                $this->container['unicode'],
                $this->container['root.path'],
                $input->getOption('dry'),
                $reportFormat,
                $output
            );
        }
        catch (\Exception $e) {
            $output->writeln(sprintf("<error>missing or bad configuration: %s</error>", $e->getMessage()));

            return -1;
        }

        /**
         * @var string $jobName
         * @var Job $job
         */
        foreach ($this->config->getJobs() as $jobName => $job) {
            $output->writeln("");
            $output->writeln(sprintf("======== Playing job %s ========", $jobName));

            if (!$job->isValid()) {
                // a badly configured job is reported and skipped, the next jobs are still played
                $output->writeln("<warning>Configuration error(s)... :</warning>");
                foreach ($job->getErrors() as $err) {
                    $output->writeln(sprintf(" - %s", $err));
                }
                $output->writeln("<warning>...Job ignored</warning>");
            }
            elseif (!$job->isActive()) {
                $output->writeln("job is inactive, skipped.");
            }
            else {
                $job->run();
            }
        }

        return 0;
    }
}
|
@@ -0,0 +1,47 @@
|
||||
# <phraseanet>/config/translator/configuration.yml
|
||||
|
||||
translator:
|
||||
jobs:
|
||||
#
|
||||
# first job : translate EN keywords to FR and DE
|
||||
# then change status-bit to select records for job 2
|
||||
keywords_EN_to_FR_DE:
|
||||
active: true
|
||||
databox: my_databox
|
||||
if_collection: to_translate
|
||||
if_status: xx1xxxx
|
||||
source_field: KeywordsEN
|
||||
source_lng: en
|
||||
destination_fields:
|
||||
- fr:keywordsFR
|
||||
- de:keywordsDE
|
||||
cleanup_source: if_translated
|
||||
# job 1 cleans the destination fields
|
||||
cleanup_destination: true
|
||||
# do NOT change collection because job 2 looks here...
|
||||
#-- set_collection: online
|
||||
# ... but change status
|
||||
set_status: 010xxxx
|
||||
|
||||
#
|
||||
# second (last) job : translate EN country to FR and DE, add also as keywords !
|
||||
#
|
||||
country_EN_to_FR_DE:
|
||||
active: true
|
||||
databox: my_databox
|
||||
# same collection as job 1
|
||||
if_collection: to_translate
|
||||
# status was changed by job 1
|
||||
if_status: 010xxxx
|
||||
source_field: CountryEN
|
||||
source_lng: en
|
||||
# add translated country to the keywords
|
||||
destination_fields:
|
||||
- fr:keywordsFR
|
||||
- de:keywordsDE
|
||||
cleanup_source: if_translated
|
||||
# job 2 must NOT erase what job 1 did
|
||||
cleanup_destination: false
|
||||
# the last job will change collection
|
||||
set_collection: online
|
||||
set_status: 100xxxx
|
@@ -0,0 +1,218 @@
|
||||
# Translator
|
||||
|
||||
Translator is a console command that uses the thesaurus to translate terms from one field (source), to one or many fields (destinations).
|
||||
|
||||
It will act on records matching conditions like "from this collection" or
|
||||
"if this status-bit is 1".
|
||||
|
||||
Translator plays __jobs__ one after another; each __job__ can define its own settings.
|
||||
Jobs and settings are declared in a configuration file (yml):
|
||||
|
||||
```yaml
|
||||
# <phraseanet>/config/translator/configuration.yml
|
||||
|
||||
translator:
|
||||
jobs:
|
||||
keywords_EN_to_FR_DE:
|
||||
active: true
|
||||
databox: my_databox
|
||||
...
|
||||
country_EN_to_FR_DE:
|
||||
active: false
|
||||
...
|
||||
```
|
||||
|
||||
## Job settings:
|
||||
|
||||
- `active` : (mandatory) boolean to activate the job.
|
||||
- `databox`: (mandatory) The databox name|id to act on.
|
||||
- `if_collection`: (optional) The unique collection name|id to act on; Default if not set: All collections.
|
||||
- `if_status`: (optional) Act only on records matching this status-bits mask; Format 01x10xxxx; Default: All records.
|
||||
- `source_field`: (mandatory) The name of the source field containing terms to be translated.
|
||||
- `source_lng`: (optional) The language of the source terms to translate. If set, only terms matching this lng will be searched for in the thesaurus. Default if not set: search terms without any language criterion.
|
||||
- `destination_fields`: (mandatory) A __list__ of destinations using format `{lng}:{field name}`;
|
||||
Each translated term (from thesaurus) will be directed to the matching field, depending on its lng (see examples).
|
||||
- `cleanup_source`: (optional) Whether to remove or keep the source term, depending on whether it was successfully translated or not.
|
||||
- `never`: keep the term (default).
|
||||
- `if_translated`: remove if fully translated (all destination lngs).
|
||||
- `always`: remove the term even if it was not translated.
|
||||
- `cleanup_destination`: (optional) Empty the destination(s) field(s) before translation (default `false`)
|
||||
- `set_collection`: (optional) collection where to move the record after translation.
|
||||
- `set_status`: (optional) status-bit mask to apply on record after translation.
|
||||
|
||||
## Important:
|
||||
|
||||
#### After playing job(s), no more record must match the selection conditions `if_collection`, `if_status`.
|
||||
|
||||
- Because a job will act on __all__ records matching the `if_collection` and `if_status` conditions,
|
||||
one __should__ change the collection or sb after translation (`set_collection` and `set_status` settings).
|
||||
|
||||
|
||||
- Because each job declares its own conditions, playing multiple jobs must implement a _workflow_ mechanism:
|
||||
- job 1 selects records matching conditions A (coll/sb) and __must__ change collection and/or status to match conditions (B) of job 2.
|
||||
- job 2 selects records matching conditions B and __must__ set new final values that match neither A nor B.
|
||||
|
||||
|
||||
- Because jobs are played one after another, in case of many jobs acting on the same records, the workflow can be simplified:
|
||||
- __first__ job 1 selects records matching "work-on" conditions, and does not change anything after translation.
|
||||
- job 2 selects using the same conditions and does not change conditions either.
|
||||
- __last__ job 3 selects using the same conditions, and is responsible for changing collection and/or status when done.
|
||||
|
||||
Those rules prevent the job(s) from running multiple times on the same records. Of course care must be taken if one part of a workflow is de-activated.
|
||||
|
||||
#### Cleanup with multiple jobs.
|
||||
|
||||
- Because job n+1 is played after job n is fully completed, care must be taken when using `cleanup` options:
|
||||
- If acting on same source, `cleanup_source: always` must only be applied on __last__ job, else job 1 will remove every term that job 2 should work on.
|
||||
(This case might not happen since - thanks to multiple destinations - there is no reason to act on same source twice).
|
||||
|
||||
- Same care with multiple jobs writing on the same destination(s): `cleanup_destination: true` should be set only on the __first__ job, else job 2 will erase what job 1 has done.
|
||||
|
||||
|
||||
|
||||
## Example 1:
|
||||
### translate new records (having default sb=0).
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
example:
|
||||
active: true
|
||||
databox: my_databox
|
||||
# condition: act on new records having "translated" sb[4]=0
|
||||
if_status: 0xxxx
|
||||
# original keywords are expected to be EN
|
||||
source_field: KeywordsEN
|
||||
source_lng: en
|
||||
# translate to 2 separate fields
|
||||
destination_fields:
|
||||
- fr:keywordsFR
|
||||
- de:keywordsDE
|
||||
# keep original EN keywords
|
||||
cleanup_source: never
|
||||
# remove existing terms on destinations before translating
|
||||
cleanup_destination: true
|
||||
# end: set "translated" sb to 1
|
||||
set_status: 1xxxx
|
||||
```
|
||||
|
||||
## Example 2:
|
||||
### manually select records to translate by setting sb[4].
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
example:
|
||||
# ...
|
||||
# condition: act on records having "to translate" sb[4]=1
|
||||
if_status: x1xxxx
|
||||
# end: mark the record as "translated"
|
||||
set_status: 10xxxx
|
||||
```
|
||||
|
||||
## Example 3:
|
||||
### translate new records from temporary collection.
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
example:
|
||||
# ...
|
||||
if_collection: 'upload'
|
||||
set_collection: 'online'
|
||||
```
|
||||
|
||||
## Example 4:
|
||||
### add translations to the same field
|
||||
|
||||
__Trick__:
|
||||
If one cleans the destination field - the __same as the source__ -, the original source will be deleted.
|
||||
If the intent is to preserve the original term (adding translations), it must be added again.
|
||||
|
||||
The program will detect that the same term is to be deleted then added, and will preserve the original one.
|
||||
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
example:
|
||||
# ...
|
||||
source_field: Keywords
|
||||
source_lng: en
|
||||
# since source=destination, source will be cleaned of all not-translatable terms...
|
||||
cleanup_destination: true
|
||||
destination_fields:
|
||||
# ... this is why one must re-add the EN "translated" term (same as source)
|
||||
- en:Keywords
|
||||
- fr:Keywords
|
||||
- de:Keywords
|
||||
```
|
||||
|
||||
## Example 4-bis:
|
||||
### removing terms that are not in the thesaurus
|
||||
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
example:
|
||||
# ...
|
||||
source_field: Keywords
|
||||
source_lng: en
|
||||
cleanup_source: always
|
||||
destination_fields:
|
||||
- en:Keywords
|
||||
```
|
||||
|
||||
## Example 5:
|
||||
### merge many sources to one "tote bag"
|
||||
```yaml
|
||||
translator:
|
||||
jobs:
|
||||
keywords:
|
||||
active: true
|
||||
databox: my_databox
|
||||
# manually start condition: set sb[4]
|
||||
if_status: xxx1xxxx
|
||||
# original keywords are expected to be EN
|
||||
source_field: keywords
|
||||
source_lng: en
|
||||
# translate to a common field
|
||||
destination_fields:
|
||||
- fr:motscles
|
||||
# each job can clean its own distinct source
|
||||
cleanup_source: always
|
||||
# first job cleans up destination
|
||||
cleanup_destination: true
|
||||
# end: set ready for next job
|
||||
set_status: 0010xxxx
|
||||
country:
|
||||
active: true
|
||||
databox: my_databox
|
||||
# condition: set by previous job
|
||||
if_status: 0010xxxx
|
||||
# original country is expected to be EN
|
||||
source_field: country
|
||||
source_lng: en
|
||||
# translate to the same destination
|
||||
destination_fields:
|
||||
- fr:motscles
|
||||
# each job can clean its own distinct source
|
||||
cleanup_source: always
|
||||
# do NOT cleanup destination, first job did it
|
||||
cleanup_destination: false
|
||||
# end: set ready for next job
|
||||
set_status: 0100xxxx
|
||||
city:
|
||||
active: true
|
||||
databox: my_databox
|
||||
# condition: set by previous job
|
||||
if_status: 0100xxxx
|
||||
# original city is expected to be EN
|
||||
source_field: city
|
||||
source_lng: en
|
||||
# translate to the same field
|
||||
destination_fields:
|
||||
- fr:motscles
|
||||
# each job can clean its own distinct source
|
||||
cleanup_source: always
|
||||
# do NOT cleanup destination, first job did it
|
||||
cleanup_destination: false
|
||||
# end: set to "translated"
|
||||
set_status: 1000xxxx
|
||||
```
|
@@ -1490,7 +1490,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
foreach ($values as $value) {
|
||||
if ($value) {
|
||||
$ops[] = [
|
||||
'expain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()),
|
||||
'explain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()),
|
||||
'meta_struct_id' => $sf->get_id(),
|
||||
'meta_id' => $meta_id, // can be null
|
||||
'value' => $value
|
||||
@@ -1505,7 +1505,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
}
|
||||
if( ($value = $values[0]) ) {
|
||||
$ops[] = [
|
||||
'expain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()),
|
||||
'explain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()),
|
||||
'meta_struct_id' => $sf->get_id(),
|
||||
'meta_id' => $meta_id, // probably null,
|
||||
'value' => $value
|
||||
@@ -1522,7 +1522,6 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
* @param string[] $values
|
||||
*
|
||||
* @return array ops to execute
|
||||
* @throws Exception
|
||||
*/
|
||||
private function metadata_add($struct_fields, $values)
|
||||
{
|
||||
@@ -1531,11 +1530,12 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
// now set values to matching struct_fields
|
||||
foreach ($struct_fields as $sf) {
|
||||
if(!$sf->is_multi()) {
|
||||
throw new Exception(sprintf("can't \"add\" to mono-valued (%s).", $sf->get_name()));
|
||||
// easy support "add" on mono : join values...
|
||||
$values = [ join(' ; ', $values) ];
|
||||
}
|
||||
foreach ($values as $value) {
|
||||
$ops[] = [
|
||||
'expain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()),
|
||||
'explain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()),
|
||||
'meta_struct_id' => $sf->get_id(),
|
||||
'meta_id' => null,
|
||||
'value' => $value
|
||||
@@ -1577,7 +1577,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
}
|
||||
// then add the replacing value
|
||||
$ops[] = [
|
||||
'expain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()),
|
||||
'explain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()),
|
||||
'meta_struct_id' => $cf->get_meta_struct_id(),
|
||||
'meta_id' => null,
|
||||
'value' => $replace_with
|
||||
@@ -1590,7 +1590,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
foreach ($cf->get_values() as $field_value) {
|
||||
if ($field_value->getId() === $meta_id) {
|
||||
$ops[] = [
|
||||
'expain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with),
|
||||
'explain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with),
|
||||
'meta_struct_id' => $cf->get_meta_struct_id(),
|
||||
'meta_id' => $field_value->getId(),
|
||||
'value' => $replace_with
|
||||
@@ -1609,7 +1609,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
|
||||
}
|
||||
if ($this->match($value, $match_method, $field_value->getValue())) {
|
||||
$ops[] = [
|
||||
'expain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw),
|
||||
'explain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw),
|
||||
'meta_struct_id' => $cf->get_meta_struct_id(),
|
||||
'meta_id' => $field_value->getId(),
|
||||
'value' => $rw
|
||||
|
Reference in New Issue
Block a user