Merge pull request #4305 from alchemy-fr/PHRAS-3805_translator

PHRAS-3805_translator
This commit is contained in:
jygaulier
2023-05-17 15:55:34 +02:00
committed by GitHub
8 changed files with 1086 additions and 8 deletions

View File

@@ -55,6 +55,7 @@ use Alchemy\Phrasea\Command\Task\TaskStart;
use Alchemy\Phrasea\Command\Task\TaskState;
use Alchemy\Phrasea\Command\Task\TaskStop;
use Alchemy\Phrasea\Command\Thesaurus\FindConceptsCommand;
use Alchemy\Phrasea\Command\Thesaurus\Translator\TranslateCommand;
use Alchemy\Phrasea\Command\UpgradeDBDatas;
use Alchemy\Phrasea\Command\User\UserApplicationsCommand;
use Alchemy\Phrasea\Command\User\UserCreateCommand;
@@ -172,6 +173,7 @@ $cli->command(new IndexPopulateCommand());
$cli->command(new QueryParseCommand());
$cli->command(new QuerySampleCommand());
$cli->command(new FindConceptsCommand());
$cli->command(new TranslateCommand());
$cli->command(new WorkerExecuteCommand());
$cli->command(new WorkerHeartbeatCommand());

View File

@@ -0,0 +1,8 @@
<?php
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
/**
 * Raised when the translator configuration cannot be loaded or is invalid
 * (thrown by GlobalConfiguration::create()).
 */
class ConfigurationException extends \Exception
{
}

View File

@@ -0,0 +1,127 @@
<?php
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
use appbox;
use collection;
use databox;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Yaml\Yaml;
use Unicode;
/**
 * Translator configuration, loaded from "<root>/config/translator/configuration.yml".
 *
 * Indexes the databoxes and collections of the appbox by id and by name,
 * and builds one Job per entry of the "translator.jobs" section.
 */
class GlobalConfiguration
{
    const CONFIG_DIR = "/config/translator/";
    const CONFIG_FILE = "configuration.yml";

    /** @var array|null the "translator" section of the configuration file */
    private $configuration = null;

    /** @var Job[] key = job name */
    private $jobs = [];

    /**
     * @var array key = sbas_id or dbname,
     *            value = ['dbox' => databox, 'collections' => collection[] (key = coll_id or name)]
     */
    private $databoxes = [];

    /** @var bool */
    private $dryRun;

    /** @var string */
    private $reportFormat;

    /**
     * @param appbox          $appBox
     * @param Unicode         $unicode      passed to each Job
     * @param array           $global_conf  the "translator" section of the configuration
     * @param bool            $dryRun
     * @param string          $reportFormat
     * @param OutputInterface $output       passed to each Job
     */
    private function __construct($appBox, Unicode $unicode, $global_conf, bool $dryRun, string $reportFormat, OutputInterface $output)
    {
        $this->configuration = $global_conf;
        $this->dryRun = $dryRun;
        $this->reportFormat = $reportFormat;

        // list databoxes and collections to access by id or by name.
        // ids and names are indexed separately then merged with "+" (left operand wins),
        // so a numeric name can never overwrite the entry of an existing id.
        $databoxesById = [];
        $databoxesByName = [];
        foreach ($appBox->get_databoxes() as $databox) {
            $collectionsById = [];
            $collectionsByName = [];
            foreach ($databox->get_collections() as $collection) {
                $collectionsById[$collection->get_coll_id()] = $collection;
                $collectionsByName[$collection->get_name()] = $collection;
            }

            $entry = [
                'dbox'        => $databox,
                'collections' => $collectionsById + $collectionsByName,
            ];
            $databoxesById[$databox->get_sbas_id()] = $entry;
            $databoxesByName[$databox->get_dbname()] = $entry;
        }
        $this->databoxes = $databoxesById + $databoxesByName;

        // a missing or empty "jobs" section simply means "no job to play"
        $jobs = (isset($global_conf['jobs']) && is_array($global_conf['jobs'])) ? $global_conf['jobs'] : [];
        foreach ($jobs as $job_name => $job_conf) {
            $this->jobs[$job_name] = new Job($this, $job_conf, $unicode, $output);
        }
    }

    /**
     * Load the configuration file and build the jobs.
     *
     * @param appbox          $appBox
     * @param Unicode         $unicode
     * @param string          $root         application root path (CONFIG_DIR is appended to it)
     * @param bool            $dryRun
     * @param string          $reportFormat
     * @param OutputInterface $output
     * @return GlobalConfiguration
     * @throws ConfigurationException if the file is missing, unreadable, not valid yaml,
     *                                has no "translator" section, or if a job cannot be built
     */
    public static function create(appbox $appBox, Unicode $unicode, string $root, bool $dryRun, string $reportFormat, OutputInterface $output): GlobalConfiguration
    {
        $config_dir = $root . self::CONFIG_DIR;
        $config_file = $config_dir . self::CONFIG_FILE;

        // best effort: prepare the directory where the configuration file is expected
        if (!is_dir($config_dir)) {
            @mkdir($config_dir, 0777, true);
        }

        // file_get_contents() only emits a warning on failure: check explicitly
        if (!is_file($config_file) || !is_readable($config_file)) {
            throw new ConfigurationException(
                sprintf("missing or bad configuration (file \"%s\" not found or not readable)", $config_file)
            );
        }
        $yaml = file_get_contents($config_file);
        if ($yaml === false) {
            throw new ConfigurationException(
                sprintf("missing or bad configuration (can't read file \"%s\")", $config_file)
            );
        }

        try {
            $config = Yaml::parse($yaml);
        }
        catch (\Exception $e) {
            throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage()), 0, $e);
        }

        if (!is_array($config) || !isset($config['translator']) || !is_array($config['translator'])) {
            throw new ConfigurationException("missing or bad configuration (no \"translator\" section)");
        }
        if (isset($config['translator']['jobs']) && !is_array($config['translator']['jobs'])) {
            throw new ConfigurationException("missing or bad configuration (\"jobs\" must be a list of jobs)");
        }

        try {
            return new self($appBox, $unicode, $config['translator'], $dryRun, $reportFormat, $output);
        }
        catch (\Exception $e) {
            throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage()), 0, $e);
        }
    }

    /**
     * @return Job[] key = job name
     */
    public function getJobs()
    {
        return $this->jobs;
    }

    /**
     * @param string|int $sbasIdOrName
     * @return databox|null
     */
    public function getDatabox($sbasIdOrName)
    {
        return isset($this->databoxes[$sbasIdOrName]) ? $this->databoxes[$sbasIdOrName]['dbox'] : null;
    }

    /**
     * @param string|int $sbasIdOrName
     * @param string|int $collIdOrName
     * @return collection|null
     */
    public function getCollection($sbasIdOrName, $collIdOrName)
    {
        return $this->databoxes[$sbasIdOrName]['collections'][$collIdOrName] ?? null;
    }

    /**
     * @return bool
     */
    public function isDryRun(): bool
    {
        return $this->dryRun;
    }

    /**
     * @return string
     */
    public function getReportFormat(): string
    {
        return $this->reportFormat;
    }
}

View File

@@ -0,0 +1,556 @@
<?php
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
use databox;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMXpath;
use PDO;
use Symfony\Component\Console\Output\OutputInterface;
use thesaurus_xpath;
use Unicode;
/**
 * One translation job, as declared under "translator.jobs" in the configuration.
 *
 * The constructor validates the settings (problems are collected in getErrors())
 * and prepares the sql that selects the records; run() then translates the values
 * of the "source_field" of each record into the "destination_fields", using the
 * synonyms found in the thesaurus.
 */
class Job
{
    const NEVER_CLEANUP_SOURCE = 'never';
    const ALWAYS_CLEANUP_SOURCE = 'always';
    const CLEANUP_SOURCE_IF_TRANSLATED = 'if_translated';

    /** @var bool false when the job is declared with "active: false" */
    private $active = true;

    /** @var string[] error messages while parsing conf */
    private $errors = [];

    /** @var databox|null $databox */
    private $databox = null;

    /** @var array parameters bound to the records select sql */
    private $selectRecordParams = [];

    /** @var string|null */
    private $selectRecordsSql = null;

    /** @var array list of field ids of "source_field" (unique) and "destination_fields" (many) */
    private $selectRecordFieldIds = [];

    /** @var OutputInterface */
    private $output;

    /** @var array|false|null infos (id, tbranch, lng) about the "source_field" */
    private $source_field;

    /** @var array infos (id, name) about the "destination_fields" (key=lng) */
    private $destination_fields = [];

    /** @var Unicode */
    private $unicode;

    /** @var DOMXpath|false|thesaurus_xpath */
    private $xpathTh;

    /**
     * @var DOMNodeList
     * The thesaurus branch(es) linked to the "source_field"
     */
    private $tbranches;

    /** @var bool */
    private $cleanupDestination = false;

    /** @var string */
    private $cleanupSource = self::NEVER_CLEANUP_SOURCE;

    /** @var GlobalConfiguration */
    private $globalConfiguration;

    /** @var array */
    private $job_conf;

    /** @var \collection|null */
    private $setCollection = null;

    /** @var string|null format 0xx1100xx01xxxx */
    private $setStatus = null;

    /** @var array for condensed report: key = term, value = number of occurrences */
    private $notTranslated = [];

    /** @var array for condensed report: key = term, value = number of occurrences */
    private $incompletelyTranslated = [];

    /** @var array for condensed report: key = term, value = number of occurrences */
    private $fullyTranslated = [];

    /** @var int for condensed report */
    private $recordsDone = 0;

    /**
     * @param GlobalConfiguration $globalConfiguration
     * @param array               $job_conf             settings of this job
     * @param Unicode             $unicode
     * @param OutputInterface     $output
     */
    public function __construct($globalConfiguration, $job_conf, Unicode $unicode, OutputInterface $output)
    {
        $this->globalConfiguration = $globalConfiguration;
        $this->job_conf = $job_conf;
        $this->unicode = $unicode;
        $this->output = $output;

        // an empty job (e.g. every setting commented out) is parsed as null
        if (!is_array($job_conf)) {
            $this->errors[] = "bad job configuration (a list of settings is expected).";
            return;
        }

        if (array_key_exists('active', $job_conf) && $job_conf['active'] === false) {
            $this->active = false;
            return;
        }

        foreach (['active', 'databox', 'source_field', 'destination_fields'] as $mandatory) {
            if (!isset($job_conf[$mandatory])) {
                $this->errors[] = sprintf("Missing mandatory setting (%s).", $mandatory);
            }
        }
        if (!empty($this->errors)) {
            return;
        }

        if (!($this->databox = $globalConfiguration->getDatabox($job_conf['databox']))) {
            $this->errors[] = sprintf("unknown databox (%s).", $job_conf['databox']);
            return;
        }

        if (array_key_exists('set_collection', $job_conf)) {
            $this->setCollection = $globalConfiguration->getCollection($this->databox->get_sbas_id(), $job_conf['set_collection']);
            if (!$this->setCollection) {
                $this->errors[] = sprintf("unknown setCollection (%s).", $job_conf['set_collection']);
                return;
            }
        }

        if (array_key_exists('set_status', $job_conf)) {
            $this->setStatus = $job_conf['set_status'];
        }

        $cnx = $this->databox->get_connection();

        // get infos about the "source_field"
        //
        $sql = "SELECT `id`, `tbranch` FROM `metadatas_structure` WHERE `name` = :name AND `tbranch` != ''";
        $stmt = $cnx->executeQuery($sql, [':name' => $job_conf['source_field']]);
        $this->source_field = $stmt->fetch(PDO::FETCH_ASSOC);
        $stmt->closeCursor();
        if (!$this->source_field) {
            $this->errors[] = sprintf("field (%s) not found or not linked to thesaurus.", $job_conf['source_field']);
            return;
        }
        $this->source_field['lng'] = array_key_exists('source_lng', $job_conf) ? $job_conf['source_lng'] : null;
        $this->selectRecordFieldIds[] = $this->source_field['id'];

        // the xpath object is documented as possibly false: don't call query() on it then
        $this->xpathTh = $this->databox->get_xpath_thesaurus();
        $this->tbranches = $this->xpathTh ? $this->xpathTh->query($this->source_field['tbranch']) : false;
        if (!$this->tbranches || $this->tbranches->length <= 0) {
            $this->errors[] = sprintf("thesaurus branch(es) (%s) not found.", $this->source_field['tbranch']);
            return;
        }

        // get infos about the "destination_fields"
        //
        if (!is_array($job_conf['destination_fields'])) {
            $this->errors[] = "\"destination_fields\" must be a list of \"{lng}:{field name}\".";
            return;
        }
        $this->destination_fields = [];
        $sql = "SELECT `id`, `name` FROM `metadatas_structure` WHERE `name` = :name ";
        $stmt = $cnx->prepare($sql);
        foreach ($job_conf['destination_fields'] as $tf) {
            // "- fr: field" (with a space) is parsed by yaml as a map, not as the expected "fr:field" string
            $parts = is_string($tf) ? explode(':', $tf) : [];
            if (count($parts) < 2) {
                $this->output->writeln(sprintf(
                    "<warning>bad destination field (%s), \"{lng}:{field name}\" expected (ignored).</warning>",
                    is_scalar($tf) ? $tf : gettype($tf)
                ));
                continue;
            }
            $lng = $parts[0];
            $fname = $parts[1];

            $stmt->execute([':name' => $fname]);
            $row = $stmt->fetch(PDO::FETCH_ASSOC);
            $stmt->closeCursor();   // always release the cursor, even when the field is unknown
            if (!$row) {
                $this->output->writeln(sprintf("<warning>undefined field (%s) (ignored).</warning>", $fname));
                continue;
            }
            $this->destination_fields[$lng] = $row;
            $this->selectRecordFieldIds[] = $row['id'];
        }
        if (empty($this->destination_fields)) {
            $this->errors[] = "<warning>no \"destination_field\" found.</warning>";
            return;
        }

        // misc settings
        $this->cleanupDestination = array_key_exists('cleanup_destination', $job_conf) && $job_conf['cleanup_destination'] === true;
        $this->cleanupSource = array_key_exists('cleanup_source', $job_conf) ? $job_conf['cleanup_source'] : self::NEVER_CLEANUP_SOURCE;
        $knownCleanups = [self::NEVER_CLEANUP_SOURCE, self::ALWAYS_CLEANUP_SOURCE, self::CLEANUP_SOURCE_IF_TRANSLATED];
        if (!in_array($this->cleanupSource, $knownCleanups, true)) {
            // an unknown value never matched any mode (= "never"): keep that behavior, but tell it
            $this->output->writeln(sprintf(
                "<warning>unknown \"cleanup_source\" value (%s), \"%s\" is used.</warning>",
                is_scalar($this->cleanupSource) ? $this->cleanupSource : gettype($this->cleanupSource),
                self::NEVER_CLEANUP_SOURCE
            ));
            $this->cleanupSource = self::NEVER_CLEANUP_SOURCE;
        }

        // build records select sql
        //
        $selectRecordClauses = [];
        $this->selectRecordParams = [];
        if (array_key_exists('if_collection', $job_conf)) {
            if (!($coll = $globalConfiguration->getCollection($job_conf['databox'], $job_conf['if_collection']))) {
                $this->errors[] = sprintf("unknown collection (%s)", $job_conf['if_collection']);
                return;
            }
            $selectRecordClauses[] = "`coll_id` = :coll_id";
            $this->selectRecordParams[':coll_id'] = $coll->get_coll_id();
        }
        if (array_key_exists('if_status', $job_conf)) {
            // the mask ends up inside sql bit literals: anything but 0, 1 or x would break the query
            $if_status = is_scalar($job_conf['if_status']) ? (string)$job_conf['if_status'] : '';
            if (!preg_match('/^[01x]+$/', $if_status)) {
                $this->errors[] = sprintf("bad if_status (%s), only 0, 1 and x are allowed.", $if_status);
                return;
            }
            $selectRecordClauses[] = "`status` & b:sb_and = b:sb_equ";
            // sb_and: 1 for every bit to test (0 or 1 in the mask), 0 for every ignored bit (x)
            $this->selectRecordParams[':sb_and'] = str_replace(['0', 'x'], ['1', '0'], $if_status);
            // sb_equ: expected value of the tested bits
            $this->selectRecordParams[':sb_equ'] = str_replace('x', '0', $if_status);
        }
        $selectRecordClauses[] = "`meta_struct_id` IN ("
            . join(
                ',',
                array_map(function ($id) use ($cnx) {
                    return $cnx->quote($id);
                }, $this->selectRecordFieldIds)
            )
            . ")";

        $sql = "SELECT `record_id`, `meta_struct_id`, `metadatas`.`id` AS meta_id, `value` FROM";
        $sql .= " `record` INNER JOIN `metadatas` USING(`record_id`)";
        $sql .= " WHERE " . join(" AND ", $selectRecordClauses);
        $sql .= " ORDER BY `record_id` ASC";
        $this->selectRecordsSql = $sql;
    }

    /**
     * Select the records and translate them one by one, then print the
     * condensed report if this format was asked.
     * Does nothing on an inactive or invalid job.
     */
    public function run()
    {
        if (!$this->active || !empty($this->errors)) {
            return;
        }

        $cnx = $this->databox->get_connection();
        $stmt = $cnx->executeQuery($this->selectRecordsSql, $this->selectRecordParams);

        $this->recordsDone = 0;
        $this->notTranslated = [];
        $this->incompletelyTranslated = [];
        $this->fullyTranslated = [];

        // one empty list of values per selected field (key = meta_struct_id)
        $emptyValues = array_fill_keys($this->selectRecordFieldIds, []);
        $metas = $emptyValues;

        // rows are ordered by record_id: group them, flush when the record changes
        $currentRid = null;
        while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) {
            if ($currentRid !== null && $row['record_id'] !== $currentRid) {
                $this->doRecord($currentRid, $metas);   // flush previous record
                $metas = $emptyValues;
            }
            $currentRid = $row['record_id'];
            $metas[$row['meta_struct_id']][$row['meta_id']] = $row['value'];
        }
        if ($currentRid !== null) {
            $this->doRecord($currentRid, $metas);       // flush last record
        }
        $stmt->closeCursor();

        // condensed report
        //
        if ($this->globalConfiguration->getReportFormat() === 'condensed') {
            $this->output->writeln(sprintf("%d records done.", $this->recordsDone));
            $this->writeCondensedSection("Not translated terms:", $this->notTranslated);
            $this->writeCondensedSection("Incompletely translated terms:", $this->incompletelyTranslated);
            $this->writeCondensedSection("Fully translated terms:", $this->fullyTranslated);
        }
    }

    /**
     * Print one section of the condensed report (nothing if there is no term).
     *
     * @param string $title
     * @param array  $terms key = term, value = number of occurrences
     */
    private function writeCondensedSection($title, array $terms)
    {
        if (empty($terms)) {
            return;
        }
        ksort($terms, SORT_STRING | SORT_FLAG_CASE);
        $this->output->writeln($title);
        foreach ($terms as $term => $n) {
            $this->output->writeln(sprintf(" - \"%s\" (%d times)", $term, $n));
        }
    }

    /**
     * Translate the source values of one record and apply the resulting actions
     * (metadatas to add / delete, collection, status-bits) unless running "dry".
     *
     * @param string|int $record_id
     * @param array      $metas     values of the selected fields: [meta_struct_id => [meta_id => value]]
     */
    private function doRecord($record_id, $metas)
    {
        $reportFormat = $this->globalConfiguration->getReportFormat();
        $listTerms = in_array($reportFormat, ['all', 'translated'], true);

        if ($reportFormat !== 'condensed') {
            $this->output->writeln(sprintf("record id: %s", $record_id));
        }

        $source_field_id = $this->source_field['id'];

        $meta_to_delete = [];       // key = id, to easily keep unique
        $meta_to_add = [];

        if ($this->cleanupDestination) {
            // every existing destination value is to be deleted, unless a translation finds it later
            foreach ($this->destination_fields as $destination_field) {
                foreach ($metas[$destination_field['id']] as $meta_id => $value) {
                    $meta_to_delete[$meta_id] = $value;
                }
            }
        }

        // loop on every value of the "source_field"
        //
        foreach ($metas[$source_field_id] as $source_meta_id => $source_value) {
            $q = $this->buildSynonymsQuery($source_value);

            // loop on every tbranch (one field may be linked to many branches)
            //
            $translations = [];        // ONE translation per lng (first found in th)
            /** @var DOMNode $tbranch */
            foreach ($this->tbranches as $tbranch) {
                if (!($nodes = $this->xpathTh->query($q, $tbranch))) {
                    $this->output->writeln(sprintf(" - \"%s\" <warning>xpath error on (%s), ignored.</warning>", $source_value, $q));
                    continue;
                }

                // loop on every synonym
                //
                /** @var DOMElement $node */
                foreach ($nodes as $node) {
                    $lng = $node->getAttribute('lng');

                    // ignore synonyms not in one of the "destination_field" languages,
                    // and keep only the first translation found for a language
                    if (!array_key_exists($lng, $this->destination_fields) || array_key_exists($lng, $translations)) {
                        continue;
                    }

                    $translated_value = $node->getAttribute('v');
                    $destination_field_id = $this->destination_fields[$lng]['id'];

                    // strict search: "1e3" must not be found as "1000"
                    $destination_meta_id = array_search($translated_value, $metas[$destination_field_id], true);
                    if ($destination_meta_id === false) {
                        $translations[$lng] = [
                            'val' => $translated_value,
                            'id'  => null,
                            'msg' => sprintf(" --> %s", $this->destination_fields[$lng]['name'])
                        ];
                        // two source terms may share a translation: add it only once per field
                        if (!isset($meta_to_add[$destination_field_id])
                            || !in_array($translated_value, $meta_to_add[$destination_field_id], true)) {
                            $meta_to_add[$destination_field_id][] = $translated_value;
                        }
                    }
                    else {
                        $translations[$lng] = [
                            'val' => $translated_value,
                            'id'  => $destination_meta_id,
                            'msg' => sprintf("already in %s", $this->destination_fields[$lng]['name'])
                        ];
                        // the value is wanted: don't let "cleanup_destination" delete it
                        unset($meta_to_delete[$destination_meta_id]);
                    }
                }
            }

            // report
            //
            $complete = false;
            if (empty($translations)) {
                if ($reportFormat === 'all') {
                    $this->output->writeln(sprintf(" - \"%s\" : no translation found.", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->notTranslated);
            }
            else if (count($translations) < count($this->destination_fields)) {
                if ($listTerms) {
                    $this->output->writeln(sprintf(" - \"%s\" : incomplete translation.", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->incompletelyTranslated);
            }
            else {
                // complete translation (all target lng)
                $complete = true;
                if ($listTerms) {
                    $this->output->writeln(sprintf(" - \"%s\" :", $source_value));
                }
                $this->addToCondensedReport($source_value, $this->fullyTranslated);
            }

            if ($listTerms) {
                foreach ($translations as $lng => $translation) {
                    $this->output->writeln(sprintf(" - [%s] \"%s\" %s", $lng, $translation['val'], $translation['msg']));
                }
            }

            // cleanup source
            //
            $cleanup = $this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE
                || ($complete && $this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED);
            if ($cleanup && !$this->isUsedAsDestination($translations, $source_meta_id)) {
                $meta_to_delete[$source_meta_id] = $source_value;
            }
        }

        // build the actions
        //
        $actions = [];

        $metadatas = [];
        foreach ($meta_to_delete as $id => $value) {
            $metadatas[] = [
                'action'  => "delete",
                'meta_id' => $id,
                '_value_' => $value
            ];
        }
        foreach ($meta_to_add as $struct_id => $values) {
            $metadatas[] = [
                'action'         => "add",
                'meta_struct_id' => $struct_id,
                'value'          => $values
            ];
        }
        if (!empty($metadatas)) {
            $actions['metadatas'] = $metadatas;
        }

        if (!is_null($this->setCollection)) {
            $actions['base_id'] = $this->setCollection->get_base_id();
        }

        if (!is_null($this->setStatus)) {
            // the mask is written with the highest bit first: reverse it so the index is the bit number
            $status = [];
            foreach (str_split(strrev($this->setStatus), 1) as $bit => $v) {
                if ($v === '0' || $v === '1') {
                    $status[] = [
                        'bit'   => $bit,
                        'state' => $v === '1'
                    ];
                }
            }
            if (!empty($status)) {
                $actions['status'] = $status;
            }
        }

        $jsActions = json_encode($actions, JSON_PRETTY_PRINT);

        if ($this->output->getVerbosity() >= OutputInterface::VERBOSITY_VERY_VERBOSE) {
            $this->output->writeln(sprintf("<info>JS : %s</info>", $jsActions));
        }

        // with no action, the json is "[]" which decodes as an array (not as an object):
        // there is nothing to apply, so the record is not even loaded.
        if (!$this->globalConfiguration->isDryRun() && !empty($actions)) {
            $record = $this->getDatabox()->getRecordRepository()->find($record_id);
            if (!$record) {
                $this->output->writeln(sprintf("<warning>record %s not found, ignored.</warning>", $record_id));
            }
            else {
                $record->setMetadatasByActions(json_decode($jsActions));
            }
        }

        $this->recordsDone++;
    }

    /**
     * Build the xpath (relative to a thesaurus branch) that returns all the
     * synonyms of the thesaurus entries matching a source value.
     *
     * @param string $source_value the value, as "term" or "term (context)"
     * @return string
     */
    private function buildSynonymsQuery($source_value)
    {
        list($term, $context) = $this->splitTermAndContext($source_value);

        $q = '@w=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($term)) . '\'';
        if ($context !== '') {
            $q .= ' and @k=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($context)) . '\'';
        }
        if (!is_null($this->source_field['lng'])) {
            $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->source_field['lng']) . '\'';
        }

        // the leading "." matters: "//sy" is an absolute path which searches the whole document
        // whatever the context node is, ".//sy" searches only inside the branch.
        return './/sy[' . $q . ']/../sy';
    }

    /**
     * Tell if the source value was found "already in" a destination
     * (possible if source=destination); such a source value must NOT be deleted.
     *
     * @param array $translations   translations found for the source value (key = lng)
     * @param int   $source_meta_id
     * @return bool
     */
    private function isUsedAsDestination(array $translations, $source_meta_id)
    {
        foreach ($translations as $translation) {
            if ($translation['id'] === $source_meta_id) {
                return true;
            }
        }

        return false;
    }

    /**
     * Count one more occurrence of a term (only for the "condensed" report format).
     *
     * @param string $term
     * @param array  $where the counters to update (key = term)
     */
    private function addToCondensedReport($term, &$where)
    {
        if ($this->globalConfiguration->getReportFormat() !== 'condensed') {
            return;
        }
        if (!array_key_exists($term, $where)) {
            $where[$term] = 0;
        }
        $where[$term]++;
    }

    /**
     * Split "term (context)" into its two parts; the context is '' if there is no "(".
     * A missing ")" is tolerated: the context then runs to the end of the string.
     *
     * @param string $word
     * @return string[] [term, context]
     */
    private function splitTermAndContext($word)
    {
        $term = trim($word);
        $context = '';

        // "(" and ")" are single-byte characters: byte offsets are safe on utf-8 strings
        $po = strpos($term, '(');
        if ($po !== false) {
            $pc = strpos($term, ')', $po);
            $context = ($pc !== false)
                ? substr($term, $po + 1, $pc - $po - 1)
                : substr($term, $po + 1);
            $context = trim($context);
            $term = trim(substr($term, 0, $po));
        }

        return [$term, $context];
    }

    /**
     * @return string[]
     */
    public function getErrors(): array
    {
        return $this->errors;
    }

    /**
     * @return bool true if no error was found in the job settings
     */
    public function isValid(): bool
    {
        return empty($this->errors);
    }

    /**
     * @return databox|null
     */
    public function getDatabox()
    {
        return $this->databox;
    }

    /**
     * @return bool
     */
    public function isActive(): bool
    {
        return $this->active;
    }
}

View File

@@ -0,0 +1,120 @@
<?php
namespace Alchemy\Phrasea\Command\Thesaurus\Translator;
use Alchemy\Phrasea\Border\File;
use Alchemy\Phrasea\Border\Manager as BorderManager;
use Alchemy\Phrasea\Command\Command as phrCommand;
use Alchemy\Phrasea\Model\Entities\LazaretSession;
use collection;
use databox;
use Doctrine\DBAL\DBALException;
use Doctrine\ORM\EntityManager;
use Exception;
use Guzzle\Http\Client as Guzzle;
use igorw;
use MediaVorus\MediaVorus;
use Neutron\TemporaryFilesystem\TemporaryFilesystem;
use PDO;
use record_adapter;
use Symfony\Component\Console\Formatter\OutputFormatterStyle;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Yaml\Yaml;
/**
*
* @license http://opensource.org/licenses/gpl-3.0 GPLv3
* @link www.phraseanet.com
*/
class TranslateCommand extends phrCommand
{
    /** @var InputInterface $input */
    private $input;

    /** @var OutputInterface $output */
    private $output;

    /** @var GlobalConfiguration */
    private $config;

    /**
     * Declare the command name, description, options and help.
     */
    public function configure()
    {
        $help = "--report:\n - all : list every term.\n - translated : list only translated terms.\n - record : list only selected record ids.\n - condensed : count terms occurences.";

        $this->setName('thesaurus:translate');
        $this->setDescription('(BETA) Translate fields values using thesaurus');
        $this->addOption('report', null, InputOption::VALUE_REQUIRED, "Report output format (all|condensed|translated|record)", "all");
        $this->addOption('dry', null, InputOption::VALUE_NONE, "list translations but don't apply.", null);
        $this->setHelp($help);
    }

    /**
     * Load the translator configuration, then play every valid and active job.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     * @return int 0 when done, 1 on a bad --report value, -1 when the configuration can't be loaded
     */
    protected function doExecute(InputInterface $input, OutputInterface $output)
    {
        // add cool styles
        $output->getFormatter()->setStyle('warning', new OutputFormatterStyle('black', 'yellow'));

        // sanitize args
        $reportFormat = $input->getOption('report');
        $knownFormats = ['all', 'condensed', 'translated', 'record'];
        if (!in_array($reportFormat, $knownFormats)) {
            $output->writeln(sprintf("<error>bad --report value (%s), use all|condensed|translated|record</error>", $reportFormat));

            return 1;
        }

        $this->input = $input;
        $this->output = $output;

        // config must be ok
        //
        try {
            $this->config = GlobalConfiguration::create(
                $this->container['phraseanet.appbox'],
                $this->container['unicode'],
                $this->container['root.path'],
                $input->getOption('dry'),
                $reportFormat,
                $output
            );
        }
        catch (\Exception $e) {
            $output->writeln(sprintf("<error>missing or bad configuration: %s</error>", $e->getMessage()));

            return -1;
        }

        /**
         * @var string $jobName
         * @var Job $job
         */
        foreach ($this->config->getJobs() as $jobName => $job) {
            $output->writeln("");
            $output->writeln(sprintf("======== Playing job %s ========", $jobName));

            if ($job->isValid()) {
                if ($job->isActive()) {
                    $job->run();
                }
                else {
                    $output->writeln("job is inactive, skipped.");
                }
                continue;
            }

            // invalid job: list what is wrong, then go on with the next one
            $output->writeln("<warning>Configuration error(s)... :</warning>");
            foreach ($job->getErrors() as $err) {
                $output->writeln(sprintf(" - %s", $err));
            }
            $output->writeln("<warning>...Job ignored</warning>");
        }

        return 0;
    }
}

View File

@@ -0,0 +1,47 @@
# <phraseanet>/config/translator/configuration.yml
translator:
jobs:
#
# first job : translate EN keywords to FR and DE
# then change status-bit to elect record for job 2
keywords_EN_to_FR_DE:
active: true
databox: my_databox
if_collection: to_translate
if_status: xx1xxxx
source_field: KeywordsEN
source_lng: en
destination_fields:
- fr:keywordsFR
- de:keywordsDE
cleanup_source: if_translated
# job 1 cleans the destination fields
cleanup_destination: true
# do NOT change collection because job 2 looks here...
#-- set_collection: online
# ... but change status
set_status: 010xxxx
#
# second (last) job : translate EN country to FR and DE, add also as keywords !
#
country_EN_to_FR_DE:
active: true
databox: my_databox
# same collection as job 1
if_collection: to_translate
# status was changed by job 1
if_status: 010xxxx
source_field: CountryEN
source_lng: en
# add translated country to the keywords
destination_fields:
- fr:keywordsFR
- de:keywordsDE
cleanup_source: if_translated
# job 2 must NOT erase what job 1 did
cleanup_destination: false
# the last job will change collection
set_collection: online
set_status: 100xxxx

View File

@@ -0,0 +1,218 @@
# Translator
Translator is a console command that uses the thesaurus to translate terms from one field (source), to one or many fields (destinations).
It will act on records matching conditions like "from this collection" or
"if this status-bit is 1".
Translator plays __jobs__ one after another; each __job__ can define its own settings.
Jobs and settings are declared in a configuration file (yml):
```yaml
# <phraseanet>/config/translator/configuration.yml
translator:
jobs:
keywords_EN_to_FR_DE:
active: true
databox: my_databox
...
country_EN_to_FR_DE:
active: false
...
```
## Job settings:
- `active` : (mandatory) boolean to activate the job.
- `databox`: (mandatory) The databox name|id to act on.
- `if_collection`: (optional) The unique collection name|id to act on; Default if not set: All collections.
- `if_status`: (optional) Act only on records matching this status-bits mask; Format 01x10xxxx; Default: All records.
- `source_field`: (mandatory) The name of the source field containing terms to be translated.
- `source_lng`: (optional) The language of the source terms to translate. If set, only terms matching this lng will be searched into thesaurus. Default if not set: Search term without language criteria.
- `destination_fields`: (mandatory) A __list__ of destinations using format `{lng}:{field name}`;
Each translated term (from thesaurus) will be directed to the matching field, depending on its lng (see examples).
- `cleanup_source`: (optional) Whether to remove or keep the source term, depending on whether it was successfully translated or not.
- `never`: keep the term (default).
- `if_translated`: remove if fully translated (all destination lngs).
- `always`: remove the term even if it was not translated.
- `cleanup_destination`: (optional) Empty the destination(s) field(s) before translation (default `false`)
- `set_collection`: (optional) collection where to move the record after translation.
- `set_status`: (optional) status-bit mask to apply on record after translation.
## Important:
#### After playing job(s), no more record must match the selection conditions `if_collection`, `if_status`.
- Because a job will act on __all__ records matching the `if_collection` and `if_status` conditions,
one __should__ change the collection or sb after translation (`set_collection` and `set_status` settings).
- Because each job declares its own conditions, playing multiple jobs must implement a _workflow_ mechanism:
- job 1 selects records matching conditions A (coll/sb) and __must__ change collection and/or status to match conditions (B) of job 2.
- job 2 selects records matching conditions B and __must__ set new final values that match neither A nor B.
- Because jobs are played one after another, in case of many jobs acting on same records, workflow can be simplified:
- __first__ job 1 selects records matching "work-on" conditions, and does not change anything after translation.
- job 2 selects using the same conditions and does not change conditions either.
- __last__ job 3 selects using the same conditions, and is responsible to change collection and/or status when done.
Those rules prevent the job(s) from running multiple times on the same records. Of course care must be taken if one part of a workflow is de-activated.
#### Cleanup with multiple jobs.
- Because job n+1 is played after job n is fully completed, care must be taken when using `cleanup` options:
- If acting on same source, `cleanup_source: always` must only be applied on __last__ job, else job 1 will remove every term that job 2 should work on.
(This case might not happen since - thanks to multiple destinations - there is no reason to act on same source twice).
- Same care with multiple jobs writing on same destination(s): `cleanup_destination: true` should be set only on __first__ job, else job 2 will erase what job 1 has done.
## Example 1:
### translate new records (having default sb=0).
```yaml
translator:
jobs:
example:
active: true
databox: my_databox
# condition: act on new records having "translated" sb[4]=0
if_status: 0xxxx
# original keywords are expected to be EN
source_field: KeywordsEN
source_lng: en
# translate to 2 separate fields
destination_fields:
- fr:keywordsFR
- de:keywordsDE
# keep original EN keywords
cleanup_source: never
# remove existing terms on destinations before translating
cleanup_destination: true
# end: set "translated" sb to 1
set_status: 1xxxx
```
## Example 2:
### manually select records to translate by setting sb[4].
```yaml
translator:
jobs:
example:
# ...
# condition: act on records having "to translate" sb[4]=1
if_status: x1xxxx
# end: mark the record as "translated"
set_status: 10xxxx
```
## Example 3:
### translate new records from temporary collection.
```yaml
translator:
jobs:
example:
# ...
if_collection: 'upload'
set_collection: 'online'
```
## Example 4:
### add translations to the same field
__Trick__:
If one cleans the destination field - the __same as the source__ -, the original source will be deleted.
If the intent is to preserve the original term (adding translations), it must be added again.
The program will detect that the same term is to be deleted then added, and will preserve the original one.
```yaml
translator:
jobs:
example:
# ...
source_field: Keywords
source_lng: en
# since source=destination, source will be cleaned of all not-translatable terms...
cleanup_destination: true
destination_fields:
# ... this is why one must re-add the EN "translated" term (same as source)
- en:Keywords
- fr:Keywords
- de:Keywords
```
## Example 4-bis:
### removing terms that are not in the thesaurus
```yaml
translator:
jobs:
example:
# ...
source_field: Keywords
source_lng: en
cleanup_source: always
destination_fields:
- en:Keywords
```
## Example 5:
### merge many sources to one "tote bag"
```yaml
translator:
jobs:
keywords:
active: true
databox: my_databox
# manually start condition: set sb[4]
if_status: xxx1xxxx
# original keywords are expected to be EN
source_field: keywords
source_lng: en
# translate to a common field
destination_fields:
- fr:motscles
# each job can clean his own distinct source
cleanup_source: always
# first job cleanups destination
cleanup_destination: true
# end: set ready for next job
set_status: 0010xxxx
country:
active: true
databox: my_databox
# condition: set by previous job
if_status: 0010xxxx
# original country is expected to be EN
source_field: country
source_lng: en
# translate to the same destination
destination_fields:
- fr:motscles
# each job can clean his own distinct source
cleanup_source: always
# do NOT cleanup destination, first job did it
cleanup_destination: false
# end: set ready for next job
set_status: 0100xxxx
city:
active: true
databox: my_databox
# condition: set by previous job
if_status: 0100xxxx
# original city is expected to be EN
source_field: city
source_lng: en
# translate to the same field
destination_fields:
- fr:motscles
# each job can clean his own distinct source
cleanup_source: always
# do NOT cleanup destination, first job did it
cleanup_destination: false
# end: set to "translated"
set_status: 1000xxxx
```

View File

@@ -1490,7 +1490,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
foreach ($values as $value) {
if ($value) {
$ops[] = [
'expain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()),
'explain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()),
'meta_struct_id' => $sf->get_id(),
'meta_id' => $meta_id, // can be null
'value' => $value
@@ -1505,7 +1505,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
}
if( ($value = $values[0]) ) {
$ops[] = [
'expain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()),
'explain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()),
'meta_struct_id' => $sf->get_id(),
'meta_id' => $meta_id, // probably null,
'value' => $value
@@ -1522,7 +1522,6 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
* @param string[] $values
*
* @return array ops to execute
* @throws Exception
*/
private function metadata_add($struct_fields, $values)
{
@@ -1531,11 +1530,12 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
// now set values to matching struct_fields
foreach ($struct_fields as $sf) {
if(!$sf->is_multi()) {
throw new Exception(sprintf("can't \"add\" to mono-valued (%s).", $sf->get_name()));
// easy support "add" on mono : join values...
$values = [ join(' ; ', $values) ];
}
foreach ($values as $value) {
$ops[] = [
'expain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()),
'explain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()),
'meta_struct_id' => $sf->get_id(),
'meta_id' => null,
'value' => $value
@@ -1577,7 +1577,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
}
// then add the replacing value
$ops[] = [
'expain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()),
'explain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()),
'meta_struct_id' => $cf->get_meta_struct_id(),
'meta_id' => null,
'value' => $replace_with
@@ -1590,7 +1590,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
foreach ($cf->get_values() as $field_value) {
if ($field_value->getId() === $meta_id) {
$ops[] = [
'expain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with),
'explain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with),
'meta_struct_id' => $cf->get_meta_struct_id(),
'meta_id' => $field_value->getId(),
'value' => $replace_with
@@ -1609,7 +1609,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface
}
if ($this->match($value, $match_method, $field_value->getValue())) {
$ops[] = [
'expain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw),
'explain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw),
'meta_struct_id' => $cf->get_meta_struct_id(),
'meta_id' => $field_value->getId(),
'value' => $rw