From 17ee2c45615968f993f9bbff7fb12b8d3f89fb38 Mon Sep 17 00:00:00 2001 From: jygaulier Date: Thu, 27 Apr 2023 19:33:57 +0200 Subject: [PATCH 1/7] WIP ; translate but not write in db --- bin/console | 2 + .../Translator/ConfigurationException.php | 8 + .../Translator/GlobalConfiguration.php | 103 +++++ .../Command/Thesaurus/Translator/Job.php | 355 ++++++++++++++++++ .../Thesaurus/Translator/TranslateCommand.php | 117 ++++++ 5 files changed, 585 insertions(+) create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/ConfigurationException.php create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php diff --git a/bin/console b/bin/console index ef81b86196..e4906d5f65 100755 --- a/bin/console +++ b/bin/console @@ -55,6 +55,7 @@ use Alchemy\Phrasea\Command\Task\TaskStart; use Alchemy\Phrasea\Command\Task\TaskState; use Alchemy\Phrasea\Command\Task\TaskStop; use Alchemy\Phrasea\Command\Thesaurus\FindConceptsCommand; +use Alchemy\Phrasea\Command\Thesaurus\Translator\TranslateCommand; use Alchemy\Phrasea\Command\UpgradeDBDatas; use Alchemy\Phrasea\Command\User\UserApplicationsCommand; use Alchemy\Phrasea\Command\User\UserCreateCommand; @@ -172,6 +173,7 @@ $cli->command(new IndexPopulateCommand()); $cli->command(new QueryParseCommand()); $cli->command(new QuerySampleCommand()); $cli->command(new FindConceptsCommand()); +$cli->command(new TranslateCommand()); $cli->command(new WorkerExecuteCommand()); $cli->command(new WorkerHeartbeatCommand()); diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/ConfigurationException.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/ConfigurationException.php new file mode 100644 index 0000000000..664eada51e --- /dev/null +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/ConfigurationException.php @@ -0,0 +1,8 @@ 
+configuration = $global_conf; + + // list databoxes and collections to access by id or by name + $this->databoxes = []; + foreach ($appBox->get_databoxes() as $databox) { + $sbas_id = $databox->get_sbas_id(); + $sbas_name = $databox->get_dbname(); + $this->databoxes[$sbas_id] = [ + 'dbox' => $databox, + 'collections' => [] + ]; + $this->databoxes[$sbas_name] = &$this->databoxes[$sbas_id]; + // list all collections + foreach ($databox->get_collections() as $collection) { + $coll_id = $collection->get_coll_id(); + $coll_name = $collection->get_name(); + $this->databoxes[$sbas_id]['collections'][$coll_id] = $collection; + $this->databoxes[$sbas_id]['collections'][$coll_name] = &$this->databoxes[$sbas_id]['collections'][$coll_id]; + } + } + + foreach($global_conf['jobs'] as $job_name => $job_conf) { + $this->jobs[$job_name] = new Job($this, $job_conf, $unicode, $output); + } + } + + /** + * @param appbox $appBox + * @param string $root + * @return GlobalConfiguration + * @throws ConfigurationException + */ + public static function create(appbox $appBox, Unicode $unicode, string $root, OutputInterface $output): GlobalConfiguration + { + try { + $config_file = ($config_dir = $root . self::CONFIG_DIR) . self::CONFIG_FILE; + + @mkdir($config_dir, 0777, true); + + $config = Yaml::parse(file_get_contents($config_file)); + return new self($appBox, $unicode, $config['translator'], $output); + } + catch (\Exception $e) { + throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage())); + } + } + + public function getJobs() + { + return $this->jobs; + } + + /** + * @param string|int $sbasIdOrName + * @return databox|null + */ + public function getDatabox($sbasIdOrName) + { + return isset($this->databoxes[$sbasIdOrName]) ? 
$this->databoxes[$sbasIdOrName]['dbox'] : null; + } + + /** + * @param string|int $sbasIdOrName + * @param string|int $collIdOrName + * @return collection|null + */ + public function getCollection($sbasIdOrName, $collIdOrName) + { + return $this->databoxes[$sbasIdOrName]['collections'][$collIdOrName] ?? null; + } + + + +} diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php new file mode 100644 index 0000000000..c18f4aa8e7 --- /dev/null +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -0,0 +1,355 @@ +output = $output; + $this->unicode = $unicode; + + if (array_key_exists('active', $job_conf) && $job_conf['active'] === false) { + $this->active = false; + + return; + } + + $this->errors = []; + foreach (['from_databox', 'from_field', 'from_lng'] as $mandatory) { + if (!isset($job_conf[$mandatory])) { + $this->errors[] = sprintf("Missing mandatory setting (%s).", $mandatory); + } + } + if (!empty($this->errors)) { + return; + } + + if (!($this->databox = $globalConfiguration->getDatabox($job_conf['from_databox']))) { + $this->errors[] = sprintf("unknown databox (%s).", $job_conf['from_databox']); + + return; + } + + $cnx = $this->databox->get_connection(); + + // get infos about the "from_field" + // + $sql = "SELECT `id`, `tbranch` FROM `metadatas_structure` WHERE `name` = :name AND `tbranch` != ''"; + $stmt = $cnx->executeQuery($sql, [':name' => $job_conf['from_field']]); + $this->from_field = $stmt->fetch(PDO::FETCH_ASSOC); + $stmt->closeCursor(); + if (!$this->from_field) { + $this->errors[] = sprintf("field (%s) not found or not linked to thesaurus.", $job_conf['from_field']); + + return; + } + $this->from_field['lng'] = $job_conf['from_lng']; + $this->selectRecordFieldIds[] = $this->from_field['id']; + $this->xpathTh = $this->databox->get_xpath_thesaurus(); + $this->tbranches = $this->xpathTh->query($this->from_field['tbranch']); + if (!$this->tbranches || 
$this->tbranches->length <= 0) { + $this->errors[] = sprintf("thesaurus branch(es) (%s) not found.", $this->from_field['tbranch']); + + return; + } + + // get infos about the "to_fields" + // + $this->to_fields = []; + $sql = "SELECT `id`, `name` FROM `metadatas_structure` WHERE `name` = :name "; + $stmt = $cnx->prepare($sql); + foreach ($job_conf['to_fields'] as $tf) { + list($lng, $fname) = explode(':', $tf); + $stmt->execute([':name' => $fname]); + if (!($row = $stmt->fetch(PDO::FETCH_ASSOC))) { + $this->output->writeln(sprintf("undefined field (%s) (ignored).", $fname)); + continue; + } + $this->to_fields[$lng] = $row; + $stmt->closeCursor(); + + $this->selectRecordFieldIds[] = $row['id']; + } + + if (empty($this->to_fields)) { + $this->errors[] = sprintf("no \"to_field\" found."); + + return; + } + + // misc settings + $this->cleanupDestination = array_key_exists('cleanup_destination', $job_conf) && $job_conf['cleanup_destination'] === true; + $this->cleanupSource = array_key_exists('cleanup_source', $job_conf) ? 
$job_conf['cleanup_source'] : self::NEVER_CLEANUP_SOURCE; + + // build records select sql + // + $selectRecordClauses = []; + $this->selectRecordParams = []; + if (array_key_exists('from_collection', $job_conf)) { + if (!($coll = $globalConfiguration->getCollection($job_conf['from_databox'], $job_conf['from_collection']))) { + $this->errors[] = sprintf("unknown collection (%s)", $job_conf['from_collection']); + + return; + } + $selectRecordClauses[] = "`coll_id` = :coll_id"; + $this->selectRecordParams[':coll_id'] = $coll->get_coll_id(); + } + + if (array_key_exists('if_status', $job_conf)) { + $selectRecordClauses[] = "`status` & b:sb_and = b:sb_equ"; + $this->selectRecordParams[':sb_and'] = str_replace(['0', 'x'], ['1', '0'], $job_conf['if_status']); + $this->selectRecordParams[':sb_equ'] = str_replace('x', '0', $job_conf['if_status']); + } + +// if ($this->cleanupDestination) { +// // if we must empty the destination field(s), no need to get the values +// $selectRecordClauses[] = "`meta_struct_id` = :ffid"; +// $this->selectRecordParams[':ffid'] = $this->from_field['id']; +// } +// else { + // if we add translations, we must fetch the actual values + $selectRecordClauses[] = "`meta_struct_id` IN (" . join(',', array_map(function ($id) use ($cnx) { + return $cnx->quote($id); + }, $this->selectRecordFieldIds)) . ")"; +// } + + $sql = "SELECT `record_id`, `meta_struct_id`, `metadatas`.`id` AS meta_id, `value` FROM"; + $sql .= " `record` INNER JOIN `metadatas` USING(`record_id`)"; + $sql .= " WHERE " . 
join(" AND ", $selectRecordClauses); + $sql .= " ORDER BY `record_id` ASC"; + $this->selectRecordsSql = $sql; + } + + public function run() + { + $cnx = $this->databox->get_connection(); + $stmt = $cnx->executeQuery($this->selectRecordsSql, $this->selectRecordParams); + + $currentRid = '?'; + $metas = $emptyValues = array_map(function () { + return []; + }, array_flip($this->selectRecordFieldIds)); + while ($row = $stmt->fetch(PDO::FETCH_ASSOC)) { + if ($currentRid == '?') { + $currentRid = $row['record_id']; + } + if ($row['record_id'] !== $currentRid) { + // change record + $this->doRecord($currentRid, $metas); // flush previous record + $currentRid = $row['record_id']; + $metas = $emptyValues; + } + + $metas[$row['meta_struct_id']][$row['meta_id']] = ['value' => $row['value'], 'status' => self::ORIGINAL]; + } + $this->doRecord($currentRid, $metas); // flush last record + + $stmt->closeCursor(); + } + + private function doRecord($record_id, $metas) + { + // loop on every "from" values + $from_field_id = $this->from_field['id']; + $this->output->writeln(sprintf("record id: %s", $record_id)); + + // loop on every value of the "from_field" + // + foreach ($metas[$from_field_id] as $kmeta => $meta) { + $value = $meta['value']; + // $this->output->write(sprintf(" - \"%s\"", $value)); + + $t = $this->splitTermAndContext($value); + $q = '@w=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($t[0])) . '\''; + if ($t[1]) { + $q .= ' and @k=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($t[1])) . '\''; + } + $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->from_field['lng']) . '\''; + $q = '//sy[' . $q . 
']/../sy'; + + // loop on every tbranch (one field may be linked on many branches) + // + $translations = []; // ONE translation per lng (first found in th) + /** @var DOMNode $tbranch */ + foreach ($this->tbranches as $tbranch) { + if (!($nodes = $this->xpathTh->query($q, $tbranch))) { + $this->output->writeln(sprintf(" - \"%s\" xpath error on (%s), ignored.", $value, $q)); + continue; + } + + // loop on every synonym + // + /** @var DOMElement $node */ + foreach ($nodes as $node) { + $lng = $node->getAttribute('lng'); + + // ignore synonyms not in one of the "to_field" languages + // + if (!array_key_exists($lng, $this->to_fields)) { + continue; + } + + if (empty($translations)) { + // first translation: begin list + $this->output->writeln(sprintf(" - \"%s\"", $value)); + } + + $to_field_id = $this->to_fields[$lng]['id']; + + if (!array_key_exists($lng, $translations)) { + $translations[$lng] = $node->getAttribute('v'); + $this->output->writeln(sprintf(" - [%s] \"%s\" --> %s", $lng, $translations[$lng], $this->to_fields[$lng]['name'])); + } + } + } + + // cleanup source + // + if (empty($translations)) { + $this->output->writeln(sprintf(" - \"%s\" no translation found.", $value)); + $metas[$from_field_id][$kmeta]['status'] = self::NOT_TRANSLATED; + } + else if (count($translations) < count($this->to_fields)) { + $this->output->writeln(sprintf(" (incomplete translation).")); + $metas[$from_field_id][$kmeta]['status'] = self::INCOMPLETE; + } + else { + // complete translation (all target lng) + $metas[$from_field_id][$kmeta]['status'] = self::COMPLETE; + if($this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED) { + $metas[$from_field_id][$kmeta]['status'] = self::TO_BE_DELETED; + } + } + if($this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE) { + $metas[$from_field_id][$kmeta]['status'] = self::TO_BE_DELETED; + } + + // add / merge translations to targets + // + foreach($translations as $lng => $value) { + $to_field_id = $this->to_fields[$lng]['id']; + } + + 
} + + return; + } + + private function splitTermAndContext($word) + { + $term = trim($word); + $context = ''; + if (($po = strpos($term, '(')) !== false) { + if (($pc = strpos($term, ')', $po)) !== false) { + $context = trim(substr($term, $po + 1, $pc - $po - 1)); + $term = trim(substr($term, 0, $po)); + } + else { + $context = trim(substr($term, $po + 1)); + $term = trim(substr($term, 0, $po)); + } + } + + return [$term, $context]; + } + + + /** + * @return string[] + */ + public function getErrors(): array + { + return $this->errors; + } + + public function isValid(): bool + { + return empty($this->errors); + } + + /** + * @return databox|null + */ + public function getDatabox() + { + return $this->databox; + } + + /** + * @return bool + */ + public function isActive(): bool + { + return $this->active; + } + + +} diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php new file mode 100644 index 0000000000..f0c62ecad6 --- /dev/null +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php @@ -0,0 +1,117 @@ +setName('thesaurus:translate') + ->setDescription('Translate fields values using thesaurus') + ->addOption('from_collection', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('if_status', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('from_field', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('from_lng', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('to_field', null, InputOption::VALUE_IS_ARRAY | InputOption::VALUE_REQUIRED, "", null) + ->addOption('cleanup_source', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('cleanup_destination', null, InputOption::VALUE_NONE, "", null) + ->addOption('to_collection', null, InputOption::VALUE_REQUIRED, "", null) + ; + } + + /** + * @param $input + * @param $output + * @return int + */ + protected function doExecute(InputInterface $input, 
OutputInterface $output) + { + // add cool styles + $style = new OutputFormatterStyle('black', 'yellow'); // , array('bold')); + $output->getFormatter()->setStyle('warning', $style); + + $this->input = $input; + $this->output = $output; + + // config must be ok + // + try { + $this->config = GlobalConfiguration::create( + $this->container['phraseanet.appbox'], + $this->container['unicode'], + $this->container['root.path'], + $output + ); + } + catch(\Exception $e) { + $output->writeln(sprintf("missing or bad configuration: %s", $e->getMessage())); + + return -1; + } + + /** + * @var string $jobName + * @var Job $job + */ + foreach ($this->config->getJobs() as $jobName => $job) { + $output->writeln(""); + $output->writeln(sprintf("======== Playing job %s ========", $jobName)); + + if(!$job->isValid()) { + $output->writeln("Configuration error(s)... :"); + foreach ($job->getErrors() as $err) { + $output->writeln(sprintf(" - %s", $err)); + } + $output->writeln("...Job ignored"); + + continue; + } + + if(!$job->isActive()) { + $output->writeln(sprintf("job is inactive, skipped.")); + continue; + } + + $job->run(); + } + + return 0; + } + +} From 36c273327bd70a65b364a17818e299f0895edc03 Mon Sep 17 00:00:00 2001 From: jygaulier Date: Wed, 10 May 2023 15:24:38 +0200 Subject: [PATCH 2/7] WIP ; todo: use record_edit (setMetadatasByActions ?) 
to persist changes --- .../Translator/GlobalConfiguration.php | 21 +- .../Command/Thesaurus/Translator/Job.php | 225 ++++++++++++------ .../Thesaurus/Translator/TranslateCommand.php | 10 +- .../Translator/doc/configuration-sample.yml | 45 ++++ 4 files changed, 209 insertions(+), 92 deletions(-) create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php index 991e74e93b..3a387cf208 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php @@ -21,13 +21,19 @@ Class GlobalConfiguration private $databoxes = []; + /** + * @var bool + */ + private $dryRun; + /** * @param appbox $appBox * @param array $global_conf */ - private function __construct($appBox, Unicode $unicode, $global_conf, OutputInterface $output) + private function __construct($appBox, Unicode $unicode, $global_conf, bool $dryRun, OutputInterface $output) { $this->configuration = $global_conf; + $this->dryRun = $dryRun; // list databoxes and collections to access by id or by name $this->databoxes = []; @@ -59,7 +65,7 @@ Class GlobalConfiguration * @return GlobalConfiguration * @throws ConfigurationException */ - public static function create(appbox $appBox, Unicode $unicode, string $root, OutputInterface $output): GlobalConfiguration + public static function create(appbox $appBox, Unicode $unicode, string $root, bool $dryRun, OutputInterface $output): GlobalConfiguration { try { $config_file = ($config_dir = $root . self::CONFIG_DIR) . 
self::CONFIG_FILE; @@ -67,7 +73,7 @@ Class GlobalConfiguration @mkdir($config_dir, 0777, true); $config = Yaml::parse(file_get_contents($config_file)); - return new self($appBox, $unicode, $config['translator'], $output); + return new self($appBox, $unicode, $config['translator'], $dryRun, $output); } catch (\Exception $e) { throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage())); @@ -98,6 +104,11 @@ Class GlobalConfiguration return $this->databoxes[$sbasIdOrName]['collections'][$collIdOrName] ?? null; } - - + /** + * @return bool + */ + public function isDryRun(): bool + { + return $this->dryRun; + } } diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index c18f4aa8e7..b1f4da9f0d 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -14,14 +14,10 @@ use Unicode; class Job { - const ORIGINAL = 'original'; - const COMPLETE = 'complete'; - const INCOMPLETE = 'incomplete'; - const NOT_TRANSLATED = 'not_translated'; const NEVER_CLEANUP_SOURCE = 'never'; const ALWAYS_CLEANUP_SOURCE = 'always'; const CLEANUP_SOURCE_IF_TRANSLATED = 'if_translated'; - const TO_BE_DELETED = 'to_be_deleted'; + private $active = true; @@ -36,7 +32,7 @@ class Job private $selectRecordsSql = null; - /** @var array list of field ids of "fromField" (unique) and "toFields" (many) */ + /** @var array list of field ids of "source_field" (unique) and "destination_fields" (many) */ private $selectRecordFieldIds; /** @@ -44,8 +40,8 @@ class Job */ private $output; - private $from_field; // infos about the "from_field" - private $to_fields; // infos about the "to_fields" (key=lng) + private $source_field; // infos about the "source_field" + private $destination_fields; // infos about the "destination_fields" (key=lng) /** * @var Unicode @@ -57,15 +53,31 @@ class Job /** * @var DOMNodeList - * The 
thesaurus branch(es) linked to the "from_field" + * The thesaurus branch(es) linked to the "source_field" */ private $tbranches; /** @var bool */ private $cleanupDestination; - /** @var string */ + /** @var string */ private $cleanupSource = self::NEVER_CLEANUP_SOURCE; + /** + * @var GlobalConfiguration + */ + private $globalConfiguration; + /** + * @var array + */ + private $job_conf; + /** + * @var \collection|null + */ + private $setCollection = null; + /** + * @var string + */ + private $setStatus = null; // format 0xx1100xx01xxxx /** * @param GlobalConfiguration $globalConfiguration @@ -73,8 +85,10 @@ class Job */ public function __construct($globalConfiguration, $job_conf, Unicode $unicode, OutputInterface $output) { - $this->output = $output; + $this->globalConfiguration = $globalConfiguration; + $this->job_conf = $job_conf; $this->unicode = $unicode; + $this->output = $output; if (array_key_exists('active', $job_conf) && $job_conf['active'] === false) { $this->active = false; @@ -83,7 +97,7 @@ class Job } $this->errors = []; - foreach (['from_databox', 'from_field', 'from_lng'] as $mandatory) { + foreach (['databox', 'source_field', 'source_lng'] as $mandatory) { if (!isset($job_conf[$mandatory])) { $this->errors[] = sprintf("Missing mandatory setting (%s).", $mandatory); } @@ -92,55 +106,68 @@ class Job return; } - if (!($this->databox = $globalConfiguration->getDatabox($job_conf['from_databox']))) { - $this->errors[] = sprintf("unknown databox (%s).", $job_conf['from_databox']); + if (!($this->databox = $globalConfiguration->getDatabox($job_conf['databox']))) { + $this->errors[] = sprintf("unknown databox (%s).", $job_conf['databox']); return; } + if(array_key_exists('set_collection', $job_conf)) { + if(!($this->setCollection = $globalConfiguration->getCollection($this->databox->get_sbas_id(), $job_conf['set_collection']))) { + $this->errors[] = sprintf("unknown setCollection (%s).", $job_conf['set_collection']); + + return; + } + } + + 
if(array_key_exists('set_status', $job_conf)) { + $this->setStatus = $job_conf['set_status']; + } + + $cnx = $this->databox->get_connection(); - // get infos about the "from_field" + // get infos about the "source_field" // $sql = "SELECT `id`, `tbranch` FROM `metadatas_structure` WHERE `name` = :name AND `tbranch` != ''"; - $stmt = $cnx->executeQuery($sql, [':name' => $job_conf['from_field']]); - $this->from_field = $stmt->fetch(PDO::FETCH_ASSOC); + $stmt = $cnx->executeQuery($sql, [':name' => $job_conf['source_field']]); + $this->source_field = $stmt->fetch(PDO::FETCH_ASSOC); $stmt->closeCursor(); - if (!$this->from_field) { - $this->errors[] = sprintf("field (%s) not found or not linked to thesaurus.", $job_conf['from_field']); + if (!$this->source_field) { + $this->errors[] = sprintf("field (%s) not found or not linked to thesaurus.", $job_conf['source_field']); return; } - $this->from_field['lng'] = $job_conf['from_lng']; - $this->selectRecordFieldIds[] = $this->from_field['id']; + $this->source_field['lng'] = $job_conf['source_lng']; + $this->selectRecordFieldIds[] = $this->source_field['id']; $this->xpathTh = $this->databox->get_xpath_thesaurus(); - $this->tbranches = $this->xpathTh->query($this->from_field['tbranch']); + $this->tbranches = $this->xpathTh->query($this->source_field['tbranch']); if (!$this->tbranches || $this->tbranches->length <= 0) { - $this->errors[] = sprintf("thesaurus branch(es) (%s) not found.", $this->from_field['tbranch']); + $this->errors[] = sprintf("thesaurus branch(es) (%s) not found.", $this->source_field['tbranch']); return; } - // get infos about the "to_fields" + // get infos about the "destination_fields" // - $this->to_fields = []; + $this->destination_fields = []; $sql = "SELECT `id`, `name` FROM `metadatas_structure` WHERE `name` = :name "; $stmt = $cnx->prepare($sql); - foreach ($job_conf['to_fields'] as $tf) { + foreach ($job_conf['destination_fields'] as $tf) { list($lng, $fname) = explode(':', $tf); 
$stmt->execute([':name' => $fname]); if (!($row = $stmt->fetch(PDO::FETCH_ASSOC))) { $this->output->writeln(sprintf("undefined field (%s) (ignored).", $fname)); continue; } - $this->to_fields[$lng] = $row; + $this->destination_fields[$lng] = $row; $stmt->closeCursor(); $this->selectRecordFieldIds[] = $row['id']; } - if (empty($this->to_fields)) { - $this->errors[] = sprintf("no \"to_field\" found."); + if (empty($this->destination_fields)) { + $this->errors[] = sprintf("no \"destination_field\" found."); return; } @@ -153,9 +180,9 @@ class Job // $selectRecordClauses = []; $this->selectRecordParams = []; - if (array_key_exists('from_collection', $job_conf)) { - if (!($coll = $globalConfiguration->getCollection($job_conf['from_databox'], $job_conf['from_collection']))) { - $this->errors[] = sprintf("unknown collection (%s)", $job_conf['from_collection']); + if (array_key_exists('if_collection', $job_conf)) { + if (!($coll = $globalConfiguration->getCollection($job_conf['databox'], $job_conf['if_collection']))) { + $this->errors[] = sprintf("unknown collection (%s)", $job_conf['if_collection']); return; } @@ -169,17 +196,14 @@ class Job $this->selectRecordParams[':sb_equ'] = str_replace('x', '0', $job_conf['if_status']); } -// if ($this->cleanupDestination) { -// // if we must empty the destination field(s), no need to get the values -// $selectRecordClauses[] = "`meta_struct_id` = :ffid"; -// $this->selectRecordParams[':ffid'] = $this->from_field['id']; -// } -// else { - // if we add translations, we must fetch the actual values - $selectRecordClauses[] = "`meta_struct_id` IN (" . join(',', array_map(function ($id) use ($cnx) { + $selectRecordClauses[] = "`meta_struct_id` IN (" + . join( + ',', + array_map(function ($id) use ($cnx) { return $cnx->quote($id); - }, $this->selectRecordFieldIds)) . ")"; -// } + }, $this->selectRecordFieldIds) + ) + . 
")"; $sql = "SELECT `record_id`, `meta_struct_id`, `metadatas`.`id` AS meta_id, `value` FROM"; $sql .= " `record` INNER JOIN `metadatas` USING(`record_id`)"; @@ -208,7 +232,7 @@ class Job $metas = $emptyValues; } - $metas[$row['meta_struct_id']][$row['meta_id']] = ['value' => $row['value'], 'status' => self::ORIGINAL]; + $metas[$row['meta_struct_id']][$row['meta_id']] = $row['value']; } $this->doRecord($currentRid, $metas); // flush last record @@ -217,31 +241,43 @@ class Job private function doRecord($record_id, $metas) { - // loop on every "from" values - $from_field_id = $this->from_field['id']; $this->output->writeln(sprintf("record id: %s", $record_id)); - // loop on every value of the "from_field" - // - foreach ($metas[$from_field_id] as $kmeta => $meta) { - $value = $meta['value']; - // $this->output->write(sprintf(" - \"%s\"", $value)); + $source_field_id = $this->source_field['id']; + $meta_to_delete = []; // key = id, to easily keep unique + $meta_to_add = []; - $t = $this->splitTermAndContext($value); + if ($this->cleanupDestination) { + foreach ($this->destination_fields as $lng => $destination_field) { + $destination_field_id = $destination_field['id']; + foreach ($metas[$destination_field_id] as $meta_id => $value) { + $meta_to_delete[$meta_id] = $value; + } + unset($meta_id, $value); + } + unset($lng, $destination_field, $destination_field_id); + } + + // loop on every value of the "source_field" + // + foreach ($metas[$source_field_id] as $source_meta_id => $source_value) { + + $t = $this->splitTermAndContext($source_value); $q = '@w=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($t[0])) . '\''; if ($t[1]) { $q .= ' and @k=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($t[1])) . '\''; } - $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->from_field['lng']) . '\''; + $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->source_field['lng']) . '\''; $q = '//sy[' . $q . 
']/../sy'; + unset($t); - // loop on every tbranch (one field may be linked on many branches) + // loop on every tbranch (one field may be linked to many branches) // $translations = []; // ONE translation per lng (first found in th) /** @var DOMNode $tbranch */ foreach ($this->tbranches as $tbranch) { if (!($nodes = $this->xpathTh->query($q, $tbranch))) { - $this->output->writeln(sprintf(" - \"%s\" xpath error on (%s), ignored.", $value, $q)); + $this->output->writeln(sprintf(" - \"%s\" xpath error on (%s), ignored.", $source_value, $q)); continue; } @@ -251,56 +287,87 @@ class Job foreach ($nodes as $node) { $lng = $node->getAttribute('lng'); - // ignore synonyms not in one of the "to_field" languages + // ignore synonyms not in one of the "destination_field" languages // - if (!array_key_exists($lng, $this->to_fields)) { + if (!array_key_exists($lng, $this->destination_fields)) { continue; } - if (empty($translations)) { - // first translation: begin list - $this->output->writeln(sprintf(" - \"%s\"", $value)); - } - - $to_field_id = $this->to_fields[$lng]['id']; + $translated_value = $node->getAttribute('v'); + $destination_field_id = $this->destination_fields[$lng]['id']; if (!array_key_exists($lng, $translations)) { - $translations[$lng] = $node->getAttribute('v'); - $this->output->writeln(sprintf(" - [%s] \"%s\" --> %s", $lng, $translations[$lng], $this->to_fields[$lng]['name'])); + if (($destination_meta_id = array_search($translated_value, $metas[$destination_field_id])) === false) { + $translations[$lng] = [ + 'val' => $translated_value, + 'msg' => sprintf(" --> %s", $this->destination_fields[$lng]['name']) + ];; + $meta_to_add[$destination_field_id][] = $translated_value; + } + else { + $translations[$lng] = [ + 'val' => $translated_value, + 'msg' => sprintf("already in %s", $this->destination_fields[$lng]['name']) + ]; + unset($meta_to_delete[$destination_meta_id]); + } + unset($destination_meta_id); } + unset($lng, $destination_field_id, 
$translated_value); } + unset($nodes, $node, $tbranch); } + unset($q); // cleanup source // if (empty($translations)) { - $this->output->writeln(sprintf(" - \"%s\" no translation found.", $value)); - $metas[$from_field_id][$kmeta]['status'] = self::NOT_TRANSLATED; + $this->output->writeln(sprintf(" - \"%s\" : no translation found.", $source_value)); } - else if (count($translations) < count($this->to_fields)) { - $this->output->writeln(sprintf(" (incomplete translation).")); - $metas[$from_field_id][$kmeta]['status'] = self::INCOMPLETE; + else if (count($translations) < count($this->destination_fields)) { + $this->output->writeln(sprintf(" - \"%s\" : incomplete translation.", $source_value)); } else { // complete translation (all target lng) - $metas[$from_field_id][$kmeta]['status'] = self::COMPLETE; - if($this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED) { - $metas[$from_field_id][$kmeta]['status'] = self::TO_BE_DELETED; + $this->output->writeln(sprintf(" - \"%s\" :", $source_value)); + if ($this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED) { + $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; } } - if($this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE) { - $metas[$from_field_id][$kmeta]['status'] = self::TO_BE_DELETED; + + foreach ($translations as $lng => $translation) { + $this->output->writeln(sprintf(" - [%s] \"%s\" %s", $lng, $translation['val'], $translation['msg'])); } - // add / merge translations to targets - // - foreach($translations as $lng => $value) { - $to_field_id = $this->to_fields[$lng]['id']; + if ($this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE) { + $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; } + unset($lng, $translations, $translation); + } + + unset($metas, $source_meta_id, $source_value); + + if (!$this->globalConfiguration->isDryRun()) { + $record = $this->getDatabox()->getRecordRepository()->find($record_id); + + // todo : delete meta where id in 
array_keys($meta_to_delete) ; $meta_to_delete[meta_id] = meta_value + $this->output->writeln(sprintf("DELETE : %s", var_export($meta_to_delete, true))); + + // todo : add meta from $meta_to_add ; $meta_to_add[meta_struct_id] = array of values + $this->output->writeln(sprintf("ADD : %s", var_export($meta_to_add, true))); + + if(!is_null($this->setCollection)) { + // todo : move record + $this->output->writeln(sprintf("MOVE TO : %s", $this->setCollection->get_name())); + } + + if(!is_null($this->setStatus)) { + // todo : change status + $this->output->writeln(sprintf("SET STATUS : %s", $this->setStatus)); + } } - return; } private function splitTermAndContext($word) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php index f0c62ecad6..6549c11731 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php @@ -44,14 +44,7 @@ class TranslateCommand extends phrCommand { $this->setName('thesaurus:translate') ->setDescription('Translate fields values using thesaurus') - ->addOption('from_collection', null, InputOption::VALUE_REQUIRED, "", null) - ->addOption('if_status', null, InputOption::VALUE_REQUIRED, "", null) - ->addOption('from_field', null, InputOption::VALUE_REQUIRED, "", null) - ->addOption('from_lng', null, InputOption::VALUE_REQUIRED, "", null) - ->addOption('to_field', null, InputOption::VALUE_IS_ARRAY | InputOption::VALUE_REQUIRED, "", null) - ->addOption('cleanup_source', null, InputOption::VALUE_REQUIRED, "", null) - ->addOption('cleanup_destination', null, InputOption::VALUE_NONE, "", null) - ->addOption('to_collection', null, InputOption::VALUE_REQUIRED, "", null) + ->addOption('dry', null, InputOption::VALUE_NONE, "list translations but don't apply.", null) ; } @@ -76,6 +69,7 @@ class TranslateCommand extends phrCommand 
$this->container['phraseanet.appbox'], $this->container['unicode'], $this->container['root.path'], + $input->getOption('dry'), $output ); } diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml new file mode 100644 index 0000000000..7d3a28c72a --- /dev/null +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml @@ -0,0 +1,45 @@ +translator: + jobs: + # + # first job : translate EN keywords to FR and DE + # then change status-bit to elect record for job 2 + keywords_EN_to_FR_DE: + active: true + databox: my_databox + if_collection: to_translate + if_status: xx1xxxx + source_field: KeywordsEN + source_lng: en + destination_fields: + - fr:keywordsFR + - de:keywordsDE + cleanup_source: if_translated + # job 1 cleans the destination fields + cleanup_destination: true + # do NOT change collection because job 2 looks here... + #-- set_collection: online + # ... but change status + set_status: 010xxxx + + # + # second (last) job : translate EN country to FR and DE, add as keywords ! 
+ # + country_EN_to_FR_DE: + active: true + databox: my_databox + # same collection as job 1 + if_collection: to_translate + # status was changet by job 1 + if_status: 010xxxx + source_field: CountryEN + source_lng: en + # add translated country to the keywords + destination_fields: + - fr:keywordsFR + - de:keywordsDE + cleanup_source: if_translated + # job 2 must NOT erase what job 1 did + cleanup_destination: false + # the last job will change collection + set_collection: online + set_status: 100xxxx From 77c5521243a0985174ca28aa0a04a144756c031d Mon Sep 17 00:00:00 2001 From: jygaulier Date: Wed, 10 May 2023 19:37:53 +0200 Subject: [PATCH 3/7] apply actions (translations, coll, sb) ; enhance setMetadatasByActions():"add" action to act on mono-fields --- .../Command/Thesaurus/Translator/Job.php | 50 +++++++++++++++---- .../Translator/doc/configuration-sample.yml | 4 +- lib/classes/record/adapter.php | 16 +++--- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index b1f4da9f0d..debaf9b218 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -234,7 +234,9 @@ class Job $metas[$row['meta_struct_id']][$row['meta_id']] = $row['value']; } - $this->doRecord($currentRid, $metas); // flush last record + if($currentRid !== '?') { + $this->doRecord($currentRid, $metas); // flush last record + } $stmt->closeCursor(); } @@ -350,22 +352,50 @@ class Job if (!$this->globalConfiguration->isDryRun()) { $record = $this->getDatabox()->getRecordRepository()->find($record_id); + $actions = []; - // todo : delete meta where id in array_keys($meta_to_delete) ; $meta_to_delete[meta_id] = meta_value - $this->output->writeln(sprintf("DELETE : %s", var_export($meta_to_delete, true))); - - // todo : add meta from $meta_to_add ; $meta_to_add[meta_struct_id] = array of values - 
$this->output->writeln(sprintf("ADD : %s", var_export($meta_to_add, true))); + $metadatas = []; + foreach (array_keys($meta_to_delete) as $id) { + $metadatas[] = [ + 'action' => "delete", + 'meta_id' => $id + ]; + } + foreach($meta_to_add as $struct_id => $values) { + $metadatas[] = [ + 'action' => "add", + 'meta_struct_id' => $struct_id, + 'value' => $values + ]; + } + if(!empty($metadatas)) { + $actions['metadatas'] = $metadatas; + } + unset($metadatas); if(!is_null($this->setCollection)) { - // todo : move record - $this->output->writeln(sprintf("MOVE TO : %s", $this->setCollection->get_name())); + $actions['base_id'] = $this->setCollection->get_base_id(); } if(!is_null($this->setStatus)) { - // todo : change status - $this->output->writeln(sprintf("SET STATUS : %s", $this->setStatus)); + $status = []; + foreach(str_split(strrev($this->setStatus), 1) as $bit => $v) { + if($v === '0' || $v === '1') { + $status[] = [ + 'bit' => $bit, + 'state' => $v === '1' + ]; + } + } + if(!empty($status)) { + $actions['status'] = $status; + } } + + $jsActions = json_encode($actions, JSON_PRETTY_PRINT); + // $this->output->writeln(sprintf("JS : %s", $jsActions)); + + $record->setMetadatasByActions(json_decode($jsActions)); } } diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml index 7d3a28c72a..dd4dfe5224 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/configuration-sample.yml @@ -1,3 +1,5 @@ +# /config/translator/configuration.yml + translator: jobs: # @@ -22,7 +24,7 @@ translator: set_status: 010xxxx # - # second (last) job : translate EN country to FR and DE, add as keywords ! + # second (last) job : translate EN country to FR and DE, add also as keywords ! 
# country_EN_to_FR_DE: active: true diff --git a/lib/classes/record/adapter.php b/lib/classes/record/adapter.php index 69d07273f2..d66146709c 100644 --- a/lib/classes/record/adapter.php +++ b/lib/classes/record/adapter.php @@ -1490,7 +1490,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface foreach ($values as $value) { if ($value) { $ops[] = [ - 'expain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()), + 'explain' => sprintf('set:: adding value "%s" to "%s" (multi)', $value, $sf->get_name()), 'meta_struct_id' => $sf->get_id(), 'meta_id' => $meta_id, // can be null 'value' => $value @@ -1505,7 +1505,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface } if( ($value = $values[0]) ) { $ops[] = [ - 'expain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()), + 'explain' => sprintf('adding value "%s" to "%s" (mono)', $value, $sf->get_name()), 'meta_struct_id' => $sf->get_id(), 'meta_id' => $meta_id, // probably null, 'value' => $value @@ -1522,7 +1522,6 @@ class record_adapter implements RecordInterface, cache_cacheableInterface * @param string[] $values * * @return array ops to execute - * @throws Exception */ private function metadata_add($struct_fields, $values) { @@ -1531,11 +1530,12 @@ class record_adapter implements RecordInterface, cache_cacheableInterface // now set values to matching struct_fields foreach ($struct_fields as $sf) { if(!$sf->is_multi()) { - throw new Exception(sprintf("can't \"add\" to mono-valued (%s).", $sf->get_name())); + // easy support "add" on mono : join values... 
+ $values = [ join(' ; ', $values) ]; } foreach ($values as $value) { $ops[] = [ - 'expain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()), + 'explain' => sprintf('add:: adding value "%s" to "%s"', $value, $sf->get_name()), 'meta_struct_id' => $sf->get_id(), 'meta_id' => null, 'value' => $value @@ -1577,7 +1577,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface } // then add the replacing value $ops[] = [ - 'expain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()), + 'explain' => sprintf('rpl::match_all: adding value "%s" to "%s"', $replace_with, $cf->get_name()), 'meta_struct_id' => $cf->get_meta_struct_id(), 'meta_id' => null, 'value' => $replace_with @@ -1590,7 +1590,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface foreach ($cf->get_values() as $field_value) { if ($field_value->getId() === $meta_id) { $ops[] = [ - 'expain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with), + 'explain' => sprintf('rpl::match_meta_id %s (field "%s") set value "%s"', $field_value->getId(), $cf->get_name(), $replace_with), 'meta_struct_id' => $cf->get_meta_struct_id(), 'meta_id' => $field_value->getId(), 'value' => $replace_with @@ -1609,7 +1609,7 @@ class record_adapter implements RecordInterface, cache_cacheableInterface } if ($this->match($value, $match_method, $field_value->getValue())) { $ops[] = [ - 'expain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw), + 'explain' => sprintf('rpl::match_value "%s" (field "%s") set value "%s"', $field_value->getValue(), $cf->get_name(), $rw), 'meta_struct_id' => $cf->get_meta_struct_id(), 'meta_id' => $field_value->getId(), 'value' => $rw From a9b7c77b6e2d4d513c2f06391397aee084c22542 Mon Sep 17 00:00:00 2001 From: jygaulier Date: Thu, 11 May 2023 20:21:33 +0200 Subject: [PATCH 4/7] add doc ; 
handle source = destination --- .../Command/Thesaurus/Translator/Job.php | 124 ++++++---- .../Thesaurus/Translator/doc/translator.md | 218 ++++++++++++++++++ 2 files changed, 292 insertions(+), 50 deletions(-) create mode 100644 lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/translator.md diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index debaf9b218..0c15239079 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -97,7 +97,7 @@ class Job } $this->errors = []; - foreach (['databox', 'source_field', 'source_lng'] as $mandatory) { + foreach (['active', 'databox', 'source_field', 'destination_fields'] as $mandatory) { if (!isset($job_conf[$mandatory])) { $this->errors[] = sprintf("Missing mandatory setting (%s).", $mandatory); } @@ -138,7 +138,7 @@ class Job return; } - $this->source_field['lng'] = $job_conf['source_lng']; + $this->source_field['lng'] = array_key_exists('source_lng', $job_conf) ? $job_conf['source_lng'] : null; $this->selectRecordFieldIds[] = $this->source_field['id']; $this->xpathTh = $this->databox->get_xpath_thesaurus(); $this->tbranches = $this->xpathTh->query($this->source_field['tbranch']); @@ -269,7 +269,9 @@ class Job if ($t[1]) { $q .= ' and @k=\'' . \thesaurus::xquery_escape($this->unicode->remove_indexer_chars($t[1])) . '\''; } - $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->source_field['lng']) . '\''; + if(!is_null($this->source_field['lng'])) { + $q .= ' and @lng=\'' . \thesaurus::xquery_escape($this->source_field['lng']) . '\''; + } $q = '//sy[' . $q . 
']/../sy'; unset($t); @@ -302,13 +304,15 @@ class Job if (($destination_meta_id = array_search($translated_value, $metas[$destination_field_id])) === false) { $translations[$lng] = [ 'val' => $translated_value, + 'id' => null, 'msg' => sprintf(" --> %s", $this->destination_fields[$lng]['name']) - ];; + ]; $meta_to_add[$destination_field_id][] = $translated_value; } else { $translations[$lng] = [ 'val' => $translated_value, + 'id' => $destination_meta_id, 'msg' => sprintf("already in %s", $this->destination_fields[$lng]['name']) ]; unset($meta_to_delete[$destination_meta_id]); @@ -333,7 +337,17 @@ class Job // complete translation (all target lng) $this->output->writeln(sprintf(" - \"%s\" :", $source_value)); if ($this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED) { - $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; + // do NOT delete the source term if one translation found it as already present as destination (possible if source=destination) + $used = false; + foreach($translations as $l => $t) { + if($t['id'] === $source_meta_id) { + $used = true; + break; + } + } + if(!$used) { + $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; + } } } @@ -342,7 +356,17 @@ class Job } if ($this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE) { - $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; + // do NOT delete the source term if one translation found it as already present as destination (possible if source=destination) + $used = false; + foreach($translations as $l => $t) { + if($t['id'] === $source_meta_id) { + $used = true; + break; + } + } + if(!$used) { + $meta_to_delete[$source_meta_id] = $metas[$source_field_id][$source_meta_id]; + } } unset($lng, $translations, $translation); @@ -350,52 +374,52 @@ class Job unset($metas, $source_meta_id, $source_value); + $actions = []; + + $metadatas = []; + foreach ($meta_to_delete as $id => $value) { + $metadatas[] = [ + 'action' => 
"delete", + 'meta_id' => $id, + '_value_' => $value + ]; + } + foreach($meta_to_add as $struct_id => $values) { + $metadatas[] = [ + 'action' => "add", + 'meta_struct_id' => $struct_id, + 'value' => $values + ]; + } + if(!empty($metadatas)) { + $actions['metadatas'] = $metadatas; + } + unset($metadatas); + + if(!is_null($this->setCollection)) { + $actions['base_id'] = $this->setCollection->get_base_id(); + } + + if(!is_null($this->setStatus)) { + $status = []; + foreach(str_split(strrev($this->setStatus), 1) as $bit => $v) { + if($v === '0' || $v === '1') { + $status[] = [ + 'bit' => $bit, + 'state' => $v === '1' + ]; + } + } + if(!empty($status)) { + $actions['status'] = $status; + } + } + $jsActions = json_encode($actions, JSON_PRETTY_PRINT); + $this->output->writeln(sprintf("JS : %s", $jsActions)); + if (!$this->globalConfiguration->isDryRun()) { $record = $this->getDatabox()->getRecordRepository()->find($record_id); - $actions = []; - - $metadatas = []; - foreach (array_keys($meta_to_delete) as $id) { - $metadatas[] = [ - 'action' => "delete", - 'meta_id' => $id - ]; - } - foreach($meta_to_add as $struct_id => $values) { - $metadatas[] = [ - 'action' => "add", - 'meta_struct_id' => $struct_id, - 'value' => $values - ]; - } - if(!empty($metadatas)) { - $actions['metadatas'] = $metadatas; - } - unset($metadatas); - - if(!is_null($this->setCollection)) { - $actions['base_id'] = $this->setCollection->get_base_id(); - } - - if(!is_null($this->setStatus)) { - $status = []; - foreach(str_split(strrev($this->setStatus), 1) as $bit => $v) { - if($v === '0' || $v === '1') { - $status[] = [ - 'bit' => $bit, - 'state' => $v === '1' - ]; - } - } - if(!empty($status)) { - $actions['status'] = $status; - } - } - - $jsActions = json_encode($actions, JSON_PRETTY_PRINT); - // $this->output->writeln(sprintf("JS : %s", $jsActions)); - - $record->setMetadatasByActions(json_decode($jsActions)); +// $record->setMetadatasByActions(json_decode($jsActions)); } } diff --git 
a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/translator.md b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/translator.md new file mode 100644 index 0000000000..3aa1dbea40 --- /dev/null +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/doc/translator.md @@ -0,0 +1,218 @@ +# Translator + +Translator is a console command that uses the thesaurus to translate terms from one field (source), to one or many fields (destinations). + +It will act on records matching conditions like "from this collection" or +"if this status-bit is 1". + +Translator plays __jobs__ one after another; each __job__ can define its own settings. +Jobs and settings are declared in a configuration file (yml): + +```yaml +# /config/translator/configuration.yml + +translator: + jobs: + keywords_EN_to_FR_DE: + active: true + databox: my_databox + ... + country_EN_to_FR_DE: + active: false + ... +``` + +## Job settings: + +- `active` : (mandatory) boolean to activate the job. +- `databox`: (mandatory) The databox name|id to act on. +- `if_collection`: (optional) The unique collection name|id to act on; Default if not set: All collections. +- `if_status`: (optional) Act only on records matching this status-bits mask; Format 01x10xxxx; Default: All records. +- `source_field`: (mandatory) The name of the source field containing terms to be translated. +- `source_lng`: (optional) The language of the source terms to translate. If set, only terms matching this lng will be searched in the thesaurus. Default if not set: Search term without language criteria. +- `destination_fields`: (mandatory) A __list__ of destinations using format `{lng}:{field name}`; +Each translated term (from thesaurus) will be directed to the matching field, depending on its lng (see examples). +- `cleanup_source`: (optional) Whether to remove or keep the source term, depending on whether it was successfully translated or not. + - `never`: keep the term (default).
+ - `if_translated`: remove if fully translated (all destination lngs). + - `always`: remove the term even if it was not translated. +- `cleanup_destination`: (optional) Empty the destination(s) field(s) before translation (default `false`) +- `set_collection`: (optional) collection where to move the record after translation. +- `set_status`: (optional) status-bit mask to apply on record after translation. + +## Important: + +#### After playing job(s), no record must still match the selection conditions `if_collection`, `if_status`. + +- Because a job will act on __all__ records matching the `if_collection` and `if_status` conditions, +one __should__ change the collection or sb after translation (`set_collection` and `set_status` settings). + + +- Because each job declares its own conditions, playing multiple jobs must implement a _workflow_ mechanism: + - job 1 selects records matching conditions A (coll/sb) and __must__ change collection and/or status to match conditions (B) of job 2. + - job 2 selects records matching conditions B and __must__ set new final values that match neither A nor B. + + +- Because jobs are played one after another, in case of many jobs acting on same records, workflow can be simplified: + - __first__ job 1 selects records matching "work-on" conditions, and does not change anything after translation. + - job 2 selects using the same conditions and does not change conditions either. + - __last__ job 3 selects using the same conditions, and is responsible for changing collection and/or status when done. + +Those rules prevent the job(s) from running multiple times on the same records. Of course care must be taken if one part of a workflow is de-activated. + +#### Cleanup with multiple jobs. + +- Because job n+1 is played after job n is fully completed, care must be taken when using `cleanup` options: + - If acting on same source, `cleanup_source: always` must only be applied on __last__ job, else job 1 will remove every term that job 2 should work on.
+ (This case might not happen since - thanks to multiple destinations - there is no reason to act on same source twice). + + - Same care with multiple jobs writing on same destination(s): `cleanup_destination: true` should be set only on __first__ job, else job 2 will erase what job 1 has done. + + + +## Example 1: +### translate new records (having default sb=0). +```yaml +translator: + jobs: + example: + active: true + databox: my_databox + # condition: act on new records having "translated" sb[4]=0 + if_status: 0xxxx + # original keywords are expected to be EN + source_field: KeywordsEN + source_lng: en + # translate to 2 separate fields + destination_fields: + - fr:keywordsFR + - de:keywordsDE + # keep original EN keywords + cleanup_source: never + # remove existing terms on destinations before translating + cleanup_destination: true + # end: set "translated" sb to 1 + set_status: 1xxxx +``` + +## Example 2: +### manually select records to translate by setting sb[4]. +```yaml +translator: + jobs: + example: + # ... + # condition: act on records having "to translate" sb[4]=1 + if_status: x1xxxx + # end: mark the record as "translated" + set_status: 10xxxx +``` + +## Example 3: +### translate new records from temporary collection. +```yaml +translator: + jobs: + example: + # ... + if_collection: 'upload' + set_collection: 'online' +``` + +## Example 4: +### add translations to the same field + +__Trick__: +If one cleans the destination field - the __same as the source__ -, the original source will be deleted. +If the intent is to preserve the original term (adding translations), it must be added again. + +The program will detect that the same term is to be deleted then added, and will preserve the original one. + +```yaml +translator: + jobs: + example: + # ... + source_field: Keywords + source_lng: en + # since source=destination, source will be cleaned of all not-translatable terms... + cleanup_destination: true + destination_fields: + # ...
this is why one must re-add the EN "translated" term (same as source) + - en:Keywords + - fr:Keywords + - de:Keywords +``` + +## Example 4-bis: +### removing terms that are not in the thesaurus + +```yaml +translator: + jobs: + example: + # ... + source_field: Keywords + source_lng: en + cleanup_source: always + destination_fields: + - en:Keywords +``` + +## Example 5: +### merge many sources to one "tote bag" +```yaml +translator: + jobs: + keywords: + active: true + databox: my_databox + # manual start condition: set sb[4] + if_status: xxx1xxxx + # original keywords are expected to be EN + source_field: keywords + source_lng: en + # translate to a common field + destination_fields: + - fr:motscles + # each job can clean its own distinct source + cleanup_source: always + # first job cleans up destination + cleanup_destination: true + # end: set ready for next job + set_status: 0010xxxx + country: + active: true + databox: my_databox + # condition: set by previous job + if_status: 0010xxxx + # original country is expected to be EN + source_field: country + source_lng: en + # translate to the same destination + destination_fields: + - fr:motscles + # each job can clean its own distinct source + cleanup_source: always + # do NOT cleanup destination, first job did it + cleanup_destination: false + # end: set ready for next job + set_status: 0100xxxx + city: + active: true + databox: my_databox + # condition: set by previous job + if_status: 0100xxxx + # original city is expected to be EN + source_field: city + source_lng: en + # translate to the same field + destination_fields: + - fr:motscles + # each job can clean its own distinct source + cleanup_source: always + # do NOT cleanup destination, first job did it + cleanup_destination: false + # end: set to "translated" + set_status: 1000xxxx +``` From ecb6baaa89b58fc80e473867f720a18ac7c15c43 Mon Sep 17 00:00:00 2001 From: jygaulier Date: Mon, 15 May 2023 10:16:44 +0200 Subject: [PATCH 5/7] remove debug & restore write
--- lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index 0c15239079..5722677818 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -415,11 +415,11 @@ class Job } } $jsActions = json_encode($actions, JSON_PRETTY_PRINT); - $this->output->writeln(sprintf("JS : %s", $jsActions)); + // $this->output->writeln(sprintf("JS : %s", $jsActions)); if (!$this->globalConfiguration->isDryRun()) { $record = $this->getDatabox()->getRecordRepository()->find($record_id); -// $record->setMetadatasByActions(json_decode($jsActions)); + $record->setMetadatasByActions(json_decode($jsActions)); } } From a4210fe26cf063388385ac071f7e0356912ebf3f Mon Sep 17 00:00:00 2001 From: jygaulier Date: Mon, 15 May 2023 10:22:56 +0200 Subject: [PATCH 6/7] add debug of actions (setMetadatasByActions) with -vvv --- lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index 5722677818..52ba55050b 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -415,7 +415,9 @@ class Job } } $jsActions = json_encode($actions, JSON_PRETTY_PRINT); - // $this->output->writeln(sprintf("JS : %s", $jsActions)); + if($this->output->getVerbosity() >= OutputInterface::VERBOSITY_DEBUG) { + $this->output->writeln(sprintf("JS : %s", $jsActions)); + } if (!$this->globalConfiguration->isDryRun()) { $record = $this->getDatabox()->getRecordRepository()->find($record_id); From 743070a92ca7e096321d8cda0af98def874b5cb1 Mon Sep 17 00:00:00 2001 From: jygaulier Date: Wed, 17 May 2023 15:25:26 
+0200 Subject: [PATCH 7/7] mark as beta ; add --report ; display json with -vv --- .../Translator/GlobalConfiguration.php | 19 +++- .../Command/Thesaurus/Translator/Job.php | 92 +++++++++++++++++-- .../Thesaurus/Translator/TranslateCommand.php | 11 ++- 3 files changed, 111 insertions(+), 11 deletions(-) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php index 3a387cf208..c097c63634 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/GlobalConfiguration.php @@ -25,15 +25,20 @@ Class GlobalConfiguration * @var bool */ private $dryRun; + /** + * @var string + */ + private $reportFormat; /** * @param appbox $appBox * @param array $global_conf */ - private function __construct($appBox, Unicode $unicode, $global_conf, bool $dryRun, OutputInterface $output) + private function __construct($appBox, Unicode $unicode, $global_conf, bool $dryRun, string $reportFormat, OutputInterface $output) { $this->configuration = $global_conf; $this->dryRun = $dryRun; + $this->reportFormat = $reportFormat; // list databoxes and collections to access by id or by name $this->databoxes = []; @@ -65,7 +70,7 @@ Class GlobalConfiguration * @return GlobalConfiguration * @throws ConfigurationException */ - public static function create(appbox $appBox, Unicode $unicode, string $root, bool $dryRun, OutputInterface $output): GlobalConfiguration + public static function create(appbox $appBox, Unicode $unicode, string $root, bool $dryRun, string $reportFormat, OutputInterface $output): GlobalConfiguration { try { $config_file = ($config_dir = $root . self::CONFIG_DIR) . 
self::CONFIG_FILE; @@ -73,7 +78,7 @@ Class GlobalConfiguration @mkdir($config_dir, 0777, true); $config = Yaml::parse(file_get_contents($config_file)); - return new self($appBox, $unicode, $config['translator'], $dryRun, $output); + return new self($appBox, $unicode, $config['translator'], $dryRun, $reportFormat, $output); } catch (\Exception $e) { throw new ConfigurationException(sprintf("missing or bad configuration (%s)", $e->getMessage())); @@ -111,4 +116,12 @@ Class GlobalConfiguration { return $this->dryRun; } + + /** + * @return string + */ + public function getReportFormat(): string + { + return $this->reportFormat; + } } diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php index 52ba55050b..fcd5d21ebd 100644 --- a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/Job.php @@ -79,6 +79,23 @@ class Job */ private $setStatus = null; // format 0xx1100xx01xxxx + /** + * @var array + */ + private $notTranslated; // for condensed report + /** + * @var array + */ + private $incompletelyTranslated; // for condensed report + /** + * @var array + */ + private $fullyTranslated; // for condensed report + /** + * @var int + */ + private $recordsDone; // for condensed report + /** * @param GlobalConfiguration $globalConfiguration * @param array $job_conf @@ -218,6 +235,11 @@ class Job $stmt = $cnx->executeQuery($this->selectRecordsSql, $this->selectRecordParams); $currentRid = '?'; + $this->recordsDone = 0; + $this->notTranslated = []; + $this->incompletelyTranslated = []; + $this->fullyTranslated = []; + $metas = $emptyValues = array_map(function () { return []; }, array_flip($this->selectRecordFieldIds)); @@ -239,11 +261,42 @@ class Job } $stmt->closeCursor(); + + // condensed report + // + if($this->globalConfiguration->getReportFormat() === 'condensed') { + $this->output->writeln(sprintf("%d records done.", $this->recordsDone)); + 
if(!empty($this->notTranslated)) { + ksort($this->notTranslated, SORT_STRING|SORT_FLAG_CASE); + $this->output->writeln("Not translated terms:"); + foreach ($this->notTranslated as $term => $n) { + $this->output->writeln(sprintf(" - \"%s\" (%d times)", $term, $n)); + } + } + if(!empty($this->incompletelyTranslated)) { + ksort($this->incompletelyTranslated, SORT_STRING|SORT_FLAG_CASE); + $this->output->writeln("Incompletely translated terms:"); + foreach ($this->incompletelyTranslated as $term => $n) { + $this->output->writeln(sprintf(" - \"%s\" (%d times)", $term, $n)); + } + } + if(!empty($this->fullyTranslated)) { + ksort($this->fullyTranslated, SORT_STRING|SORT_FLAG_CASE); + $this->output->writeln("Fully translated terms:"); + foreach ($this->fullyTranslated as $term => $n) { + $this->output->writeln(sprintf(" - \"%s\" (%d times)", $term, $n)); + } + } + } } private function doRecord($record_id, $metas) { - $this->output->writeln(sprintf("record id: %s", $record_id)); + $reportFormat = $this->globalConfiguration->getReportFormat(); + + if($reportFormat !== 'condensed') { + $this->output->writeln(sprintf("record id: %s", $record_id)); + } $source_field_id = $this->source_field['id']; $meta_to_delete = []; // key = id, to easily keep unique @@ -328,14 +381,24 @@ class Job // cleanup source // if (empty($translations)) { - $this->output->writeln(sprintf(" - \"%s\" : no translation found.", $source_value)); + if($reportFormat === 'all') { + $this->output->writeln(sprintf(" - \"%s\" : no translation found.", $source_value)); + } + $this->addToCondensedReport($source_value, $this->notTranslated); } else if (count($translations) < count($this->destination_fields)) { - $this->output->writeln(sprintf(" - \"%s\" : incomplete translation.", $source_value)); + if(in_array($reportFormat, ['all', 'translated'])) { + $this->output->writeln(sprintf(" - \"%s\" : incomplete translation.", $source_value)); + } + $this->addToCondensedReport($source_value, 
$this->incompletelyTranslated); } else { // complete translation (all target lng) - $this->output->writeln(sprintf(" - \"%s\" :", $source_value)); + if(in_array($reportFormat, ['all', 'translated'])) { + $this->output->writeln(sprintf(" - \"%s\" :", $source_value)); + } + $this->addToCondensedReport($source_value, $this->fullyTranslated); + if ($this->cleanupSource === self::CLEANUP_SOURCE_IF_TRANSLATED) { // do NOT delete the source term if one translation found it as already present as destination (possible if source=destination) $used = false; @@ -351,8 +414,10 @@ class Job } } - foreach ($translations as $lng => $translation) { - $this->output->writeln(sprintf(" - [%s] \"%s\" %s", $lng, $translation['val'], $translation['msg'])); + if(in_array($reportFormat, ['all', 'translated'])) { + foreach ($translations as $lng => $translation) { + $this->output->writeln(sprintf(" - [%s] \"%s\" %s", $lng, $translation['val'], $translation['msg'])); + } } if ($this->cleanupSource === self::ALWAYS_CLEANUP_SOURCE) { @@ -414,8 +479,9 @@ class Job $actions['status'] = $status; } } + $jsActions = json_encode($actions, JSON_PRETTY_PRINT); - if($this->output->getVerbosity() >= OutputInterface::VERBOSITY_DEBUG) { + if($this->output->getVerbosity() >= OutputInterface::VERBOSITY_VERY_VERBOSE) { $this->output->writeln(sprintf("JS : %s", $jsActions)); } @@ -424,6 +490,18 @@ class Job $record->setMetadatasByActions(json_decode($jsActions)); } + $this->recordsDone++; + } + + private function addToCondensedReport($term, &$where) + { + if($this->globalConfiguration->getReportFormat() !== 'condensed') { + return; + } + if(!array_key_exists($term, $where)) { + $where[$term] = 0; + } + $where[$term]++; } private function splitTermAndContext($word) diff --git a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php index 6549c11731..1758d50c82 100644 --- 
a/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php +++ b/lib/Alchemy/Phrasea/Command/Thesaurus/Translator/TranslateCommand.php @@ -43,8 +43,10 @@ class TranslateCommand extends phrCommand public function configure() { $this->setName('thesaurus:translate') - ->setDescription('Translate fields values using thesaurus') + ->setDescription('(BETA) Translate fields values using thesaurus') + ->addOption('report', null, InputOption::VALUE_REQUIRED, "Report output format (all|condensed|translated|record)", "all") ->addOption('dry', null, InputOption::VALUE_NONE, "list translations but don't apply.", null) + ->setHelp("--report:\n - all : list every term.\n - translated : list only translated terms.\n - record : list only selected record ids.\n - condensed : count terms occurences.") ; } @@ -59,6 +61,12 @@ class TranslateCommand extends phrCommand $style = new OutputFormatterStyle('black', 'yellow'); // , array('bold')); $output->getFormatter()->setStyle('warning', $style); + // sanitize args + if(!in_array($input->getOption('report'), ['all', 'condensed', 'translated', 'record'])) { + $output->writeln(sprintf("bad --report value (%s), use all|condensed|translated|record", $input->getOption('report'))); + return 1; + } + $this->input = $input; $this->output = $output; @@ -70,6 +78,7 @@ class TranslateCommand extends phrCommand $this->container['unicode'], $this->container['root.path'], $input->getOption('dry'), + $input->getOption('report'), $output ); }