Files
resourcespace/pages/tools/commons_import.php
2025-07-18 16:20:14 +07:00

166 lines
4.7 KiB
PHP

<?php
/*
Populate ResourceSpace with random images from Wikimedia Commmons to aid testing at scale.
*/
include_once dirname(__FILE__) . '/../../include/boot.php';
include_once dirname(__FILE__) . '/../../include/image_processing.php';
command_line_only();
setup_command_line_user();
// Settings
$resource_type = 1; // Adjust to your desired type
$title_field = 8; // Adjust to your 'Title' field ref
$image_size = 400; // Image size in pixels to request
// Disable output buffering
while (ob_get_level() > 0) {
ob_end_flush();
}
ob_implicit_flush(true);
function download_file_with_user_agent($url)
{
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Replace with your own project/contact info per Wikimedia policy
curl_setopt($ch, CURLOPT_USERAGENT, 'ResourceSpaceCommonsImport/1.0 (info@resourcespace.com)');
$data = curl_exec($ch);
$http_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
if ($http_code !== 200 || $data === false) {
echo " CURL error or non-200 response ($http_code) from $url\n";
return false;
}
curl_close($ch);
return $data;
}
// Get number of images to import from CLI argument
if ($argc < 2 || !is_numeric($argv[1]) || (int)$argv[1] < 1) {
echo "Usage: php commons_import.php <number_of_images>\n";
exit(1);
}
$number_to_import = (int)$argv[1];
echo "Importing $number_to_import images from Wikimedia Commons...\n";
// Function to fetch random image metadata from Wikimedia Commons
function get_random_commons_images($count)
{
global $image_size;
$url = 'https://commons.wikimedia.org/w/api.php?' . http_build_query([
'action' => 'query',
'generator' => 'random',
'grnnamespace' => 6, // File namespace
'grnlimit' => $count,
'prop' => 'imageinfo',
'iiprop' => 'url',
'iiurlwidth' => $image_size, // Request 400px wide image
'iiurlheight' => $image_size, // (optional) limit height too
'format' => 'json'
]);
$response = file_get_contents($url);
if ($response === false) {
die("Error: Failed to fetch from Wikimedia Commons API.\n");
}
$data = json_decode($response, true);
if (!isset($data['query']['pages'])) {
die("Error: No images found in API response.\n");
}
$images = [];
foreach ($data['query']['pages'] as $page) {
if (!isset($page['imageinfo'][0]['thumburl'])) {
continue; // No thumbnail, skip
}
$images[] = [
'title' => $page['title'],
'url' => $page['imageinfo'][0]['thumburl']
];
}
return $images;
}
$imported_count = 0;
$batch_size = 10;
while ($imported_count < $number_to_import) {
$remaining = $number_to_import - $imported_count;
$fetch_count = min($batch_size, $remaining);
$images = get_random_commons_images($fetch_count);
foreach ($images as $img) {
echo "Processing: {$img['title']}\n";
// Skip non-JPEGs based on file extension
$ext = strtolower(pathinfo(parse_url($img['url'], PHP_URL_PATH), PATHINFO_EXTENSION));
if (!in_array($ext, ['jpg', 'jpeg'])) {
echo " Skipping non-JPEG file (.$ext).\n";
continue;
}
// Download the image
$tmp_file = get_temp_dir() . '/' . uniqid('commons_') . '.jpg';
$image_data = download_file_with_user_agent($img['url']);
if ($image_data === false) {
echo " Failed to download image.\n";
continue;
}
file_put_contents($tmp_file, $image_data);
// Create new resource
$ref = create_resource($resource_type, 0);
if ($ref <= 0) {
echo " Failed to create resource.\n";
unlink($tmp_file);
continue;
}
echo " Uploading from $tmp_file\n";
// Upload file to resource
$result = upload_file($ref, true, false, false, $tmp_file, false);
if ($result !== false) {
// Set title
$clean_title = preg_replace('/^File:/', '', $img['title']);
$clean_title = preg_replace('/\.[^.]+$/', '', $clean_title);
update_field($ref, $title_field, $clean_title);
echo " Resource $ref created and image uploaded.\n";
$imported_count++;
echo " Imported so far: $imported_count / $number_to_import\n";
} else {
echo " Failed to upload image to resource $ref.\n";
delete_resource($ref);
}
if (file_exists($tmp_file)) {unlink($tmp_file);}
if ($imported_count >= $number_to_import) {
break; // Exit early if we hit the target mid-batch
}
}
}
echo "✅ Import complete: $imported_count JPEG images added.\n";