From 7aab6efeec9e7592978756a4feab3c5c4217e2ca Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Wed, 15 Oct 2025 11:24:30 +0100 Subject: [PATCH] Switch to requests.get for images --- .../images_and_vernaculars/get_wiki_images.py | 16 +++++++++------- tests/test_get_wiki_images.py | 14 ++++---------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/oz_tree_build/images_and_vernaculars/get_wiki_images.py b/oz_tree_build/images_and_vernaculars/get_wiki_images.py index a4c6dcd..065d4b4 100644 --- a/oz_tree_build/images_and_vernaculars/get_wiki_images.py +++ b/oz_tree_build/images_and_vernaculars/get_wiki_images.py @@ -38,7 +38,6 @@ import re import sys import time -import urllib.request from pathlib import Path import requests @@ -71,6 +70,9 @@ "img", ) +# See https://meta.wikimedia.org/wiki/User-Agent_policy +wiki_http_headers = {"User-Agent": "OneZoomBot/0.1 (https://www.onezoom.org/; " "mail@onezoom.org) get-wiki-images/0.1"} + # Copied from OZTree/OZprivate/ServerScripts/Utilities/getEOL_crops.py def subdir_name(doID): @@ -90,11 +92,6 @@ def make_http_request_with_retries(url): retrying if we get a 429 rate limit error. """ - # See https://meta.wikimedia.org/wiki/User-Agent_policy - wiki_http_headers = { - "User-Agent": "OneZoomBot/0.1 (https://www.onezoom.org/; " "mail@onezoom.org) get-wiki-images/0.1" - } - retries = 6 delay = 1 for i in range(retries): @@ -374,7 +371,12 @@ def save_wiki_image(db, leaf_data, image_name, src, src_id, rating, output_dir, # Download the uncropped image uncropped_image_path = f"{image_dir}/{src_id}_uncropped.jpg" - urllib.request.urlretrieve(image_url, uncropped_image_path) + try: + response = requests.get(image_url, headers=wiki_http_headers) + with open(uncropped_image_path, "wb") as f: + f.write(response.content) + except Exception as e: + logger.warning(f"Error downloading {image_url}: {e}") if cropper is None: # Default to centering the crop diff --git a/tests/test_get_wiki_images.py b/tests/test_get_wiki_images.py index 6b3f13f..b57504a 100644 --- a/tests/test_get_wiki_images.py +++ b/tests/test_get_wiki_images.py @@ -1,11 +1,10 @@ import logging import os -import shutil -import urllib.request from types import SimpleNamespace from unittest import mock import pytest +import requests from PIL import Image from oz_tree_build._OZglobals import src_flags @@ -57,7 +56,9 @@ def __init__(self, mock_qid): self.temp_image_path = "/tmp/mocked_urlretrieve_image.jpg" if not os.path.exists(self.temp_image_path): image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/7/73/Lion_waiting_in_Namibia.jpg/500px-Lion_waiting_in_Namibia.jpg" - urllib.request.urlretrieve(image_url, self.temp_image_path) + response = requests.get(image_url, headers=get_wiki_images.wiki_http_headers) + with open(self.temp_image_path, "wb") as f: + f.write(response.content) with open(self.temp_image_path, "rb") as f: self.temp_image_content = f.read() @@ -108,12 +109,6 @@ def mocked_requests_get(self, *args, **kwargs): return MockResponse(200, self.mocked_requests[args[0]], content) return MockResponse(404) - def mocked_urlretrieve(self, *args, **kwargs): - # Instead of actually downloading, just copy the test image to the destination - if not args[0].startswith("http"): - raise ValueError("Only HTTP URLs are supported in these tests") - shutil.copyfile(self.temp_image_path, args[1]) - # Mock the Azure Vision API smart crop response def mocked_analyze_from_url(self, *args, **kwargs): return SimpleNamespace( @@ -194,7 +189,6 @@ def wikidata_response(self, image_data, vernacular_data): def mock_patch_all_web_request_methods(self, f): @mock.patch("requests.get", side_effect=self.mocked_requests_get) - @mock.patch("urllib.request.urlretrieve", side_effect=self.mocked_urlretrieve) @mock.patch( "azure.ai.vision.imageanalysis.ImageAnalysisClient.analyze_from_url", side_effect=self.mocked_analyze_from_url,