#!/usr/bin/env python3
import argparse
import os
import re
import time
from pathlib import Path
from urllib.parse import unquote, urlparse
import requests
# Endpoint of the MediaWiki Action API on Wikimedia Commons.
API_URL = "https://commons.wikimedia.org/w/api.php"

# Value sent in the User-Agent header of every HTTP request this script makes.
USER_AGENT = "CommonsCategoryDownloader/1.0 (contact: your-email@example.com)"

# File suffixes (lower-case, leading dot included) that count as images.
_IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".svg",
    ".webp",
    ".tif",
    ".tiff",
    ".bmp",
}

# File suffixes (lower-case, leading dot included) that count as audio.
_AUDIO_EXTENSIONS = {
    ".ogg",
    ".oga",
    ".mp3",
    ".wav",
    ".flac",
    ".mid",
    ".midi",
    ".opus",
}

# Only files whose URL suffix is in this set are downloaded.
MEDIA_EXTENSIONS = _IMAGE_EXTENSIONS | _AUDIO_EXTENSIONS
def safe_name(name: str) -> str:
    """Turn a Commons page title or file name into a safe path component.

    Steps, in order:
      1. Drop one leading ``Category:`` or ``File:`` namespace prefix.
      2. Replace characters that are illegal in file names on common
         platforms (``<>:"/\\|?*`` and ASCII control characters) with ``_``.
      3. Collapse runs of whitespace to a single space and trim the ends.
      4. Limit the result to 180 characters, keeping a short alphanumeric
         file extension intact when there is one.

    Args:
        name: Page title or file name to sanitize.

    Returns:
        The sanitized name, or ``"unnamed"`` when nothing usable is left
        (empty input, or a name made only of dots such as ``..``).
    """
    max_len = 180

    # Only a *leading* namespace prefix is markup; the same text later in the
    # title is part of the real name and must be kept.
    for prefix in ("Category:", "File:"):
        if name.startswith(prefix):
            name = name[len(prefix):]
            break

    name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name)
    name = re.sub(r"\s+", " ", name).strip()

    # "." and ".." would resolve to the current/parent directory when joined
    # onto a path, so they are never returned as a component.
    if not name.strip("."):
        return "unnamed"

    if len(name) > max_len:
        stem, dot, ext = name.rpartition(".")
        if dot and stem and len(ext) <= 10 and ext.isalnum():
            # Shorten the stem rather than the tail so the extension survives.
            name = stem[:max_len - len(ext) - 1].rstrip() + "." + ext
        else:
            # Re-trim: the cut may land right after a space.
            name = name[:max_len].rstrip()

    return name or "unnamed"
def api_get(params):
    """Send one GET request to the Commons API and return the decoded JSON.

    Args:
        params: Mapping of API query parameters. They are merged over the
            defaults ``format=json`` and ``formatversion=2`` and win on
            conflict.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        RuntimeError: If the response body contains an API-level ``error``
            entry. The API can report a failed request this way even though
            the HTTP status is 200, so ``raise_for_status`` alone misses it.
    """
    query = {"format": "json", "formatversion": "2", **params}
    response = requests.get(
        API_URL,
        params=query,
        headers={"User-Agent": USER_AGENT},
        timeout=60,
    )
    response.raise_for_status()
    data = response.json()

    error = data.get("error") if isinstance(data, dict) else None
    if error:
        if isinstance(error, dict):
            code = error.get("code", "unknown")
            info = error.get("info", "")
        else:
            code, info = "unknown", error
        raise RuntimeError(f"Commons API error ({code}): {info}")

    return data
def iter_category_members(category_title):
    """
    Generate every member of a Commons category, one result page at a time.

    Only namespaces 6 (File) and 14 (Category) are requested. Each yielded
    item is an entry of the response's ``query.categorymembers`` list. When a
    response carries a ``continue`` entry, its contents are added to the next
    request; iteration ends with the first response that has none.
    """
    request = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmnamespace": "6|14",
        "cmlimit": "500",
    }
    continuation = {}
    while True:
        # Continuation values go last so they override the base request.
        page = api_get({**request, **continuation})
        yield from page.get("query", {}).get("categorymembers", [])
        if "continue" in page:
            continuation = page["continue"]
        else:
            return
def get_file_url(file_title):
    """Query the ``imageinfo`` of one file page.

    Returns a ``(url, mime, mediatype)`` tuple taken from the first imageinfo
    entry of the first page in the response; an element is ``None`` when the
    entry lacks that key. Returns ``None`` when the response has no pages or
    the first page has no imageinfo.
    """
    response = api_get({
        "action": "query",
        "titles": file_title,
        "prop": "imageinfo",
        "iiprop": "url|mime|mediatype|size",
    })
    pages = response.get("query", {}).get("pages", [])
    entries = pages[0].get("imageinfo", []) if pages else []
    if not entries:
        return None
    first = entries[0]
    return first.get("url"), first.get("mime"), first.get("mediatype")
def filename_from_url(url, fallback_title):
    """Derive a sanitized local file name for a download.

    The name is the percent-decoded last segment of the URL path. If that
    segment is empty, ``fallback_title`` with ``File:`` removed is used
    instead. Either way the result is passed through ``safe_name``.
    """
    last_segment = Path(urlparse(url).path).name
    decoded = unquote(last_segment)
    if decoded:
        return safe_name(decoded)
    return safe_name(fallback_title.replace("File:", ""))
def download_file(url, destination):
    """Stream ``url`` to ``destination`` unless a non-empty copy already exists.

    The body is written to a sibling ``<name>.part`` file first and moved
    over ``destination`` only after the transfer finishes, so an interrupted
    run never leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        destination: ``pathlib.Path`` of the final file. Missing parent
            directories are created.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        Any other error from the request or from writing the file is
        re-raised after the partial file has been removed.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    if destination.exists() and destination.stat().st_size > 0:
        print(f"Skip existing: {destination}")
        return

    tmp = destination.with_name(destination.name + ".part")
    try:
        with requests.get(
            url,
            stream=True,
            headers={"User-Agent": USER_AGENT},
            timeout=120,
        ) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # replace() overwrites an existing target on every platform, whereas
        # rename() raises on Windows. The target can exist here as a
        # zero-byte file, which the check above lets through.
        tmp.replace(destination)
    except BaseException:
        # Do not leave a half-written .part file behind on failure or Ctrl-C.
        try:
            if tmp.exists():
                tmp.unlink()
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise
    print(f"Downloaded: {destination}")
def _save_category_file(title, folder, delay):
    """Resolve one file member and download it into ``folder`` if it is image/audio."""
    result = get_file_url(title)
    if not result:
        print(f"No file URL: {title}")
        return
    url, mime, mediatype = result
    # The decision is made on the suffix of the URL path, not on the MIME type.
    suffix = Path(unquote(urlparse(url).path)).suffix.lower()
    if suffix not in MEDIA_EXTENSIONS:
        print(f"Skip non-image/audio: {title} ({mime}, {mediatype})")
        return
    target_name = filename_from_url(url, title)
    download_file(url, folder / target_name)
    if delay:
        time.sleep(delay)


def download_category(category_title, output_dir, visited, delay):
    """Download a category's image/audio files, then recurse into subcategories.

    Files go into ``output_dir / safe_name(category_title)``; each subcategory
    is handled by a recursive call. ``visited`` is a set of already-processed
    category titles; it is updated in place and a category found in it is
    skipped. ``delay`` is the pause in seconds after each ``download_file``
    call (no pause when falsy). An error while handling a single file is
    printed and does not stop the run.
    """
    qualified = category_title
    if not qualified.startswith("Category:"):
        qualified = "Category:" + qualified
    if qualified in visited:
        return
    visited.add(qualified)

    folder = output_dir / safe_name(qualified)
    folder.mkdir(parents=True, exist_ok=True)
    print(f"\nCategory: {qualified}")

    for member in iter_category_members(qualified):
        title, namespace = member["title"], member["ns"]
        if namespace == 6:  # file
            try:
                _save_category_file(title, folder, delay)
            except Exception as exc:
                print(f"Error downloading {title}: {exc}")
        elif namespace == 14:  # subcategory
            # The recursive call appends safe_name(title) itself, so it is
            # given the parent of the intended subfolder.
            nested = folder / safe_name(title)
            download_category(title, nested.parent, visited, delay)
def main():
    """Parse the command line and download the requested category tree."""
    cli = argparse.ArgumentParser(
        description="Download all image/audio media from a Wikimedia Commons category recursively."
    )
    # (flags, keyword options) for each argument, registered in this order.
    argument_specs = (
        (
            ("category",),
            {"help": 'Commons category name, e.g. "Bird sounds" or "Category:Bird sounds"'},
        ),
        (
            ("-o", "--output"),
            {"default": "commons_downloads", "help": "Output directory"},
        ),
        (
            ("--delay",),
            {"type": float, "default": 0.2, "help": "Delay between downloads in seconds"},
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    opts = cli.parse_args()

    target_root = Path(opts.output).resolve()
    target_root.mkdir(parents=True, exist_ok=True)
    download_category(opts.category, target_root, visited=set(), delay=opts.delay)
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()