This started because I wanted to download all 290 images from the Dr. No IMDb page: https://www.imdb.com/title/tt0055928/mediaindex/
How to use this script:
- Install Python from https://www.python.org/downloads/
- Open PowerShell and run
pip install selenium webdriver-manager requests beautifulsoup4 python-slugify
- Save the script below as imdb.py
- In PowerShell, change to the directory where you saved the script. For example:
cd "C:\Users\tms\Downloads\scripts"
- Run
python imdb.py "https://www.imdb.com/title/tt0055928/"
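Optionally, before the first full run, you can sanity-check that Selenium and webdriver-manager can drive headless Chrome. This is just a minimal sketch (it assumes Chrome is installed and uses the same calls as the script below); it should print the title of the Dr. No page:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

opts = Options()
opts.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
driver.get("https://www.imdb.com/title/tt0055928/")
print(driver.title)  # if this prints the movie title, the setup is working
driver.quit()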
Python:
#!/usr/bin/env python3
import os, re, sys, time, json, mimetypes, requests, urllib.parse
from bs4 import BeautifulSoup
from slugify import slugify
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}

# ------------------ helpers ------------------
def maximize_amazon_image_url(url: str) -> str:
    """Strip the resize/crop directives that follow '_V1_' so the CDN returns the original-size image."""
    if not url:
        return url
    return re.sub(r"_V1_[^./]*(_)?\.", r"_V1_.", url)

def _meta(sp: BeautifulSoup, name: str, attr: str = "property"):
    """Return the content of a matching <meta> tag, or None."""
    tag = sp.find("meta", attrs={attr: name})
    return tag.get("content") if tag and tag.get("content") else None

def get_fullsize_from_media_viewer(media_url: str) -> str | None:
    """Return largest image URL from a mediaviewer page, using several strategies."""
    try:
        rp = requests.get(media_url, headers=HEADERS, timeout=25)
        rp.raise_for_status()
        sp = BeautifulSoup(rp.text, "html.parser")
        # 1) Big <img>
        img = sp.find(lambda tag: tag.name == "img" and (
            tag.get("data-testid") in {"media-viewer-image"} or
            "ipc-image" in (tag.get("class") or []) or
            tag.get("id") in {"primary-img", "media-viewer-image"}
        ))
        if img:
            if img.get("srcset"):
                last = img["srcset"].split(",")[-1].strip().split(" ")[0]
                return maximize_amazon_image_url(last)
            if img.get("src"):
                return maximize_amazon_image_url(img["src"])
        # 2) Meta fallbacks
        for name, attr in [("og:image:secure_url", "property"), ("og:image", "property"), ("twitter:image", "name")]:
            u = _meta(sp, name, attr)
            if u:
                return maximize_amazon_image_url(u)
        # 3) JSON-LD
        for s in sp.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(s.string or "{}")
                items = [data] if isinstance(data, dict) else (data if isinstance(data, list) else [])
                for d in items:
                    imgval = d.get("image")
                    if isinstance(imgval, str) and imgval:
                        return maximize_amazon_image_url(imgval)
                    if isinstance(imgval, list) and imgval:
                        return maximize_amazon_image_url(imgval[-1])
            except Exception:
                pass
        # 4) Preload hints
        link = sp.find("link", attrs={"as": "image", "href": True})
        if link:
            return maximize_amazon_image_url(link["href"])
        return None
    except Exception as e:
        print(f" [!] Mediaviewer parse failed: {e}")
        return None

def guess_ext_from_response(resp, fallback_url):
    # Strip any "; charset=..." parameters before guessing an extension from the Content-Type.
    ct = (resp.headers.get("Content-Type") or "").split(";")[0].strip()
    ext = mimetypes.guess_extension(ct)
    if not ext:
        ext = mimetypes.guess_extension(mimetypes.guess_type(fallback_url)[0] or "")
    return ext or ".jpg"

def parse_title_bits(html_title: str):
    # Episode pages have titles like '"Show" Episode (...)'; otherwise slugify the whole <title>.
    try:
        show, ep_title = re.search(r'"(.*)" ([^(]+)', html_title).groups()
        return slugify(show), slugify(ep_title.strip())
    except Exception:
        return None, slugify(html_title)

def extract_ttid(url: str) -> str:
    m = re.search(r"(tt\d+)", url)
    return m.group(1) if m else "ttXXXXXXX"

# ---------- Selenium helpers ----------
def collect_mediaviewer_links_on_current_page(driver, pause=1.0, idle_rounds=4) -> set[str]:
    """Scroll to bottom, letting the page lazy-load, and collect all /mediaviewer/ links on THIS page."""
    seen = set()
    idle = 0
    while True:
        new = 0
        anchors = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/mediaviewer/"]')
        for a in anchors:
            try:
                href = a.get_attribute("href")
                if href and "/mediaviewer/" in href:
                    href = href.split("?")[0]
                    if href not in seen:
                        seen.add(href); new += 1
            except StaleElementReferenceException:
                continue
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        idle = idle + 1 if new == 0 else 0
        if idle >= idle_rounds:
            break
    return seen

def get_mediaindex_next_url(page_source_html: str, current_url: str) -> str | None:
    """Find the 'Next' pagination link on /mediaindex pages."""
    sp = BeautifulSoup(page_source_html, "html.parser")
    # Try explicit 'Next' text or aria labels
    for a in sp.find_all("a", href=True):
        txt = (a.get_text() or "").strip().lower()
        aria = (a.get("aria-label") or "").strip().lower()
        href = a["href"]
        if ("next" in txt or "next" in aria) and "mediaindex" in href:
            return urllib.parse.urljoin(current_url, href.split("#")[0])
    # As a fallback, pick the highest-numbered ?page= link as a heuristic
    candidates = []
    for a in sp.find_all("a", href=True):
        if "mediaindex" in a["href"]:
            m = re.search(r"[?&]page=(\d+)", a["href"])
            if m:
                candidates.append((int(m.group(1)), urllib.parse.urljoin(current_url, a["href"])))
    if candidates:
        candidates.sort()
        return candidates[-1][1]  # URL of the largest page number
    return None

# ------------------ main ------------------
def main():
    if len(sys.argv) < 2:
        print("Usage: python imdb.py <IMDb title/episode URL>")
        sys.exit(1)
    base_url = sys.argv[1].strip()

    # naming + output folder
    ttid = extract_ttid(base_url)
    r = requests.get(base_url, headers=HEADERS, timeout=25)
    r.raise_for_status()
    title_html = BeautifulSoup(r.text, "html.parser").find("title").text
    show_slug, simple_title_slug = parse_title_bits(title_html)
    base_slug = show_slug or simple_title_slug or ttid
    outdir = f"{base_slug}-{ttid}"
    os.makedirs(outdir, exist_ok=True)
    print(f"Output folder: {outdir}")

    # Launch headless Chrome
    opts = Options()
    opts.add_argument("--headless=new"); opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox"); opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1200,2500")
    opts.add_argument(f"--user-agent={HEADERS['User-Agent']}")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

    links = set()
    try:
        # Try modern /media
        media_url = base_url.rstrip("/") + "/media"
        print(f"Opening media grid: {media_url}")
        driver.get(media_url)
        time.sleep(2)
        links |= collect_mediaviewer_links_on_current_page(driver)

        # If /media was empty, or to be safe for classic titles, walk /mediaindex pagination
        mediaindex_url = base_url.rstrip("/") + "/mediaindex/"
        print(f"Opening classic index (with pagination): {mediaindex_url}")
        next_url = mediaindex_url
        visited_pages = set()
        page_no = 0
        while next_url and next_url not in visited_pages:
            page_no += 1
            visited_pages.add(next_url)
            print(f" -> Page {page_no}: {next_url}")
            driver.get(next_url)
            time.sleep(1.5)
            # scroll to trigger lazy load of full 100 per page
            page_links = collect_mediaviewer_links_on_current_page(driver, pause=1.1, idle_rounds=4)
            print(f" collected {len(page_links)} links on this page")
            links |= page_links
            # find explicit Next link (or the highest-numbered page as a fallback)
            next_url = get_mediaindex_next_url(driver.page_source, next_url)
    finally:
        driver.quit()

    print(f"Found {len(links)} media viewer links total.")

    # Download full-size images
    total = 0
    for idx, mv_url in enumerate(sorted(links), start=1):
        print(f"[{idx}/{len(links)}] {mv_url}")
        full = get_fullsize_from_media_viewer(mv_url)
        if not full:
            print(" No full-size URL found, skipping."); continue
        try:
            dl_headers = dict(HEADERS); dl_headers["Referer"] = mv_url
            resp = requests.get(full, headers=dl_headers, timeout=45)
            if resp.status_code != 200:
                print(f" HTTP {resp.status_code}, skipping."); continue
            ext = guess_ext_from_response(resp, full)
            fname = os.path.join(outdir, f"{base_slug}-{idx}{ext}")
            if not os.path.exists(fname):
                with open(fname, "wb") as f:
                    f.write(resp.content)
                total += 1
            time.sleep(0.25)
        except Exception as e:
            print(f" Error downloading: {e}")

    print(f"\nDone. Downloaded {total} images into '{outdir}'.")

if __name__ == "__main__":
    main()
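If you are curious how the full-resolution trick works: maximize_amazon_image_url strips the resize/crop directives that IMDb's image CDN encodes after "_V1_" in the filename, so the request comes back at original size. A quick illustration from a Python shell in the script's folder (the image ID here is made up):

from imdb import maximize_amazon_image_url
thumb = "https://m.media-amazon.com/images/M/MV5Bexample._V1_UX1000_CR0,0,1000,563_AL_.jpg"  # hypothetical URL
print(maximize_amazon_image_url(thumb))
# -> https://m.media-amazon.com/images/M/MV5Bexample._V1_.jpg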
That's it!
The script will create a folder with all the images inside.
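One last note: re-running the script with the same URL will not overwrite anything, because it checks os.path.exists before writing each file. So if a run gets interrupted, you can simply start it again:

python imdb.py "https://www.imdb.com/title/tt0055928/"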