Download all images from an IMDB page (in the highest quality)

This idea started because I wanted to download all 290 images from the Dr. No IMDB page: https://www.imdb.com/title/tt0055928/mediaindex/

How to use this script:

  1. Install Python from https://www.python.org/downloads/
  2. Open PowerShell and run pip install selenium webdriver-manager requests beautifulsoup4 python-slugify (a quick import check is sketched just below these steps)
  3. Save the script below as imdb.py
  4. In PowerShell, navigate to the directory where you saved the script. For example: cd "C:\Users\tms\Downloads\scripts"
  5. Run python imdb.py "https://www.imdb.com/title/tt0055928/"
EDIT: Make sure you have Chrome installed.
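
If you want to confirm that the packages from step 2 are actually available before running the script, a tiny check like this (just an optional sketch, not part of the script) will fail on whatever is missing:

Python:
# Optional sanity check for the dependencies installed in step 2.
# If any of these imports fails, re-run the pip install command above.
import requests, bs4, slugify, selenium, webdriver_manager
print("All dependencies import fine.")

The full script from step 3 follows below.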

Python:
#!/usr/bin/env python3
import os, re, sys, time, json, mimetypes, requests, urllib.parse
from bs4 import BeautifulSoup
from slugify import slugify

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
}

# ------------------ helpers ------------------
def maximize_amazon_image_url(url: str) -> str:
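    # Strip the resize/crop suffix that IMDB thumbnails carry so the CDN serves the
    # original file. Illustrative example (the exact size token is an assumption,
    # but thumbnails follow the _V1_<size>_.<ext> pattern):
    #   ..._V1_QL75_UX190_CR0,0,190,281_.jpg  ->  ..._V1_.jpg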
    if not url:
        return url
    return re.sub(r"_V1_[^./]*(_)?\.", r"_V1_.", url)

def _meta(sp: BeautifulSoup, name: str, attr: str = "property"):
    tag = sp.find("meta", attrs={attr: name})
    return tag.get("content") if tag and tag.get("content") else None

def get_fullsize_from_media_viewer(media_url: str) -> str | None:
    """Return largest image URL from a mediaviewer page, using several strategies."""
    try:
        rp = requests.get(media_url, headers=HEADERS, timeout=25)
        rp.raise_for_status()
        sp = BeautifulSoup(rp.text, "html.parser")

        # 1) Big <img>
        img = sp.find(lambda tag: tag.name == "img" and (
            tag.get("data-testid") in {"media-viewer-image"} or
            "ipc-image" in (tag.get("class") or []) or
            tag.get("id") in {"primary-img", "media-viewer-image"}
        ))
        if img:
            if img.get("srcset"):
                last = img["srcset"].split(",")[-1].strip().split(" ")[0]
                return maximize_amazon_image_url(last)
            if img.get("src"):
                return maximize_amazon_image_url(img["src"])

        # 2) Meta fallbacks
        for name, attr in [("og:image:secure_url","property"),("og:image","property"),("twitter:image","name")]:
            u = _meta(sp, name, attr)
            if u:
                return maximize_amazon_image_url(u)

        # 3) JSON-LD
        for s in sp.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(s.string or "{}")
                items = [data] if isinstance(data, dict) else (data if isinstance(data, list) else [])
                for d in items:
                    imgval = d.get("image")
                    if isinstance(imgval, str) and imgval:
                        return maximize_amazon_image_url(imgval)
                    if isinstance(imgval, list) and imgval:
                        return maximize_amazon_image_url(imgval[-1])
            except Exception:
                pass

        # 4) Preload hints
        link = sp.find("link", attrs={"as":"image","href":True})
        if link:
            return maximize_amazon_image_url(link["href"])
        return None
    except Exception as e:
        print(f"  [!] Mediaviewer parse failed: {e}")
        return None

def guess_ext_from_response(resp, fallback_url):
    ct = resp.headers.get("Content-Type") or ""
    ext = mimetypes.guess_extension(ct)
    if not ext:
        ext = mimetypes.guess_extension(mimetypes.guess_type(fallback_url)[0] or "")
    return ext or ".jpg"
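# e.g. a "Content-Type: image/png" response maps to ".png"; anything unrecognised
# (by header or by URL) falls back to ".jpg"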

def parse_title_bits(html_title: str):
    try:
        show, ep_title = re.search(r'"(.*)" ([^(]+)', html_title).groups()
        return slugify(show), slugify(ep_title.strip())
    except Exception:
        return None, slugify(html_title)
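# Illustrative behaviour, assuming IMDB's usual <title> formats:
#   '"Some Show" Episode Name (TV Episode 2006) - IMDb'  ->  ("some-show", "episode-name")
#   'Dr. No (1962) - IMDb'                               ->  (None, "dr-no-1962-imdb")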

def extract_ttid(url: str) -> str:
    m = re.search(r"(tt\d+)", url)
    return m.group(1) if m else "ttXXXXXXX"
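# e.g. extract_ttid("https://www.imdb.com/title/tt0055928/") -> "tt0055928"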

# ---------- Selenium helpers ----------
def collect_mediaviewer_links_on_current_page(driver, pause=1.0, idle_rounds=4) -> set[str]:
    """Scroll to bottom, letting the page lazy-load, and collect all /mediaviewer/ links on THIS page."""
    seen = set()
    idle = 0
    while True:
        new = 0
        anchors = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/mediaviewer/"]')
        for a in anchors:
            try:
                href = a.get_attribute("href")
                if href and "/mediaviewer/" in href:
                    href = href.split("?")[0]
                    if href not in seen:
                        seen.add(href); new += 1
            except StaleElementReferenceException:
                continue
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        idle = idle + 1 if new == 0 else 0
        if idle >= idle_rounds:
            break
    return seen

def get_mediaindex_next_url(page_source_html: str, current_url: str) -> str | None:
    """Find the 'Next' pagination link on /mediaindex pages."""
    sp = BeautifulSoup(page_source_html, "html.parser")
    # Try explicit 'Next' text or aria labels
    for a in sp.find_all("a", href=True):
        txt = (a.get_text() or "").strip().lower()
        aria = (a.get("aria-label") or "").strip().lower()
        href = a["href"]
        if ("next" in txt or "next" in aria) and "mediaindex" in href:
            return urllib.parse.urljoin(current_url, href.split("#")[0])
    # As a fallback, pick the lowest page number that is higher than the current page
    m = re.search(r"[?&]page=(\d+)", current_url)
    current_page = int(m.group(1)) if m else 1
    candidates = []
    for a in sp.find_all("a", href=True):
        if "mediaindex" in a["href"]:
            m = re.search(r"[?&]page=(\d+)", a["href"])
            if m and int(m.group(1)) > current_page:
                candidates.append((int(m.group(1)), urllib.parse.urljoin(current_url, a["href"])))
    if candidates:
        candidates.sort()
        return candidates[0][1]  # URL of the next page up
    return None

# ------------------ main ------------------
def main():
    if len(sys.argv) < 2:
        print("Usage: python imdb_smart_fullres.py <IMDB title/episode URL>")
        sys.exit(1)
    base_url = sys.argv[1].strip()

    # naming + output folder
    ttid = extract_ttid(base_url)
    r = requests.get(base_url, headers=HEADERS, timeout=25)
    r.raise_for_status()
    title_html = BeautifulSoup(r.text, "html.parser").find("title").text
    show_slug, simple_title_slug = parse_title_bits(title_html)
    base_slug = show_slug or simple_title_slug or ttid
    outdir = f"{base_slug}-{ttid}"
    os.makedirs(outdir, exist_ok=True)
    print(f"Output folder: {outdir}")

    # Launch headless Chrome
    opts = Options()
    opts.add_argument("--headless=new"); opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox"); opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--window-size=1200,2500")
    opts.add_argument(f"--user-agent={HEADERS['User-Agent']}")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

    links = set()
    try:
        # Try modern /media
        media_url = base_url.rstrip("/") + "/media"
        print(f"Opening media grid: {media_url}")
        driver.get(media_url)
        time.sleep(2)
        links |= collect_mediaviewer_links_on_current_page(driver)

        # If /media was empty, or to be safe for classic titles, walk /mediaindex pagination
        mediaindex_url = base_url.rstrip("/") + "/mediaindex/"
        print(f"Opening classic index (with pagination): {mediaindex_url}")
        next_url = mediaindex_url
        visited_pages = set()
        page_no = 0
        while next_url and next_url not in visited_pages:
            page_no += 1
            visited_pages.add(next_url)
            print(f"  -> Page {page_no}: {next_url}")
            driver.get(next_url)
            time.sleep(1.5)
            # scroll to trigger lazy loading of every thumbnail on this page
            page_links = collect_mediaviewer_links_on_current_page(driver, pause=1.1, idle_rounds=4)
            print(f"     collected {len(page_links)} links on this page")
            links |= page_links
            # find the explicit Next link (or the next page number as a fallback)
            next_url = get_mediaindex_next_url(driver.page_source, next_url)
    finally:
        driver.quit()

    print(f"Found {len(links)} media viewer links total.")

    # Download full-size images
    total = 0
    for idx, mv_url in enumerate(sorted(links), start=1):
        print(f"[{idx}/{len(links)}] {mv_url}")
        full = get_fullsize_from_media_viewer(mv_url)
        if not full:
            print("  No full-size URL found, skipping."); continue
        try:
            dl_headers = dict(HEADERS); dl_headers["Referer"] = mv_url
            resp = requests.get(full, headers=dl_headers, timeout=45)
            if resp.status_code != 200:
                print(f"  HTTP {resp.status_code}, skipping."); continue
            ext = guess_ext_from_response(resp, full)
            fname = os.path.join(outdir, f"{base_slug}-{idx}{ext}")
            if not os.path.exists(fname):
                with open(fname, "wb") as f:
                    f.write(resp.content)
                total += 1
            time.sleep(0.25)
        except Exception as e:
            print(f"  Error downloading: {e}")

    print(f"\nDone. Downloaded {total} images into '{outdir}'.")
 
if __name__ == "__main__":
    main()

That's it!
The script creates a folder named after the title (plus the IMDB ID) and downloads all the images into it.
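
If you want to double-check the count afterwards (e.g. the 290 Dr. No images), listing the output folder is enough. The folder name below is only a placeholder; use whatever the script printed after "Output folder:".

Python:
# Count the files the script downloaded (folder name is a placeholder, adjust it).
import os
outdir = "dr-no-1962-imdb-tt0055928"
print(len(os.listdir(outdir)), "files in", outdir)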
