Spaces:

Riksarkivet
/

iiif_downloader

Running

File size: 7,740 Bytes

import gradio as gr
import os
import pycurl
import io
import json
import shutil
import tempfile
import certifi
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, List
from zipfile import ZipFile


IIIF_URL = "https://lbiiif.riksarkivet.se"

def perform_curl_request(url: str) -> bytes:
    """Fetch ``url`` with pycurl and return the raw response body.

    The transfer uses the certifi CA bundle, follows at most five redirects,
    and applies a connect timeout of 5 and a total timeout of 10 (libcurl
    timeout units).

    Args:
        url: Address to request.

    Returns:
        The response body as bytes.

    Raises:
        Exception: With the message ``HTTP <code>`` when the final response
            code is not 200. Errors raised by pycurl itself propagate as-is.
    """
    body = io.BytesIO()
    handle = pycurl.Curl()

    try:
        # Headers sent with every request. Per the original author's notes
        # they matter for this server, and "Connection: close" is what
        # prevents hanging connections.
        request_headers = [
            'User-Agent: curl/8.7.1',
            'Accept: */*',
            'Connection: close',
        ]

        # Built inside the try block so the handle is still closed if
        # evaluating any option value fails.
        transfer_options = (
            (pycurl.URL, url),
            (pycurl.WRITEDATA, body),
            (pycurl.CAINFO, certifi.where()),
            (pycurl.FOLLOWLOCATION, 1),
            (pycurl.MAXREDIRS, 5),
            (pycurl.CONNECTTIMEOUT, 5),
            (pycurl.TIMEOUT, 10),
            (pycurl.NOSIGNAL, 1),
            (pycurl.HTTPHEADER, request_headers),
        )
        for option, value in transfer_options:
            handle.setopt(option, value)

        handle.perform()

        status = handle.getinfo(pycurl.RESPONSE_CODE)
        if status == 200:
            return body.getvalue()
        raise Exception(f"HTTP {status}")
    finally:
        handle.close()

def get_image_ids(batch_id: str) -> List[str]:
    """Fetch the IIIF manifest for ``batch_id`` and return its image IDs.

    The manifest is requested from ``{IIIF_URL}/arkis!{batch_id}/manifest``.
    For every entry of the manifest's ``items`` list, the image ID is the
    first 14 characters of the text between the first and second ``!`` of
    the entry's ``id``. Entries without a string ``id`` or without a ``!``
    are skipped.

    Args:
        batch_id: The batch/manifest identifier.

    Returns:
        Image IDs in manifest order (never empty).

    Raises:
        ValueError: If the manifest cannot be fetched or parsed, is not a
            JSON object, or yields no image IDs. Fetch/parse failures keep
            the underlying exception as ``__cause__``.
    """
    print(f"Fetching manifest for batch: {batch_id}")

    manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
    print(f"Manifest URL: {manifest_url}")

    try:
        response_data = perform_curl_request(manifest_url)
        manifest = json.loads(response_data.decode('utf-8'))
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Failed to fetch manifest: {e}") from e

    if not isinstance(manifest, dict):
        # Valid JSON that is not an object (e.g. a list or string) has no
        # "items" to read; report it as a manifest failure, not a crash.
        raise ValueError(
            "Failed to fetch manifest: expected a JSON object, "
            f"got {type(manifest).__name__}"
        )

    image_ids = []

    # `or []` also covers an explicit `"items": null`.
    for item in manifest.get("items") or []:
        item_id = item.get("id") if isinstance(item, dict) else None
        if not isinstance(item_id, str):
            # Malformed entry: skip it instead of raising KeyError/TypeError.
            continue
        id_parts = item_id.split("!")
        if len(id_parts) > 1:
            image_ids.append(id_parts[1][:14])

    if not image_ids:
        raise ValueError(f"No images found in manifest for batch {batch_id}")

    print(f"Found {len(image_ids)} images in batch {batch_id}")
    return image_ids

def download_image_pycurl(url: str, dest: str) -> bool:
    """Fetch ``url`` and store the response body at ``dest``.

    This is a best-effort helper: any exception raised while fetching or
    writing is reported on stdout and turned into a ``False`` result rather
    than propagated.

    Args:
        url: Address of the image to fetch.
        dest: Path of the file to write (opened in binary write mode).

    Returns:
        True when the file was written, False on any failure.
    """
    try:
        payload = perform_curl_request(url)

        with open(dest, mode="wb") as out_file:
            out_file.write(payload)

        print(f"✓ Downloaded: {Path(dest).name}")
    except Exception as err:
        print(f"✗ Failed to download {Path(dest).name}: {err}")
        return False
    return True

def _resolve_image_range(start_index, end_index, total_images):
    """Turn 1-based inclusive bounds into a clamped 0-based half-open slice."""
    # UI number fields can deliver floats, or None when the field is cleared.
    first = int(start_index) if start_index else 1
    start_idx = max(0, first - 1)
    last = int(end_index) if end_index else total_images
    # Clamp at both ends so a negative bound cannot slice from the end.
    end_idx = max(0, min(last, total_images))
    return start_idx, end_idx


def iiif_download_batch(
    batch_id: str,
    start_index: int = 1,
    end_index: Optional[int] = None,
    max_workers: int = 10  # Increased from 5 for faster downloads
) -> Optional[str]:
    """
    Download images from an IIIF batch using fast pycurl and return as zip.

    Images are fetched concurrently into a fresh temporary directory and the
    successful ones are packed into a zip file. The intermediate image files
    are removed once the zip is written, and the whole temporary directory
    is removed when the download fails, so only the returned zip is left on
    disk. All errors are reported on stdout and result in ``None``.

    Args:
        batch_id: The batch/manifest ID to download
        start_index: Starting image number (1-based)
        end_index: Ending image number (inclusive). None = download all
        max_workers: Number of concurrent downloads (default 10)

    Returns:
        Path to zip file for download or None if failed
    """
    print("\n=== Starting Fast IIIF Download ===")
    print(f"Batch ID: {batch_id}")
    print(f"Start index: {start_index}")
    print(f"End index: {end_index}")
    print(f"Workers: {max_workers}")

    temp_dir = None  # set once created, so the error path knows what to clean

    try:
        # Handle None, empty or whitespace-only batch_id gracefully
        if isinstance(batch_id, str):
            batch_id = batch_id.strip()
        if not batch_id:
            print("Warning: No batch ID provided")
            return None

        all_image_ids = get_image_ids(batch_id)
        total_images = len(all_image_ids)

        start_idx, end_idx = _resolve_image_range(
            start_index, end_index, total_images
        )
        image_ids = all_image_ids[start_idx:end_idx]

        if not image_ids:
            raise ValueError(f"No images in specified range {start_index}-{end_index}")

        print(f"Downloading {len(image_ids)} images (range: {start_idx + 1}-{end_idx} of {total_images} total)")

        temp_dir = Path(tempfile.mkdtemp())
        # Fixed sub-directory name: batch_id is user input, and joining an
        # absolute or "../" component onto temp_dir would escape it.
        batch_dir = temp_dir / "images"
        batch_dir.mkdir(exist_ok=True)
        print(f"Temp directory: {temp_dir}")

        # Download images concurrently with pycurl
        def download_single(image_id: str):
            url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
            dest = batch_dir / f"{image_id}.jpg"
            success = download_image_pycurl(url, str(dest))
            return image_id, success

        downloaded = []
        failed = []

        print(f"Starting concurrent downloads ({max_workers} workers)...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # map() yields results in input order, so the zip keeps
            # manifest order.
            for image_id, success in executor.map(download_single, image_ids):
                if success:
                    downloaded.append(image_id)
                else:
                    failed.append(image_id)

        if not downloaded:
            print("No images were successfully downloaded")
            shutil.rmtree(temp_dir, ignore_errors=True)
            return None

        # Create zip file; only filename-safe characters of the batch ID are
        # kept so the archive always lands inside temp_dir.
        safe_id = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in str(batch_id)
        )
        range_suffix = f"_{start_idx + 1}-{end_idx}" if end_index else "_all"
        zip_path = temp_dir / f"{safe_id}{range_suffix}.zip"

        print(f"Creating zip file: {zip_path.name}")
        with ZipFile(zip_path, 'w') as zipf:
            for image_id in downloaded:
                img_path = batch_dir / f"{image_id}.jpg"
                if img_path.exists():
                    zipf.write(img_path, arcname=f"{image_id}.jpg")

        # The raw images are now inside the archive; drop the duplicates.
        # The zip itself must outlive this call because its path is returned.
        shutil.rmtree(batch_dir, ignore_errors=True)

        print(f"✓ Success! Downloaded {len(downloaded)}/{len(image_ids)} images")
        if failed:
            print(f"⚠ Failed downloads: {len(failed)} images")
        print(f"Zip file created: {zip_path}")
        print("=== Download Complete ===\n")

        return str(zip_path)

    except Exception as e:
        # Top-level boundary for the UI: report the error and return None.
        print(f"ERROR: {e}")
        print("=== Download Failed ===\n")
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None


# Build the Gradio UI: three inputs that map positionally onto
# iiif_download_batch(batch_id, start_index, end_index) and one file output.
print("Creating Gradio interface...")

_batch_id_input = gr.Textbox(
    label="Batch ID",
    placeholder="Enter 8-digit batch ID (e.g., C0000263)",
    info="The IIIF manifest/batch identifier",
)

_start_image_input = gr.Number(
    label="Start Image",
    value=1,
    minimum=1,
    precision=0,
    info="First image to download (1 = first image)",
)

# No default value: an empty field means "download through the last image".
_end_image_input = gr.Number(
    label="End Image (Optional)",
    value=None,
    minimum=1,
    precision=0,
    info="Last image to download (leave empty for all)",
)

_zip_output = gr.File(label="Download Zip")

# Each example row is [batch ID, start image, end image].
_example_rows = [
    ["R0001210", 1, 10],
    ["R0001210", 11, 20],
    ["R0001210", 1, 50],
]

demo = gr.Interface(
    fn=iiif_download_batch,
    inputs=[_batch_id_input, _start_image_input, _end_image_input],
    outputs=_zip_output,
    title="Fast IIIF Batch Downloader (PycURL)",
    description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
    examples=_example_rows,
    cache_examples=False,
    api_name="iiif_download_batch",
)

if __name__ == "__main__":
    # Startup banner, one line per print call.
    for banner_line in (
        "Launching Gradio app with MCP server enabled...",
        "Note: Make sure pycurl and certifi are installed:",
        "  pip install pycurl certifi",
    ):
        print(banner_line)

    # Launch settings gathered in one place and passed as keyword arguments.
    launch_options = {
        "mcp_server": True,
        "share": False,
        "debug": False,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)