Spaces:

Riksarkivet
/

iiif_downloader

Running

App Files Files Community

Gabriel committed on Sep 18

Commit

9f54891

verified ·

1 Parent(s): 5145c5c

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -23

app.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import gradio as gr
 import os
-import requests
 import shutil
 import tempfile
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional, List
@@ -11,17 +14,52 @@ from zipfile import ZipFile
 IIIF_URL = "https://lbiiif.riksarkivet.se"
 def get_image_ids(batch_id: str) -> List[str]:
-    """Fetch image IDs from IIIF manifest"""
     print(f"Fetching manifest for batch: {batch_id}")
     manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
     print(f"Manifest URL: {manifest_url}")
-    response = requests.get(manifest_url)
-    response.raise_for_status()
-    manifest = response.json()
     image_ids = []
     for item in manifest.get("items", []):
@@ -36,14 +74,13 @@ def get_image_ids(batch_id: str) -> List[str]:
     print(f"Found {len(image_ids)} images in batch {batch_id}")
     return image_ids
-def download_image(url: str, dest: str) -> bool:
-    """Download a single image"""
     try:
-        response = requests.get(url, stream=True)
-        response.raise_for_status()
         with open(dest, "wb") as f:
-            shutil.copyfileobj(response.raw, f)
         print(f"✓ Downloaded: {Path(dest).name}")
         return True
@@ -54,26 +91,29 @@ def download_image(url: str, dest: str) -> bool:
 def iiif_download_batch(
     batch_id: str,
     start_index: int = 1,
-    end_index: Optional[int] = None
 ) -> Optional[str]:
     """
-    Download images from an IIIF batch and return as zip.
     Args:
         batch_id: The batch/manifest ID to download
         start_index: Starting image number (1-based)
         end_index: Ending image number (inclusive). None = download all
     Returns:
         Path to zip file for download or None if failed
     """
-    print(f"\n=== Starting IIIF Download ===")
     print(f"Batch ID: {batch_id}")
     print(f"Start index: {start_index}")
     print(f"End index: {end_index}")
     try:
-        # Handle None or empty batch_id gracefully for example caching
         if not batch_id:
             print("Warning: No batch ID provided")
             return None
@@ -97,18 +137,18 @@ def iiif_download_batch(
         batch_dir.mkdir(exist_ok=True)
         print(f"Temp directory: {temp_dir}")
-        # Download images concurrently
         def download_single(image_id: str):
             url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
             dest = batch_dir / f"{image_id}.jpg"
-            success = download_image(url, str(dest))
             return image_id, success
         downloaded = []
         failed = []
-        print(f"Starting concurrent downloads (5 workers)...")
-        with ThreadPoolExecutor(max_workers=5) as executor:
             results = executor.map(download_single, image_ids)
             for image_id, success in results:
@@ -153,7 +193,7 @@ demo = gr.Interface(
     inputs=[
         gr.Textbox(
             label="Batch ID",
-            placeholder="Enter 8-digit batch ID (e.g., 12345678)",
             info="The IIIF manifest/batch identifier"
         ),
         gr.Number(
@@ -172,22 +212,24 @@ demo = gr.Interface(
         )
     ],
     outputs=gr.File(label="Download Zip"),
-    title="IIIF Batch Downloader",
-    description="Download images from Swedish National Archives IIIF manifests. Specify a range to download partial batches.",
     examples=[
         ["C0000263", 1, 10],     # Real batch - Download first 10
         ["C0000263", 11, 20],    # Real batch - Download images 11-20
         ["C0000263", 1, 50],     # Real batch - Download first 50
     ],
-    cache_examples=False,
     api_name="iiif_download_batch"
 )
 if __name__ == "__main__":
     print("Launching Gradio app with MCP server enabled...")
     demo.launch(
         mcp_server=True,
         share=False,
         debug=False,
-        ssr_mode=False
     )

 import gradio as gr
 import os
+import pycurl
+import io
+import json
 import shutil
 import tempfile
+import certifi
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from typing import Optional, List
 IIIF_URL = "https://lbiiif.riksarkivet.se"
+def perform_curl_request(url: str) -> bytes:
+    """Fetch url with pycurl and return the response body as bytes; raises Exception on a non-200 status."""
+    buffer = io.BytesIO()
+    c = pycurl.Curl()
+    try:
+        # Critical settings that fix the slow connection issue: certifi CA bundle, up to 5 redirects, 5 s connect / 10 s total timeout
+        c.setopt(c.URL, url)
+        c.setopt(c.WRITEDATA, buffer)
+        c.setopt(c.CAINFO, certifi.where())
+        c.setopt(c.FOLLOWLOCATION, 1)
+        c.setopt(c.MAXREDIRS, 5)
+        c.setopt(c.CONNECTTIMEOUT, 5)
+        c.setopt(c.TIMEOUT, 10)
+        c.setopt(c.NOSIGNAL, 1)  # presumably set because this is called from ThreadPoolExecutor worker threads
+        # These headers are crucial for the Swedish National Archives server
+        c.setopt(c.HTTPHEADER, [
+            'User-Agent: curl/8.7.1',
+            'Accept: */*',
+            'Connection: close'  # This prevents hanging connections!
+        ])
+        c.perform()
+        http_code = c.getinfo(c.RESPONSE_CODE)
+        if http_code != 200:
+            raise Exception(f"HTTP {http_code}")
+        return buffer.getvalue()
+    finally:
+        c.close()  # always release the curl handle, on success or on error
 def get_image_ids(batch_id: str) -> List[str]:
+    """Fetch image IDs from IIIF manifest using fast pycurl"""
     print(f"Fetching manifest for batch: {batch_id}")
     manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
     print(f"Manifest URL: {manifest_url}")
+    try:
+        response_data = perform_curl_request(manifest_url)
+        manifest = json.loads(response_data.decode('utf-8'))
+    except Exception as e:
+        raise ValueError(f"Failed to fetch manifest: {e}")
     image_ids = []
     for item in manifest.get("items", []):
     print(f"Found {len(image_ids)} images in batch {batch_id}")
     return image_ids
+def download_image_pycurl(url: str, dest: str) -> bool:
+    """Download a single image using pycurl for speed"""
     try:
+        image_data = perform_curl_request(url)
         with open(dest, "wb") as f:
+            f.write(image_data)
         print(f"✓ Downloaded: {Path(dest).name}")
         return True
 def iiif_download_batch(
     batch_id: str,
     start_index: int = 1,
+    end_index: Optional[int] = None,
+    max_workers: int = 10  # Increased from 5 for faster downloads
 ) -> Optional[str]:
     """
+    Download images from an IIIF batch using fast pycurl and return as zip.
     Args:
         batch_id: The batch/manifest ID to download
         start_index: Starting image number (1-based)
         end_index: Ending image number (inclusive). None = download all
+        max_workers: Number of concurrent downloads (default 10)
     Returns:
         Path to zip file for download or None if failed
     """
+    print(f"\n=== Starting Fast IIIF Download ===")
     print(f"Batch ID: {batch_id}")
     print(f"Start index: {start_index}")
     print(f"End index: {end_index}")
+    print(f"Workers: {max_workers}")
     try:
+        # Handle None or empty batch_id gracefully
         if not batch_id:
             print("Warning: No batch ID provided")
             return None
         batch_dir.mkdir(exist_ok=True)
         print(f"Temp directory: {temp_dir}")
+        # Download images concurrently with pycurl
         def download_single(image_id: str):
             url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
             dest = batch_dir / f"{image_id}.jpg"
+            success = download_image_pycurl(url, str(dest))
             return image_id, success
         downloaded = []
         failed = []
+        print(f"Starting concurrent downloads ({max_workers} workers)...")
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
             results = executor.map(download_single, image_ids)
             for image_id, success in results:
     inputs=[
         gr.Textbox(
             label="Batch ID",
+            placeholder="Enter 8-digit batch ID (e.g., C0000263)",
             info="The IIIF manifest/batch identifier"
         ),
         gr.Number(
         )
     ],
     outputs=gr.File(label="Download Zip"),
+    title="Fast IIIF Batch Downloader (PycURL)",
+    description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
     examples=[
         ["C0000263", 1, 10],     # Real batch - Download first 10
         ["C0000263", 11, 20],    # Real batch - Download images 11-20
         ["C0000263", 1, 50],     # Real batch - Download first 50
     ],
+    cache_examples=False,
     api_name="iiif_download_batch"
 )
 if __name__ == "__main__":
     print("Launching Gradio app with MCP server enabled...")
+    print("Note: Make sure pycurl and certifi are installed:")
+    print("  pip install pycurl certifi")
     demo.launch(
         mcp_server=True,
         share=False,
         debug=False,
+        ssr_mode=False
     )