File size: 7,740 Bytes
277cbc7
73c784b
9f54891
 
 
73c784b
277cbc7
9f54891
277cbc7
 
 
73c784b
741a077
d1d1d97
277cbc7
73c784b
9f54891
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277cbc7
9f54891
7a0f654
 
277cbc7
7a0f654
 
9f54891
 
 
 
 
277cbc7
c3b1fcc
277cbc7
 
c3b1fcc
 
 
 
277cbc7
a361cda
277cbc7
 
7a0f654
d1d1d97
73c784b
9f54891
 
277cbc7
9f54891
277cbc7
 
9f54891
277cbc7
7a0f654
277cbc7
 
7a0f654
277cbc7
73c784b
277cbc7
 
 
9f54891
 
277cbc7
73c784b
9f54891
277cbc7
 
 
 
 
9f54891
277cbc7
 
 
73c784b
9f54891
7a0f654
 
 
9f54891
277cbc7
7a0f654
9f54891
7a0f654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f54891
7a0f654
 
 
9f54891
7a0f654
 
 
 
 
9f54891
 
7a0f654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277cbc7
7a0f654
 
 
 
 
 
 
 
 
 
 
 
 
9f54891
7a0f654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f54891
 
7a0f654
4c2f5c0
 
 
277cbc7
9f54891
277cbc7
 
 
 
7a0f654
9f54891
 
277cbc7
 
 
7a0f654
9f54891
277cbc7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import gradio as gr
import os
import pycurl
import io
import json
import shutil
import tempfile
import certifi
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, List
from zipfile import ZipFile


# Base URL of the IIIF server. The manifest URL and the per-image URLs used by
# the functions in this file are built by appending "/arkis!<id>/..." to it.
IIIF_URL = "https://lbiiif.riksarkivet.se"

def perform_curl_request(url: str) -> bytes:
    """Fetch ``url`` with pycurl and return the raw response body.

    The transfer verifies TLS against the certifi CA bundle, follows at most
    five redirects, and uses a 5 second connect timeout and a 10 second
    overall timeout.

    Args:
        url: Address to request.

    Returns:
        The complete response body as bytes.

    Raises:
        Exception: ``"HTTP <code>"`` when the final response status is not 200.
        pycurl.error: Presumably raised by ``perform()`` on transport
            failures (timeout, DNS, TLS) -- confirm against pycurl docs.
    """
    body = io.BytesIO()
    curl = pycurl.Curl()

    # Per the original author's note, these headers are crucial for the
    # Swedish National Archives server; "Connection: close" in particular
    # is there to prevent hanging connections.
    request_headers = [
        'User-Agent: curl/8.7.1',
        'Accept: */*',
        'Connection: close',
    ]

    try:
        # Applied in this order; described by the original author as the
        # settings that fix the slow connection issue.
        options = (
            (curl.URL, url),
            (curl.WRITEDATA, body),
            (curl.CAINFO, certifi.where()),
            (curl.FOLLOWLOCATION, 1),
            (curl.MAXREDIRS, 5),
            (curl.CONNECTTIMEOUT, 5),
            (curl.TIMEOUT, 10),
            # NOTE(review): presumably set because this runs inside worker
            # threads, where libcurl must not rely on signals -- confirm.
            (curl.NOSIGNAL, 1),
            (curl.HTTPHEADER, request_headers),
        )
        for option, value in options:
            curl.setopt(option, value)

        curl.perform()

        status = curl.getinfo(curl.RESPONSE_CODE)
        if status == 200:
            return body.getvalue()
        raise Exception(f"HTTP {status}")
    finally:
        # Always release the curl handle, whether the transfer succeeded or not.
        curl.close()

def get_image_ids(batch_id: str) -> List[str]:
    """Fetch image IDs from the IIIF manifest of a batch.

    The manifest is requested from ``{IIIF_URL}/arkis!{batch_id}/manifest``
    and parsed as UTF-8 JSON. For every entry in its ``items`` list, the
    part of the entry's ``id`` after the first ``!`` is taken and truncated
    to 14 characters (presumably the fixed length of an image identifier --
    confirm against the archive's ID scheme).

    Args:
        batch_id: The batch/manifest ID to look up.

    Returns:
        Image IDs in manifest order. Entries without a usable string ``id``
        or without a ``!`` separator are skipped.

    Raises:
        ValueError: If the manifest cannot be fetched or parsed (the original
            error is chained as ``__cause__``), if it is not a JSON object,
            or if it yields no image IDs.
    """
    print(f"Fetching manifest for batch: {batch_id}")

    manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
    print(f"Manifest URL: {manifest_url}")

    try:
        response_data = perform_curl_request(manifest_url)
        manifest = json.loads(response_data.decode('utf-8'))
    except Exception as e:
        # Chain the cause so the underlying network/JSON error stays visible
        # in tracebacks instead of being flattened into the message only.
        raise ValueError(f"Failed to fetch manifest: {e}") from e

    if not isinstance(manifest, dict):
        # Valid JSON that is not an object (e.g. a list or string) has no "items".
        raise ValueError(f"Failed to fetch manifest: unexpected JSON type {type(manifest).__name__}")

    image_ids = []

    # `or []` also covers a manifest that carries an explicit `"items": null`.
    for item in manifest.get("items") or []:
        item_id = item.get("id") if isinstance(item, dict) else None
        if not isinstance(item_id, str):
            # Skip malformed entries rather than aborting the whole batch.
            continue

        id_parts = item_id.split("!")
        if len(id_parts) > 1:
            image_ids.append(id_parts[1][:14])

    if not image_ids:
        raise ValueError(f"No images found in manifest for batch {batch_id}")

    print(f"Found {len(image_ids)} images in batch {batch_id}")
    return image_ids

def download_image_pycurl(url: str, dest: str) -> bool:
    """Fetch one image and store it at ``dest``.

    The body is retrieved in full via ``perform_curl_request`` and then
    written to disk in binary mode. Any exception raised while fetching,
    writing or reporting is caught and reported on stdout; it is never
    propagated to the caller.

    Args:
        url: Address of the image to fetch.
        dest: File path the image bytes are written to.

    Returns:
        True when the image was fetched and written, False otherwise.
    """
    try:
        payload = perform_curl_request(url)
        Path(dest).write_bytes(payload)
        print(f"✓ Downloaded: {Path(dest).name}")
    except Exception as err:
        # Best-effort by design: a single failed image must not abort a batch.
        print(f"✗ Failed to download {Path(dest).name}: {err}")
        return False
    return True

def iiif_download_batch(
    batch_id: str,
    start_index: int = 1,
    end_index: Optional[int] = None,
    max_workers: int = 10  # Increased from 5 for faster downloads
) -> Optional[str]:
    """
    Download images from an IIIF batch using fast pycurl and return as zip.

    The selected images are downloaded concurrently into a fresh temporary
    directory and packed into a single zip file. The individual image files
    are removed once the zip has been written; if anything fails, the whole
    temporary directory is removed. All errors are reported on stdout and
    turned into a ``None`` return value instead of being raised.

    Args:
        batch_id: The batch/manifest ID to download (surrounding whitespace
            is ignored)
        start_index: Starting image number (1-based). None is treated as 1
        end_index: Ending image number (inclusive). None or 0 = download all
        max_workers: Number of concurrent downloads (default 10, minimum 1)

    Returns:
        Path to zip file for download or None if failed. On success the zip
        is the only file left in its temporary directory; removing it is up
        to the caller.
    """
    print(f"\n=== Starting Fast IIIF Download ===")
    print(f"Batch ID: {batch_id}")
    print(f"Start index: {start_index}")
    print(f"End index: {end_index}")
    print(f"Workers: {max_workers}")

    images_dirname = "images"
    temp_dir: Optional[Path] = None
    zip_ready = False

    try:
        # Handle None, empty or whitespace-only batch_id gracefully
        batch_id = str(batch_id).strip() if batch_id else ""
        if not batch_id:
            print("Warning: No batch ID provided")
            return None

        # UI/API callers may hand over None or floats; normalise once here so
        # the slicing and thread pool below always see plain ints.
        start_index = 1 if start_index is None else int(start_index)
        end_index = int(end_index) if end_index else None
        max_workers = max(1, int(max_workers))

        all_image_ids = get_image_ids(batch_id)
        total_images = len(all_image_ids)

        start_idx = max(0, start_index - 1)
        end_idx = min(end_index, total_images) if end_index else total_images
        # A negative end would otherwise act as a from-the-end slice bound.
        end_idx = max(0, end_idx)

        image_ids = all_image_ids[start_idx:end_idx]

        if not image_ids:
            raise ValueError(f"No images in specified range {start_index}-{end_index}")

        print(f"Downloading {len(image_ids)} images (range: {start_index}-{end_idx} of {total_images} total)")

        temp_dir = Path(tempfile.mkdtemp())
        # Fixed directory name: the user-supplied batch_id is never used as a
        # path component for the working directory.
        batch_dir = temp_dir / images_dirname
        batch_dir.mkdir(exist_ok=True)
        print(f"Temp directory: {temp_dir}")

        # Download images concurrently with pycurl
        def download_single(image_id: str):
            url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
            dest = batch_dir / f"{image_id}.jpg"
            success = download_image_pycurl(url, str(dest))
            return image_id, success

        downloaded = []
        failed = []

        print(f"Starting concurrent downloads ({max_workers} workers)...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # map() yields results in input order, so the zip keeps manifest order.
            for image_id, success in executor.map(download_single, image_ids):
                (downloaded if success else failed).append(image_id)

        if not downloaded:
            print("No images were successfully downloaded")
            return None

        # Create zip file. Anything other than letters, digits, "-", "_" and
        # "." in the batch ID is replaced so the name cannot contain path
        # separators.
        safe_batch_name = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in batch_id
        )
        range_suffix = f"_{start_index}-{end_idx}" if end_index else "_all"
        zip_path = temp_dir / f"{safe_batch_name}{range_suffix}.zip"

        print(f"Creating zip file: {zip_path.name}")
        with ZipFile(zip_path, 'w') as zipf:
            for image_id in downloaded:
                img_path = batch_dir / f"{image_id}.jpg"
                if img_path.exists():
                    zipf.write(img_path, arcname=f"{image_id}.jpg")
        zip_ready = True

        print(f"✓ Success! Downloaded {len(downloaded)}/{len(image_ids)} images")
        if failed:
            print(f"⚠ Failed downloads: {len(failed)} images")
        print(f"Zip file created: {zip_path}")
        print("=== Download Complete ===\n")

        return str(zip_path)

    except Exception as e:
        print(f"ERROR: {e}")
        print("=== Download Failed ===\n")
        return None

    finally:
        # Never leave downloaded images behind: keep only the finished zip on
        # success, and nothing at all when the run failed part-way.
        if temp_dir is not None:
            leftovers = temp_dir / images_dirname if zip_ready else temp_dir
            shutil.rmtree(leftovers, ignore_errors=True)


# Build the Gradio UI around iiif_download_batch
print("Creating Gradio interface...")

# Input widgets, listed in the positional order of iiif_download_batch's first
# three parameters; max_workers has no widget, so the function default applies.
# NOTE(review): the placeholder says "8-digit" although the example ID starts
# with a letter -- confirm the intended wording.
batch_id_input = gr.Textbox(
    label="Batch ID",
    placeholder="Enter 8-digit batch ID (e.g., C0000263)",
    info="The IIIF manifest/batch identifier"
)
start_image_input = gr.Number(
    label="Start Image",
    value=1,
    minimum=1,
    precision=0,
    info="First image to download (1 = first image)"
)
end_image_input = gr.Number(
    label="End Image (Optional)",
    value=None,
    minimum=1,
    precision=0,
    info="Last image to download (leave empty for all)"
)

# Example rows: [batch ID, start image, end image]
example_rows = [
    ["R0001210", 1, 10],     # Real batch - Download first 10
    ["R0001210", 11, 20],    # Real batch - Download images 11-20
    ["R0001210", 1, 50],     # Real batch - Download first 50
]

demo = gr.Interface(
    fn=iiif_download_batch,
    inputs=[batch_id_input, start_image_input, end_image_input],
    outputs=gr.File(label="Download Zip"),
    title="Fast IIIF Batch Downloader (PycURL)",
    description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
    examples=example_rows,
    cache_examples=False,
    api_name="iiif_download_batch"
)

if __name__ == "__main__":
    # Startup banner, including a reminder about the two packages the
    # download helpers rely on.
    for banner_line in (
        "Launching Gradio app with MCP server enabled...",
        "Note: Make sure pycurl and certifi are installed:",
        "  pip install pycurl certifi",
    ):
        print(banner_line)

    # Launch arguments are passed to demo.launch() exactly as listed here.
    launch_options = {
        "mcp_server": True,
        "share": False,
        "debug": False,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)