Gabriel committed on
Commit
9f54891
·
verified ·
1 Parent(s): 5145c5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -23
app.py CHANGED
@@ -1,8 +1,11 @@
1
  import gradio as gr
2
  import os
3
- import requests
 
 
4
  import shutil
5
  import tempfile
 
6
  from concurrent.futures import ThreadPoolExecutor
7
  from pathlib import Path
8
  from typing import Optional, List
@@ -11,17 +14,52 @@ from zipfile import ZipFile
11
 
12
  IIIF_URL = "https://lbiiif.riksarkivet.se"
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def get_image_ids(batch_id: str) -> List[str]:
15
- """Fetch image IDs from IIIF manifest"""
16
  print(f"Fetching manifest for batch: {batch_id}")
17
 
18
  manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
19
  print(f"Manifest URL: {manifest_url}")
20
 
21
- response = requests.get(manifest_url)
22
- response.raise_for_status()
 
 
 
23
 
24
- manifest = response.json()
25
  image_ids = []
26
 
27
  for item in manifest.get("items", []):
@@ -36,14 +74,13 @@ def get_image_ids(batch_id: str) -> List[str]:
36
  print(f"Found {len(image_ids)} images in batch {batch_id}")
37
  return image_ids
38
 
39
- def download_image(url: str, dest: str) -> bool:
40
- """Download a single image"""
41
  try:
42
- response = requests.get(url, stream=True)
43
- response.raise_for_status()
44
 
45
  with open(dest, "wb") as f:
46
- shutil.copyfileobj(response.raw, f)
47
 
48
  print(f"✓ Downloaded: {Path(dest).name}")
49
  return True
@@ -54,26 +91,29 @@ def download_image(url: str, dest: str) -> bool:
54
  def iiif_download_batch(
55
  batch_id: str,
56
  start_index: int = 1,
57
- end_index: Optional[int] = None
 
58
  ) -> Optional[str]:
59
  """
60
- Download images from an IIIF batch and return as zip.
61
 
62
  Args:
63
  batch_id: The batch/manifest ID to download
64
  start_index: Starting image number (1-based)
65
  end_index: Ending image number (inclusive). None = download all
 
66
 
67
  Returns:
68
  Path to zip file for download or None if failed
69
  """
70
- print(f"\n=== Starting IIIF Download ===")
71
  print(f"Batch ID: {batch_id}")
72
  print(f"Start index: {start_index}")
73
  print(f"End index: {end_index}")
 
74
 
75
  try:
76
- # Handle None or empty batch_id gracefully for example caching
77
  if not batch_id:
78
  print("Warning: No batch ID provided")
79
  return None
@@ -97,18 +137,18 @@ def iiif_download_batch(
97
  batch_dir.mkdir(exist_ok=True)
98
  print(f"Temp directory: {temp_dir}")
99
 
100
- # Download images concurrently
101
  def download_single(image_id: str):
102
  url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
103
  dest = batch_dir / f"{image_id}.jpg"
104
- success = download_image(url, str(dest))
105
  return image_id, success
106
 
107
  downloaded = []
108
  failed = []
109
 
110
- print(f"Starting concurrent downloads (5 workers)...")
111
- with ThreadPoolExecutor(max_workers=5) as executor:
112
  results = executor.map(download_single, image_ids)
113
 
114
  for image_id, success in results:
@@ -153,7 +193,7 @@ demo = gr.Interface(
153
  inputs=[
154
  gr.Textbox(
155
  label="Batch ID",
156
- placeholder="Enter 8-digit batch ID (e.g., 12345678)",
157
  info="The IIIF manifest/batch identifier"
158
  ),
159
  gr.Number(
@@ -172,22 +212,24 @@ demo = gr.Interface(
172
  )
173
  ],
174
  outputs=gr.File(label="Download Zip"),
175
- title="IIIF Batch Downloader",
176
- description="Download images from Swedish National Archives IIIF manifests. Specify a range to download partial batches.",
177
  examples=[
178
  ["C0000263", 1, 10], # Real batch - Download first 10
179
  ["C0000263", 11, 20], # Real batch - Download images 11-20
180
  ["C0000263", 1, 50], # Real batch - Download first 50
181
  ],
182
- cache_examples=False,
183
  api_name="iiif_download_batch"
184
  )
185
 
186
  if __name__ == "__main__":
187
  print("Launching Gradio app with MCP server enabled...")
 
 
188
  demo.launch(
189
  mcp_server=True,
190
  share=False,
191
  debug=False,
192
- ssr_mode=False
193
  )
 
1
  import gradio as gr
2
  import os
3
+ import pycurl
4
+ import io
5
+ import json
6
  import shutil
7
  import tempfile
8
+ import certifi
9
  from concurrent.futures import ThreadPoolExecutor
10
  from pathlib import Path
11
  from typing import Optional, List
 
14
 
15
  IIIF_URL = "https://lbiiif.riksarkivet.se"
16
 
17
def perform_curl_request(
    url: str,
    *,
    connect_timeout: int = 5,
    timeout: int = 10,
) -> bytes:
    """Fetch *url* with an HTTP GET through pycurl and return the raw body.

    The whole response is buffered in memory before being returned, so the
    caller receives the complete payload as a single ``bytes`` object.

    Args:
        url: Absolute URL to request.
        connect_timeout: Maximum number of seconds allowed for establishing
            the connection (libcurl ``CONNECTTIMEOUT``).
        timeout: Maximum number of seconds allowed for the entire transfer
            (libcurl ``TIMEOUT``). Raise this for large downloads.

    Returns:
        The response body.

    Raises:
        RuntimeError: If the final HTTP status code (after redirects) is
            anything other than 200.
        pycurl.error: Propagated unchanged when the transfer itself fails
            (for example on a connection error or timeout).
    """
    buffer = io.BytesIO()
    curl = pycurl.Curl()

    try:
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEDATA, buffer)
        # Verify TLS against certifi's CA bundle rather than relying on
        # whatever bundle the local libcurl build was configured with.
        curl.setopt(pycurl.CAINFO, certifi.where())
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.CONNECTTIMEOUT, connect_timeout)
        curl.setopt(pycurl.TIMEOUT, timeout)
        # This helper is called from worker threads; NOSIGNAL stops libcurl
        # from using signals for timeout handling, which is unsafe there.
        curl.setopt(pycurl.NOSIGNAL, 1)

        # Header set chosen by the original author for the Swedish National
        # Archives IIIF server; "Connection: close" was added to avoid
        # connections hanging after the response.
        curl.setopt(pycurl.HTTPHEADER, [
            'User-Agent: curl/8.7.1',
            'Accept: */*',
            'Connection: close',
        ])

        curl.perform()

        http_code = curl.getinfo(pycurl.RESPONSE_CODE)
        if http_code != 200:
            # Include the URL so failures are attributable when many
            # requests run concurrently.
            raise RuntimeError(f"HTTP {http_code} for {url}")

        return buffer.getvalue()
    finally:
        # Always release the libcurl handle, including on error paths.
        curl.close()
49
+
50
  def get_image_ids(batch_id: str) -> List[str]:
51
+ """Fetch image IDs from IIIF manifest using fast pycurl"""
52
  print(f"Fetching manifest for batch: {batch_id}")
53
 
54
  manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
55
  print(f"Manifest URL: {manifest_url}")
56
 
57
+ try:
58
+ response_data = perform_curl_request(manifest_url)
59
+ manifest = json.loads(response_data.decode('utf-8'))
60
+ except Exception as e:
61
+ raise ValueError(f"Failed to fetch manifest: {e}")
62
 
 
63
  image_ids = []
64
 
65
  for item in manifest.get("items", []):
 
74
  print(f"Found {len(image_ids)} images in batch {batch_id}")
75
  return image_ids
76
 
77
+ def download_image_pycurl(url: str, dest: str) -> bool:
78
+ """Download a single image using pycurl for speed"""
79
  try:
80
+ image_data = perform_curl_request(url)
 
81
 
82
  with open(dest, "wb") as f:
83
+ f.write(image_data)
84
 
85
  print(f"✓ Downloaded: {Path(dest).name}")
86
  return True
 
91
  def iiif_download_batch(
92
  batch_id: str,
93
  start_index: int = 1,
94
+ end_index: Optional[int] = None,
95
+ max_workers: int = 10 # Increased from 5 for faster downloads
96
  ) -> Optional[str]:
97
  """
98
+ Download images from an IIIF batch using fast pycurl and return as zip.
99
 
100
  Args:
101
  batch_id: The batch/manifest ID to download
102
  start_index: Starting image number (1-based)
103
  end_index: Ending image number (inclusive). None = download all
104
+ max_workers: Number of concurrent downloads (default 10)
105
 
106
  Returns:
107
  Path to zip file for download or None if failed
108
  """
109
+ print(f"\n=== Starting Fast IIIF Download ===")
110
  print(f"Batch ID: {batch_id}")
111
  print(f"Start index: {start_index}")
112
  print(f"End index: {end_index}")
113
+ print(f"Workers: {max_workers}")
114
 
115
  try:
116
+ # Handle None or empty batch_id gracefully
117
  if not batch_id:
118
  print("Warning: No batch ID provided")
119
  return None
 
137
  batch_dir.mkdir(exist_ok=True)
138
  print(f"Temp directory: {temp_dir}")
139
 
140
+ # Download images concurrently with pycurl
141
  def download_single(image_id: str):
142
  url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
143
  dest = batch_dir / f"{image_id}.jpg"
144
+ success = download_image_pycurl(url, str(dest))
145
  return image_id, success
146
 
147
  downloaded = []
148
  failed = []
149
 
150
+ print(f"Starting concurrent downloads ({max_workers} workers)...")
151
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
152
  results = executor.map(download_single, image_ids)
153
 
154
  for image_id, success in results:
 
193
  inputs=[
194
  gr.Textbox(
195
  label="Batch ID",
196
+ placeholder="Enter 8-digit batch ID (e.g., C0000263)",
197
  info="The IIIF manifest/batch identifier"
198
  ),
199
  gr.Number(
 
212
  )
213
  ],
214
  outputs=gr.File(label="Download Zip"),
215
+ title="Fast IIIF Batch Downloader (PycURL)",
216
+ description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
217
  examples=[
218
  ["C0000263", 1, 10], # Real batch - Download first 10
219
  ["C0000263", 11, 20], # Real batch - Download images 11-20
220
  ["C0000263", 1, 50], # Real batch - Download first 50
221
  ],
222
+ cache_examples=False,
223
  api_name="iiif_download_batch"
224
  )
225
 
226
if __name__ == "__main__":
    # Startup banner: announce the launch and remind the operator which
    # third-party packages this script depends on.
    startup_messages = (
        "Launching Gradio app with MCP server enabled...",
        "Note: Make sure pycurl and certifi are installed:",
        " pip install pycurl certifi",
    )
    for message in startup_messages:
        print(message)

    # Launch settings gathered in one place; values match the launch call
    # exactly (MCP server on, no public share link, no debug, SSR disabled).
    launch_options = {
        "mcp_server": True,
        "share": False,
        "debug": False,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)