akhaliq HF Staff committed on
Commit
d4dd180
·
1 Parent(s): 931474d
Files changed (1) hide show
  1. backend_parsers.py +14 -85
backend_parsers.py CHANGED
@@ -18,68 +18,11 @@ def parse_transformers_js_output(code: str) -> Dict[str, str]:
18
  print(f"[Parser] Received code length: {len(code)} characters")
19
  print(f"[Parser] First 200 chars: {code[:200]}")
20
 
21
- code_stripped = code.strip()
22
-
23
- # Check if code starts with HTML instead of markers (common LLM mistake)
24
- if code_stripped.startswith('<!DOCTYPE') or code_stripped.startswith('<html'):
25
- print("[Parser] WARNING: Code starts with HTML instead of === index.html === marker")
26
- print("[Parser] Attempting to extract files from malformed output...")
27
-
28
- # Try to split by === markers that do exist
29
- if '=== index.js ===' in code and '=== style.css ===' in code:
30
- # Extract HTML as everything before === index.js ===
31
- html_end = code.find('=== index.js ===')
32
- html_content = code[:html_end].strip()
33
-
34
- # Extract JS between === index.js === and === style.css ===
35
- js_start = code.find('=== index.js ===') + len('=== index.js ===')
36
- js_end = code.find('=== style.css ===')
37
- js_content = code[js_start:js_end].strip()
38
-
39
- # Extract CSS after === style.css ===
40
- css_start = code.find('=== style.css ===') + len('=== style.css ===')
41
- css_content = code[css_start:].strip()
42
-
43
- print(f"[Parser] Recovered HTML: {len(html_content)} chars")
44
- print(f"[Parser] Recovered JS: {len(js_content)} chars")
45
- print(f"[Parser] Recovered CSS: {len(css_content)} chars")
46
-
47
- files = {
48
- 'index.html': html_content,
49
- 'index.js': js_content,
50
- 'style.css': css_content
51
- }
52
-
53
- # Normalize imports and return early since we've already parsed everything
54
- cdn_url = "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]"
55
- for file_key in ['index.html', 'index.js']:
56
- if files[file_key]:
57
- content = files[file_key]
58
- content = re.sub(
59
- r"from\s+['\"]https://cdn.jsdelivr.net/npm/@huggingface/transformers@[^'\"]+['\"]",
60
- f"from '{cdn_url}'",
61
- content
62
- )
63
- content = re.sub(
64
- r"from\s+['\"]https://cdn.jsdelivr.net/npm/@xenova/transformers@[^'\"]+['\"]",
65
- f"from '{cdn_url}'",
66
- content
67
- )
68
- files[file_key] = content
69
-
70
- return files
71
- else:
72
- files = {
73
- 'index.html': '',
74
- 'index.js': '',
75
- 'style.css': ''
76
- }
77
- else:
78
- files = {
79
- 'index.html': '',
80
- 'index.js': '',
81
- 'style.css': ''
82
- }
83
 
84
  # Multiple patterns to match the three code blocks with different variations
85
  html_patterns = [
@@ -122,45 +65,31 @@ def parse_transformers_js_output(code: str) -> Dict[str, str]:
122
 
123
  # Fallback: support === index.html === format if any file is missing
124
  if not (files['index.html'] and files['index.js'] and files['style.css']):
125
- # Use regex to extract sections - match === markers with optional whitespace and newlines
126
- # Fixed lookahead to allow any whitespace (not just \n) before next === marker
127
- # Also support alternative names: styles.css or style.css, app.js or index.js
128
- html_fallback = re.search(r'===\s*index\.html\s*===\s*[\r\n]*([\s\S]+?)(?=\s*===|$)', code, re.IGNORECASE)
129
 
130
  # Try both index.js and app.js
131
- js_fallback = re.search(r'===\s*(?:index\.js|app\.js)\s*===\s*[\r\n]*([\s\S]+?)(?=\s*===|$)', code, re.IGNORECASE)
132
 
133
  # Try both style.css and styles.css
134
- css_fallback = re.search(r'===\s*(?:style\.css|styles\.css)\s*===\s*[\r\n]*([\s\S]+?)$', code, re.IGNORECASE)
135
 
136
  print(f"[Parser] Fallback extraction - HTML found: {bool(html_fallback)}, JS found: {bool(js_fallback)}, CSS found: {bool(css_fallback)}")
137
 
138
  if html_fallback:
139
- content = html_fallback.group(1).strip()
140
- # Remove code block markers if present
141
- content = re.sub(r'^```\w*\s*[\r\n]+', '', content)
142
- content = re.sub(r'[\r\n]+```\s*$', '', content)
143
- files['index.html'] = content.strip()
144
  if js_fallback:
145
- content = js_fallback.group(1).strip()
146
- # Remove code block markers if present
147
- content = re.sub(r'^```\w*\s*[\r\n]+', '', content)
148
- content = re.sub(r'[\r\n]+```\s*$', '', content)
149
- files['index.js'] = content.strip()
150
  if css_fallback:
151
- content = css_fallback.group(1).strip()
152
- # Remove code block markers if present
153
- content = re.sub(r'^```\w*\s*[\r\n]+', '', content)
154
- content = re.sub(r'[\r\n]+```\s*$', '', content)
155
- files['style.css'] = content.strip()
156
 
157
  # Additional fallback: extract from numbered sections or file headers
158
  if not (files['index.html'] and files['index.js'] and files['style.css']):
159
  # Try patterns like "1. index.html:" or "**index.html**"
160
  patterns = [
161
  (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.html(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.html'),
162
- (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.js(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.js'),
163
- (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)style\.css(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'style.css')
164
  ]
165
 
166
  for pattern, file_key in patterns:
 
18
  print(f"[Parser] Received code length: {len(code)} characters")
19
  print(f"[Parser] First 200 chars: {code[:200]}")
20
 
21
+ files = {
22
+ 'index.html': '',
23
+ 'index.js': '',
24
+ 'style.css': ''
25
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # Multiple patterns to match the three code blocks with different variations
28
  html_patterns = [
 
65
 
66
  # Fallback: support === index.html === format if any file is missing
67
  if not (files['index.html'] and files['index.js'] and files['style.css']):
68
+ # Use regex to extract sections - support alternative filenames
69
+ html_fallback = re.search(r'===\s*index\.html\s*===\s*\n([\s\S]+?)(?=\n===|$)', code, re.IGNORECASE)
 
 
70
 
71
  # Try both index.js and app.js
72
+ js_fallback = re.search(r'===\s*(?:index\.js|app\.js)\s*===\s*\n([\s\S]+?)(?=\n===|$)', code, re.IGNORECASE)
73
 
74
  # Try both style.css and styles.css
75
+ css_fallback = re.search(r'===\s*(?:style\.css|styles\.css)\s*===\s*\n([\s\S]+?)(?=\n===|$)', code, re.IGNORECASE)
76
 
77
  print(f"[Parser] Fallback extraction - HTML found: {bool(html_fallback)}, JS found: {bool(js_fallback)}, CSS found: {bool(css_fallback)}")
78
 
79
  if html_fallback:
80
+ files['index.html'] = html_fallback.group(1).strip()
 
 
 
 
81
  if js_fallback:
82
+ files['index.js'] = js_fallback.group(1).strip()
 
 
 
 
83
  if css_fallback:
84
+ files['style.css'] = css_fallback.group(1).strip()
 
 
 
 
85
 
86
  # Additional fallback: extract from numbered sections or file headers
87
  if not (files['index.html'] and files['index.js'] and files['style.css']):
88
  # Try patterns like "1. index.html:" or "**index.html**"
89
  patterns = [
90
  (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)index\.html(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.html'),
91
+ (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)(?:index\.js|app\.js)(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'index.js'),
92
+ (r'(?:^\d+\.\s*|^##\s*|^\*\*\s*)(?:style\.css|styles\.css)(?:\s*:|\*\*:?)\s*\n([\s\S]+?)(?=\n(?:\d+\.|##|\*\*|===)|$)', 'style.css')
93
  ]
94
 
95
  for pattern, file_key in patterns: