# Hugging Face Space page banner captured with the source (status: Sleeping).
import base64
import json
import os
import re
from io import BytesIO

import requests
from fastapi import FastAPI, File, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image
# FastAPI application instance for this service.
app = FastAPI(title="GLM-4.1V-9B-Thinking")

# Enable CORS for frontend interaction (Gradio/Spaces UI): every origin,
# method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Chat-completions endpoint that extraction requests are posted to.
API_URL = "https://router.huggingface.co/v1/chat/completions"

# Headers sent with every request to API_URL. The bearer token is read from
# the `access_token` environment variable when this module is imported, so a
# missing variable raises KeyError at import time.
HEADERS = {
    "Authorization": f"Bearer {os.environ['access_token']}",
    "Content-Type": "application/json",
}
# Instruction text sent to the vision model together with the uploaded image.
# It asks for nothing but a JSON array of {"item", "price"} objects; the
# `extract` handler searches the model's reply for exactly such an array.
PROMPT = """
You are an AI assistant. Extract item names and their prices from the following image.
Your task is to extract item names and their corresponding prices from the image provided.
Return ONLY a clean JSON array in this format:
[
{"item": "<item_name>", "price": "<price>"},
...
]
⚠️ Guidelines:
- Do not include any explanation or text before/after the JSON.
- Include only entries that have both item and price.
- Preserve original spellings and formatting from the image.
- If prices are written in ₹, Rs., or INR, keep the symbol as is.
- Handle both packaged labels (like chips or snacks) and printed/handwritten menus.
- If there are duplicates or unclear text, skip them.
Only return the final JSON output, No explanation.
Make sure each entry has both item and price, and preserve the original spelling.
"""
def resize_image(image: Image.Image, max_size=(1024, 1024)) -> Image.Image:
    """Shrink *image* so it fits within *max_size* and return it.

    ``Image.thumbnail`` resizes in place, so the object returned is the very
    same one that was passed in; the caller's image is modified.

    Args:
        image: Pillow image to shrink (modified in place).
        max_size: ``(width, height)`` upper bound passed to ``thumbnail``.

    Returns:
        The same ``image`` object after the in-place resize.
    """
    image.thumbnail(max_size)
    return image
async def encode_image_to_data_url(file: UploadFile = File(...)) -> str:
    """Read an uploaded image, downscale it, and return it as a base64 data URL.

    The upload is decoded with Pillow, shrunk with ``resize_image``, re-encoded
    in its original format at ``quality=80`` and embedded in a
    ``data:<mime>;base64,<payload>`` string.

    Args:
        file: Uploaded image file.

    Returns:
        The data URL string for the re-encoded image.

    Raises:
        Nothing is caught here: errors raised while reading, decoding or
        saving the image propagate to the caller.
    """
    raw_bytes = await file.read()

    # The context manager releases Pillow's handle on the image once the
    # re-encoded bytes have been produced.
    with Image.open(BytesIO(raw_bytes)) as image:
        # The bytes written below are in the decoded format, so remember it
        # and derive the MIME type from it further down.
        image_format = image.format

        # Preprocessing
        resized = resize_image(image)

        # Compress and convert to bytes
        buffered = BytesIO()
        resized.save(buffered, quality=80, format=image_format)

    # getvalue() returns the whole buffer regardless of the stream position.
    image_bytes = buffered.getvalue()

    # Encode to base64
    base64_image = base64.b64encode(image_bytes).decode("utf-8")

    # Prefer the MIME type of the format actually written; the client-supplied
    # content type may be missing or may not match the image data.
    mime_type = (
        Image.MIME.get(image_format)
        or file.content_type
        or "application/octet-stream"
    )
    return f"data:{mime_type};base64,{base64_image}"
# NOTE(review): no route decorator is visible on this function; confirm how
# it is registered with `app`.
def root():
    """Return a static message stating that the API is running."""
    return {"message": "GLM 4.1V API for menu extraction is running."}
# NOTE(review): no route decorator is visible on this handler; confirm how
# it is registered with `app`.
async def extract(file: UploadFile = File(...)):
    """Extract item names and prices from an uploaded image via the vision model.

    The upload is converted to a data URL, posted with ``PROMPT`` to
    ``API_URL``, and the first JSON array of objects found in the model's
    reply is parsed and returned.

    Args:
        file: Uploaded image file.

    Returns:
        A ``JSONResponse``:

        * 200 ``{"menu_items": [...]}`` when an array was found and parsed.
        * 400 ``{"error": "..."}`` when encoding the image, calling the
          upstream API, or reading its reply raised an exception.
        * 404 ``{"error": "...", "model_response": "..."}`` when the reply
          contains no JSON array.
        * 500 ``{"error": "...", "raw": "..."}`` when the matched text is not
          valid JSON.
    """
    try:
        # Convert uploaded image to base64 URL format
        image_data_url = await encode_image_to_data_url(file)

        # Create chat-style payload
        payload = {
            "model": "zai-org/GLM-4.1V-9B-Thinking:novita",
            # Alternative model ids (currently disabled):
            #   meta-llama/Llama-3.2-11B-Vision-Instruct:together
            #   meta-llama/Llama-4-Scout-17B-16E-Instruct:novita
            #   llama3.2-vision:11b
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": PROMPT},
                        {"type": "image_url", "image_url": {"url": image_data_url}},
                    ],
                }
            ],
        }

        # `requests` is synchronous: run it in a worker thread so the event
        # loop keeps serving other requests, and bound the wait with a
        # (connect, read) timeout so a stalled upstream cannot hang forever.
        response = await run_in_threadpool(
            requests.post,
            API_URL,
            headers=HEADERS,
            json=payload,
            timeout=(10, 120),
        )

        # Report upstream failures explicitly instead of letting them surface
        # as an opaque KeyError on "choices" below.
        if not response.ok:
            raise RuntimeError(
                f"Upstream API returned HTTP {response.status_code}: "
                f"{response.text[:500]}"
            )

        result = response.json()
        print("result :", result)
        reply = result["choices"][0]["message"]["content"]

        # A null/non-text content would otherwise crash the regex search
        # outside this try block.
        if not isinstance(reply, str):
            raise TypeError("Model reply contained no text content")
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

    # Locate the first "[ {...} ... ]" span; DOTALL lets it cross newlines.
    match = re.search(r"\[\s*{.*?}\s*\]", reply, re.DOTALL)
    if match is None:
        return JSONResponse(
            status_code=404,
            content={"error": "No JSON array found in response", "model_response": reply},
        )

    json_str = match.group(0)
    try:
        items = json.loads(json_str)
    except json.JSONDecodeError:
        return JSONResponse(
            status_code=500,
            content={"error": "Failed to parse JSON", "raw": json_str},
        )
    return JSONResponse(content={"menu_items": items})