Spaces:

Kakashi-hatake
/

GLM-4.1V-9B-Thinking

Sleeping

56d2f3b verified 4 months ago

4.22 kB

	import asyncio
	import base64
	import functools
	import json
	import os
	import re
	from io import BytesIO

	import requests
	from fastapi import FastAPI, UploadFile, File
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware
	from PIL import Image

	# Application object that the route decorators in this file register on.
	app = FastAPI(title="GLM-4.1V-9B-Thinking")

	# CORS is left wide open (any origin, method and header) so that a browser
	# front end served from another host, e.g. a Gradio/Spaces UI, can call the API.
	app.add_middleware(
	    CORSMiddleware,
	    allow_headers=["*"],
	    allow_methods=["*"],
	    allow_origins=["*"],
	)


	# Chat-completions endpoint that the /extract/ handler POSTs its payload to.
	API_URL = "https://router.huggingface.co/v1/chat/completions"
	# Headers sent with every upstream request. The bearer token is read from the
	# `access_token` environment variable when this module is imported, so a
	# missing variable raises KeyError at start-up rather than on first request.
	HEADERS = {
	"Authorization": f"Bearer {os.environ['access_token']}",
	"Content-Type": "application/json"
	}

	# Instruction text sent to the model as the "text" part of the user message,
	# next to the uploaded image. It asks for a bare JSON array of
	# {"item", "price"} objects, which is the shape the /extract/ handler then
	# searches for in the model's reply.
	PROMPT = """
	You are an AI assistant. Extract item names and their prices from the following image.

	Your task is to extract item names and their corresponding prices from the image provided.

	Return ONLY a clean JSON array in this format:
	[
	{"item": "<item_name>", "price": "<price>"},
	...
	]

	⚠️ Guidelines:
	- Do not include any explanation or text before/after the JSON.
	- Include only entries that have both item and price.
	- Preserve original spellings and formatting from the image.
	- If prices are written in ₹, Rs., or INR, keep the symbol as is.
	- Handle both packaged labels (like chips or snacks) and printed/handwritten menus.
	- If there are duplicates or unclear text, skip them.

	Only return the final JSON output, No explanation.

	Make sure each entry has both item and price, and preserve the original spelling.
	"""

	def resize_image(image: Image.Image, max_size=(1024, 1024)) -> Image.Image:
	    """Shrink *image* to fit inside *max_size* and hand the same object back.

	    The work is delegated to ``Image.thumbnail``, which modifies the image in
	    place; the argument itself is returned (not a copy) so the call can be
	    used in an assignment.

	    Args:
	        image: The PIL image to shrink; it is mutated.
	        max_size: ``(width, height)`` bounding box passed to ``thumbnail``.

	    Returns:
	        The very same ``image`` object after resizing.
	    """
	    bounding_box = max_size
	    image.thumbnail(bounding_box)
	    return image


	async def encode_image_to_data_url(file: UploadFile=File(...)) -> str:
	    """Read an uploaded image, downscale and re-encode it, and return a data URL.

	    Args:
	        file: The uploaded image; its full content is read into memory.

	    Returns:
	        A ``data:<mime>;base64,<payload>`` string holding the re-encoded image.

	    Raises:
	        Whatever PIL raises when the upload cannot be opened as an image or
	        cannot be saved back in its detected format.
	    """
	    image = Image.open(BytesIO(await file.read()))

	    # resize_image() returns the same object it was given, so the format that
	    # Image.open() detected is still available on `image` afterwards.
	    image = resize_image(image)

	    # Re-encode in the image's own format at reduced quality.
	    buffered = BytesIO()
	    image.save(buffered, quality=80, format=image.format)
	    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")

	    # Label the payload with the MIME type of the format it was actually
	    # saved in. The client-supplied content type can be absent or wrong, so
	    # it is only a fallback for formats Pillow has no MIME entry for.
	    mime_type = (
	        Image.MIME.get(image.format)
	        or file.content_type
	        or "application/octet-stream"
	    )

	    return f"data:{mime_type};base64,{base64_image}"

	@app.get("/")
	def root():
	    """Liveness endpoint: return a fixed message saying the API is running."""
	    status = {"message": "GLM 4.1V API for menu extraction is running."}
	    return status

	# (connect, read) timeouts in seconds for the upstream model call. Without a
	# timeout `requests` waits indefinitely on a stalled connection.
	_UPSTREAM_TIMEOUT = (10, 300)

	# A JSON array of objects: "[", optional whitespace, "{ ... }", optional
	# whitespace, "]". DOTALL lets the array span several lines. The lazy ".*?"
	# stops at the first "}" that is followed by the closing "]".
	_JSON_ARRAY_RE = re.compile(r"\[\s*\{.*?\}\s*\]", re.DOTALL)


	def _find_json_array(reply):
	    """Return the first ``[ {...} ]`` span found in *reply*, or None."""
	    match = _JSON_ARRAY_RE.search(reply)
	    return match.group(0) if match else None


	@app.post("/extract/")
	async def extract(file: UploadFile = File(...)):
	    """Extract item/price pairs from an uploaded image via the hosted model.

	    The image is turned into a data URL, sent together with PROMPT to the
	    chat-completions endpoint, and the first JSON array of objects found in
	    the model's reply is parsed and returned.

	    Responses (all JSON):
	        200 ``{"menu_items": [...]}`` on success.
	        400 ``{"error": ...}`` when the image cannot be encoded, the upstream
	            call fails or returns an HTTP error, or the reply has no text.
	        404 ``{"error": ..., "model_response": ...}`` when the reply contains
	            no JSON array.
	        500 ``{"error": ..., "raw": ...}`` when the array found is not valid
	            JSON.
	    """
	    try:
	        # Convert uploaded image to base64 URL format
	        image_data_url = await encode_image_to_data_url(file)

	        # Create chat-style payload
	        payload = {
	            "model": "zai-org/GLM-4.1V-9B-Thinking:novita",
	            # Alternative models kept for reference:
	            # "model": "meta-llama/Llama-3.2-11B-Vision-Instruct:together",
	            # "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct:novita",
	            # "model": "llama3.2-vision:11b",
	            "messages": [
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "text", "text": PROMPT},
	                        {"type": "image_url", "image_url": {"url": image_data_url}},
	                    ],
	                }
	            ],
	        }

	        # requests is synchronous; run it in the default executor so the
	        # event loop keeps serving other requests while the model answers.
	        loop = asyncio.get_running_loop()
	        response = await loop.run_in_executor(
	            None,
	            functools.partial(
	                requests.post,
	                API_URL,
	                headers=HEADERS,
	                json=payload,
	                timeout=_UPSTREAM_TIMEOUT,
	            ),
	        )
	        # Report upstream HTTP failures as such instead of failing later on a
	        # missing "choices" key.
	        response.raise_for_status()

	        result = response.json()
	        print("result :", result)
	        reply = result["choices"][0]["message"]["content"]
	        if not isinstance(reply, str):
	            raise ValueError("Model response contained no text content")

	    except Exception as e:
	        # Endpoint boundary: every failure up to here is reported as JSON.
	        return JSONResponse(content={"error": str(e)}, status_code=400)

	    json_str = _find_json_array(reply)
	    if json_str is None:
	        return JSONResponse(
	            status_code=404,
	            content={"error": "No JSON array found in response", "model_response": reply},
	        )

	    try:
	        items = json.loads(json_str)
	    except json.JSONDecodeError:
	        return JSONResponse(
	            status_code=500,
	            content={"error": "Failed to parse JSON", "raw": json_str},
	        )
	    return JSONResponse(content={"menu_items": items})