File size: 4,217 Bytes
cbbe76e
 
26ae6bd
ebe7430
9c79679
 
 
 
 
 
cbbe76e
56d2f3b
cbbe76e
 
 
 
 
 
 
 
 
26ae6bd
 
 
 
 
 
 
cbbe76e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ae6bd
 
 
 
 
 
 
 
 
 
 
bf32df6
26ae6bd
 
25fa2d2
26ae6bd
 
 
 
 
 
cbbe76e
 
7a3c832
cbbe76e
 
 
 
26ae6bd
 
 
 
 
4704563
 
26ae6bd
 
 
 
 
 
 
 
a8eff0f
26ae6bd
 
 
 
 
 
 
 
 
 
 
 
 
742a487
26ae6bd
 
25fa2d2
26ae6bd
cbbe76e
 
26ae6bd
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import base64
import json
import os
import re
from io import BytesIO

import requests
from fastapi import FastAPI, File, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image

# ASGI application object that the route decorators below register handlers on.
app = FastAPI(title="GLM-4.1V-9B-Thinking")

# Enable CORS for frontend interaction (Gradio/Spaces UI).
# NOTE(review): every origin, method and header is allowed ("*") — confirm this
# is intentional before exposing the service beyond a demo front end.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


# Chat-completions endpoint that the /extract/ handler posts its payload to.
API_URL = "https://router.huggingface.co/v1/chat/completions"
# Headers sent with every upstream request. The bearer token is read from the
# `access_token` environment variable when this module is imported, so a
# missing variable raises KeyError at import time rather than per request.
HEADERS = {
    "Authorization": f"Bearer {os.environ['access_token']}",
    "Content-Type": "application/json"
}

# Instruction text sent to the model alongside the image. It asks for a bare
# JSON array of {"item", "price"} objects; the /extract/ handler then searches
# the model's reply for such an array.
PROMPT = """
You are an AI assistant. Extract item names and their prices from the following image.

Your task is to extract item names and their corresponding prices from the image provided.

Return ONLY a clean JSON array in this format:
[
  {"item": "<item_name>", "price": "<price>"},
  ...
]

⚠️ Guidelines:
- Do not include any explanation or text before/after the JSON.
- Include only entries that have both item and price.
- Preserve original spellings and formatting from the image.
- If prices are written in ₹, Rs., or INR, keep the symbol as is.
- Handle both packaged labels (like chips or snacks) and printed/handwritten menus.
- If there are duplicates or unclear text, skip them.

Only return the final JSON output, No explanation.

Make sure each entry has both item and price, and preserve the original spelling.
"""

def resize_image(image: Image.Image, max_size=(1024, 1024)) -> Image.Image:
    """Shrink *image* so it fits inside *max_size* and hand it back.

    The work is delegated to ``Image.thumbnail``, which operates on the image
    it is called on; the object returned here is therefore the same object
    that was passed in, not a copy.

    Args:
        image: Pillow image to downscale (modified in place).
        max_size: ``(width, height)`` bounding box passed to ``thumbnail``.

    Returns:
        The same ``image`` object after ``thumbnail`` has been applied.
    """
    image.thumbnail(size=max_size)
    return image


async def encode_image_to_data_url(file: UploadFile=File(...)) -> str:
    """Downscale an uploaded image and return it as a base64 ``data:`` URL.

    The upload is decoded with Pillow, shrunk with ``resize_image``, re-encoded
    in its original format at ``quality=80`` and wrapped in a
    ``data:<mime>;base64,<payload>`` string.

    Args:
        file: The uploaded image. The ``File(...)`` default is kept only so the
            signature stays unchanged for existing callers.

    Returns:
        A data URL containing the re-encoded image.

    Raises:
        Whatever Pillow raises when the bytes cannot be decoded or re-encoded
        (for example when the upload is not an image).
    """
    raw_bytes = await file.read()

    # The context manager releases the decoder's resources once we are done.
    with Image.open(BytesIO(raw_bytes)) as image:
        # Remember the source format up front; it is needed to re-encode.
        image_format = image.format

        # Preprocessing
        resized = resize_image(image)

        # Compress and convert to bytes
        buffered = BytesIO()
        resized.save(buffered, quality=80, format=image_format)

    # Encode to base64 (getvalue() returns the whole buffer regardless of
    # the current stream position, so no seek is required).
    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # Trust the client's content type only when it names an image type. If it
    # is missing or generic (e.g. application/octet-stream), fall back to the
    # MIME type Pillow has registered for the decoded format, so the URL never
    # starts with "data:None".
    mime_type = file.content_type
    if not (mime_type and mime_type.startswith("image/")):
        mime_type = Image.MIME.get((image_format or "").upper(), mime_type)

    return f"data:{mime_type};base64,{base64_image}"
    
@app.get("/")
def root():
    """Liveness endpoint: reply with a fixed message showing the API is up."""
    status = {"message": "GLM 4.1V API for menu extraction is running."}
    return status

# Upper bound, in seconds, on the upstream model call so that a stalled
# provider cannot hang a request indefinitely.
_UPSTREAM_TIMEOUT_SECONDS = 120

# A JSON array holding at least one object, e.g. ``[ {...}, {...} ]``.
_ITEM_ARRAY_RE = re.compile(r"\[\s*{.*?}\s*\]", re.DOTALL)
# An empty JSON array; consulted only when no populated array is present.
_EMPTY_ARRAY_RE = re.compile(r"\[\s*\]")


def _find_json_array(reply):
    """Return the first JSON-array-looking substring of *reply*, or ``None``."""
    if not isinstance(reply, str):
        # The upstream message content may be null; nothing to search then.
        return None
    match = _ITEM_ARRAY_RE.search(reply) or _EMPTY_ARRAY_RE.search(reply)
    return match.group(0) if match else None


@app.post("/extract/")
async def extract(file: UploadFile = File(...)):
    """Extract item/price pairs from an uploaded menu or label image.

    The image is converted to a data URL, sent with ``PROMPT`` to the
    chat-completions endpoint at ``API_URL``, and the model's reply is searched
    for a JSON array.

    Args:
        file: The uploaded image.

    Returns:
        A ``JSONResponse``:
        - 200 ``{"menu_items": [...]}`` when a JSON array was found and parsed
          (an empty array yields an empty list).
        - 400 ``{"error": ...}`` when the upload cannot be read as an image.
        - 502 ``{"error": ...}`` when the upstream call fails, returns a
          non-success status, or its body lacks the expected fields.
        - 504 ``{"error": ...}`` when the upstream call times out.
        - 500 ``{"error": "Failed to parse JSON", "raw": ...}`` when the
          array-looking text is not valid JSON.
        - 404 ``{"error": "No JSON array found in response", ...}`` when the
          reply contains no array at all.
    """
    try:
        # Convert uploaded image to base64 URL format
        image_data_url = await encode_image_to_data_url(file)
    except Exception as e:
        # Boundary handler: any decode/encode failure is reported as a bad upload.
        return JSONResponse(content={"error": str(e)}, status_code=400)

    # Create chat-style payload
    payload = {
        "model": "zai-org/GLM-4.1V-9B-Thinking:novita",
        # Alternative models that were tried:
        #   meta-llama/Llama-3.2-11B-Vision-Instruct:together
        #   meta-llama/Llama-4-Scout-17B-16E-Instruct:novita
        #   llama3.2-vision:11b
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            }
        ],
    }

    try:
        # requests is blocking; run it in the thread pool so this coroutine
        # does not stall the event loop (and every other request) while the
        # model is generating.
        response = await run_in_threadpool(
            requests.post,
            API_URL,
            headers=HEADERS,
            json=payload,
            timeout=_UPSTREAM_TIMEOUT_SECONDS,
        )
    except requests.Timeout as e:
        return JSONResponse(
            status_code=504,
            content={"error": f"Upstream request timed out: {e}"},
        )
    except requests.RequestException as e:
        return JSONResponse(
            status_code=502,
            content={"error": f"Upstream request failed: {e}"},
        )

    if not response.ok:
        # Surface the provider's own error body instead of failing later with
        # an uninformative KeyError on "choices".
        return JSONResponse(
            status_code=502,
            content={
                "error": f"Upstream returned HTTP {response.status_code}",
                "detail": response.text,
            },
        )

    try:
        result = response.json()
        print("result :", result)
        reply = result["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        return JSONResponse(
            status_code=502,
            content={"error": f"Unexpected upstream response: {e!r}"},
        )

    json_str = _find_json_array(reply)
    if json_str is None:
        return JSONResponse(
            status_code=404,
            content={"error": "No JSON array found in response", "model_response": reply},
        )

    try:
        items = json.loads(json_str)
    except json.JSONDecodeError:
        return JSONResponse(
            status_code=500,
            content={"error": "Failed to parse JSON", "raw": json_str},
        )
    return JSONResponse(content={"menu_items": items})