amrnabih-FullTeck commited on
Commit
448953f
·
1 Parent(s): 614f101

Add Qari-OCR Gradio app v3

Browse files
Files changed (2) hide show
  1. app.py +29 -53
  2. requirements.txt +3 -4
app.py CHANGED
@@ -4,28 +4,29 @@
4
  import gradio as gr
5
  from PIL import Image
6
  import torch
7
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
8
- from qwen_vl_utils import process_vision_info
9
 
10
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
11
- # 1) ุชู†ุตูŠุจ ุงู„ุญุฒู… (ูŠููุถู„ ุฅุถุงูุชู‡ุง ููŠ requirements.txt ุจุฏู„ุงู‹ ู…ู† !pip ุฏุงุฎู„ ุงู„ูƒูˆุฏ)
12
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
13
- # !pip install transformers qwen_vl_utils accelerate>=0.26.0 PEFT -U
14
- # !pip install -U bitsandbytes
15
- # !pip install gradio pillow
 
 
 
16
 
17
- # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
18
- # 2) ุชุญู…ูŠู„ ุงู„ู†ู…ูˆุฐุฌ ูˆุงู„ู…ุนุงู„ุฌ
19
- # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
20
  model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-Arabic-2B-Instruct"
21
-
22
  model = Qwen2VLForConditionalGeneration.from_pretrained(
23
  model_name,
24
- torch_dtype="auto",
25
  device_map="auto",
26
  trust_remote_code=True
27
  )
28
- processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
 
 
29
 
30
  max_tokens = 2000
31
  prompt = (
@@ -35,59 +36,34 @@ prompt = (
35
  )
36
 
37
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
38
- # 3) ุฏุงู„ุฉ ุงู„ู€OCR
39
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
40
  def ocr_from_image(img: Image.Image):
41
- # ุชุฃูƒุฏ ู…ู† ุตูŠุบุฉ RGB
42
- src = "temp_image.png"
43
- img.convert("RGB").save(src)
44
 
45
- messages = [
46
- {
47
- "role": "user",
48
- "content": [
49
- {"type": "image", "image": f"file://{src}"},
50
- {"type": "text", "text": prompt},
51
- ],
52
- }
53
- ]
54
- # ุฌู‡ู‘ุฒ ุงู„ุฑุณุงู„ุฉ
55
- text = processor.apply_chat_template(
56
- messages, tokenize=False, add_generation_prompt=True
57
- )
58
- image_inputs, video_inputs = process_vision_info(messages)
59
  inputs = processor(
60
- text=[text],
61
- images=image_inputs,
62
- videos=video_inputs,
63
- padding=True,
64
  return_tensors="pt",
 
65
  ).to(model.device)
66
 
67
- # ุงุณุชุฏุนุงุก ุงู„ู†ู…ูˆุฐุฌ
68
- generated_ids = model.generate(**inputs, max_new_tokens=max_tokens)
69
- # ู‚ุต ุงู„ุชูˆูƒู†ุฒ ุงู„ุฃุตู„ูŠุฉ
70
- generated_ids_trimmed = [
71
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
72
- ]
73
- # ููƒู‘ ุงู„ุชุดููŠุฑ
74
- output_text = processor.batch_decode(
75
- generated_ids_trimmed,
76
  skip_special_tokens=True,
77
  clean_up_tokenization_spaces=False
78
  )[0]
79
-
80
- # ู†ุธู‘ู ุงู„ุตูˆุฑุฉ ุงู„ู…ุคู‚ุชุฉ
81
- try:
82
- import os
83
- os.remove(src)
84
- except:
85
- pass
86
-
87
- return output_text
88
 
89
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
90
- # 4) ุจู†ุงุก ูˆุงุฌู‡ุฉ Gradio
91
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
92
  demo = gr.Interface(
93
  fn=ocr_from_image,
 
4
  import gradio as gr
5
  from PIL import Image
6
  import torch
7
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
 
8
 
9
# ---------------------------------------------------------------
# 1) 4-bit quantization settings (NF4 + double quantization)
# ---------------------------------------------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Arabic OCR fine-tune of Qwen2-VL-2B-Instruct.
model_name = "NAMAA-Space/Qari-OCR-0.2.2.1-Arabic-2B-Instruct"

# Load the vision-language model quantized to 4-bit and let accelerate
# place it on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Matching processor (tokenizer + image preprocessor) for the model.
processor = AutoProcessor.from_pretrained(
    model_name,
    trust_remote_code=True,
)
30
 
31
  max_tokens = 2000
32
  prompt = (
 
36
  )
37
 
38
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
39
+ # 2) دالة الـOCR بدون qwen_vl_utils
40
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
41
def ocr_from_image(img: Image.Image):
    """Run Arabic OCR on a PIL image and return the extracted text.

    Builds a Qwen2-VL chat message containing the image plus the OCR
    prompt, generates deterministically with the quantized model, and
    decodes only the newly generated tokens.

    Args:
        img: Input page image (any PIL mode; converted to RGB).

    Returns:
        The OCR text produced by the model, without the echoed prompt.
    """
    img = img.convert("RGB")  # model expects 3-channel input

    # Qwen2-VL needs a chat-formatted prompt with an image placeholder so
    # the processor can align vision tokens with the text.
    # NOTE: the previous `processor.chat_template_format(prompt)` call is
    # not a real AutoProcessor method — `apply_chat_template` is the API.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    formatted = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        images=[img],
        text=[formatted],
        return_tensors="pt",
        padding=True,
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=False,  # greedy decoding: OCR should be deterministic
    )

    # Trim the prompt tokens from each sequence; decoding the raw outputs
    # would leak the chat prompt into the OCR result.
    trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, outputs)
    ]
    result = processor.batch_decode(
        trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return result
 
 
 
 
 
 
 
 
64
 
65
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
66
+ # 3) واجهة Gradio
67
  # โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”โ€”
68
  demo = gr.Interface(
69
  fn=ocr_from_image,
requirements.txt CHANGED
@@ -1,8 +1,7 @@
1
- torch
2
- torchvision
3
  transformers
 
 
4
  bitsandbytes
5
- peft
6
- safetensors
7
  gradio
8
  pillow
 
 
 
 
1
  transformers
2
+ accelerate>=0.26.0
3
+ PEFT
4
  bitsandbytes
 
 
5
  gradio
6
  pillow
7
+ torch