Kwai-Keye committed on
Commit f88c57d · verified · 1 Parent(s): af167f5

Add files using upload-large-folder tool

Files changed (50)
  1. chat_template.jinja +5 -0
  2. config.json +270 -0
  3. configuration_deepseek.py +266 -0
  4. generation_config.json +13 -0
  5. image_processing_keye.py +541 -0
  6. merges.txt +0 -0
  7. model-00001-of-00136.safetensors +3 -0
  8. model-00003-of-00136.safetensors +3 -0
  9. model-00004-of-00136.safetensors +3 -0
  10. model-00009-of-00136.safetensors +3 -0
  11. model-00021-of-00136.safetensors +3 -0
  12. model-00035-of-00136.safetensors +3 -0
  13. model-00039-of-00136.safetensors +3 -0
  14. model-00041-of-00136.safetensors +3 -0
  15. model-00046-of-00136.safetensors +3 -0
  16. model-00048-of-00136.safetensors +3 -0
  17. model-00050-of-00136.safetensors +3 -0
  18. model-00057-of-00136.safetensors +3 -0
  19. model-00059-of-00136.safetensors +3 -0
  20. model-00061-of-00136.safetensors +3 -0
  21. model-00063-of-00136.safetensors +3 -0
  22. model-00066-of-00136.safetensors +3 -0
  23. model-00068-of-00136.safetensors +3 -0
  24. model-00069-of-00136.safetensors +3 -0
  25. model-00073-of-00136.safetensors +3 -0
  26. model-00077-of-00136.safetensors +3 -0
  27. model-00080-of-00136.safetensors +3 -0
  28. model-00083-of-00136.safetensors +3 -0
  29. model-00086-of-00136.safetensors +3 -0
  30. model-00096-of-00136.safetensors +3 -0
  31. model-00101-of-00136.safetensors +3 -0
  32. model-00102-of-00136.safetensors +3 -0
  33. model-00103-of-00136.safetensors +3 -0
  34. model-00109-of-00136.safetensors +3 -0
  35. model-00110-of-00136.safetensors +3 -0
  36. model-00113-of-00136.safetensors +3 -0
  37. model-00116-of-00136.safetensors +3 -0
  38. model-00124-of-00136.safetensors +3 -0
  39. model-00128-of-00136.safetensors +3 -0
  40. model-00130-of-00136.safetensors +3 -0
  41. model-00131-of-00136.safetensors +3 -0
  42. model-00135-of-00136.safetensors +3 -0
  43. model-00136-of-00136.safetensors +3 -0
  44. model.safetensors.index.json +0 -0
  45. preprocessor_config.json +33 -0
  46. processing_keye.py +494 -0
  47. processor_config.json +6 -0
  48. tokenizer.json +0 -0
  49. tokenizer_config.json +0 -0
  50. vocab.json +0 -0
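The 50 files above cover 37 of the model's 136 safetensors shards (roughly 5 GB each, per the LFS pointers further down) plus the configuration, processor, tokenizer, and custom-code files. A minimal sketch, assuming the upload is published under a repo id like `Kwai-Keye/Keye-VL-671B-A37B`, of fetching only the lightweight files with `huggingface_hub`:

```python
# Hypothetical repo id; allow_patterns keeps the ~5 GB weight shards out of the download.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="Kwai-Keye/Keye-VL-671B-A37B",                  # assumption, not confirmed by this page
    allow_patterns=["*.json", "*.py", "*.jinja", "*.txt"],   # configs, custom code, tokenizer files only
)
print(local_dir)
```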
chat_template.jinja ADDED
@@ -0,0 +1,5 @@
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not add_vision_id is defined %}{% set add_vision_id = true %}{% endif %}{% if not thinking is defined %}{% set thinking = 'auto' %}{% endif %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set ns = namespace(is_first_tool_call=true, is_tool_output=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- if not ns.is_first_sp -%}{% set ns.system_prompt = ns.system_prompt + '
+
+ ' %}{% endif -%}{%- set sys_content = message['content'] -%}{%- if sys_content is string -%}{%- set sys_text = sys_content -%}{%- elif sys_content is iterable -%}{%- set sys_text = '' -%}{%- for part in sys_content -%}{%- if part is mapping and part.get('type') == 'text' -%}{%- set sys_text = sys_text + part.get('text', '') -%}{%- endif -%}{%- endfor -%}{%- else -%}{%- set sys_text = '' -%}{%- endif -%}{%- set ns.system_prompt = ns.system_prompt + sys_text -%}{%- set ns.is_first_sp = false -%}{%- endif -%}{%- endfor -%}{% if ns.system_prompt == '' %}{% set ns.system_prompt = '' %}{% endif %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages -%}{%- set role = message['role'] -%}{%- set content = message.get('content') -%}{%- if content is string -%}{%- set content_parts = [{'type': 'text', 'text': content}] -%}{%- elif content is none -%}{%- set content_parts = [] -%}{%- else -%}{%- set content_parts = content -%}{%- endif -%}{%- set text_content = namespace(value='') -%}{%- set vision_content = namespace(value='') -%}{%- for part in content_parts -%}{%- if part.type == 'text' -%}{%- set text_content.value = text_content.value + part.text -%}{%- elif part.type == 'image' or 'image_url' in part -%}{%- set image_count.value = image_count.value + 1 -%}{%- if add_vision_id -%}{%- set vision_content.value = vision_content.value + '' -%}{%- endif -%}{%- set vision_content.value = vision_content.value + '<|vision_start|><|image_pad|><|vision_end|>' -%}{%- elif part.type == 'video' or 'video_url' in part -%}{%- set video_count.value = video_count.value + 1 -%}{%- if add_vision_id -%}{%- set vision_content.value = vision_content.value + '' -%}{%- endif -%}{%- set vision_content.value = vision_content.value + '<|vision_start|><|video_pad|><|vision_end|>' -%}{%- endif -%}{%- endfor -%}{%- if role == 'user' -%}{%- set ns.is_tool_output = false -%}{%- set ns.is_last_user = true -%}<|User|>{{ vision_content.value }}{{ text_content.value }}<|Assistant|>{%- if thinking == 'True' or thinking == True -%}{{ '<think>' }}{%- elif thinking == 'False' or thinking == False -%}{{ '</think>' }}{%- else -%}{{ '' }}{%- endif -%}{%- elif role == 'assistant' -%}{%- set ns.is_last_user = false -%}{%- if ns.is_tool_output -%}<|tool▁outputs▁end|>{% endif -%}{%- set ns.is_tool_output = false -%}{{ vision_content.value }}{{ text_content.value }}{%- if message.get('tool_calls') -%}<|tool▁calls▁begin|>{%- set ns.is_first_tool_call = true -%}{%- for tool in message['tool_calls'] -%}{%- if not ns.is_first_tool_call %}{{'
+ '}}{% endif -%}<|tool▁call▁begin|>{{ tool['type'] }}<|tool▁sep|>{{ tool['function']['name'] }}json{{ tool['function']['arguments'] }}<|tool▁call▁end|>{%- set ns.is_first_tool_call = false -%}{%- endfor -%}<|tool▁calls▁end|><|end▁of▁sentence|>{%- else -%}<|end▁of▁sentence|>{%- endif -%}{%- elif role == 'tool' -%}{%- set ns.is_last_user = false -%}{%- if not ns.is_tool_output -%}<|tool▁outputs▁begin|>{% set ns.is_tool_output = true %}{% else %}{{ '
+ ' }}{% endif -%}<|tool▁output▁begin|>{{ content }}<|tool▁output▁end|>{%- endif -%}{%- endfor -%}{%- if ns.is_tool_output -%}<|tool▁outputs▁end|>{% endif -%}{%- if add_generation_prompt and not ns.is_last_user and not ns.is_tool_output -%}<|Assistant|>{%- if thinking == 'True' or thinking == True -%}{{ '<think>' }}{%- elif thinking == 'False' or thinking == False -%}{{ '</think>' }}{%- else -%}{{ '' }}{%- endif -%}{% endif -%}
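The template above concatenates system prompts, expands image and video parts into `<|vision_start|>…<|vision_end|>` placeholders, emits DeepSeek-style `<|tool▁…|>` markers for tool calls and outputs, and opens or closes a `<think>` block depending on a `thinking` variable (`'auto'` by default). A minimal rendering sketch; the repo id is an assumption, and it relies on `apply_chat_template` forwarding extra keyword arguments such as `thinking` into the Jinja context:

```python
from transformers import AutoTokenizer

# Assumed repo id; trust_remote_code because the repo ships custom configuration/processing code.
tokenizer = AutoTokenizer.from_pretrained("Kwai-Keye/Keye-VL-671B-A37B", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},  # rendered as <|vision_start|><|image_pad|><|vision_end|>
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# thinking=True makes the template append '<think>' after <|Assistant|>.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, thinking=True
)
print(prompt)
```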
config.json ADDED
@@ -0,0 +1,270 @@
1
+ {
2
+ "architectures": [
3
+ "KeyeVLMoeForConditionalGeneration"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.DeepseekR1Config"
9
+ },
10
+ "bos_token_id": 0,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 1,
13
+ "ep_size": 1,
14
+ "fast_video_token_id": 128021,
15
+ "first_k_dense_replace": 3,
16
+ "head_dim": 128,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 7168,
19
+ "image_token_id": 128010,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 18432,
22
+ "kv_lora_rank": 512,
23
+ "max_position_embeddings": 163840,
24
+ "model_type": "deepseek_r1",
25
+ "moe_intermediate_size": 2048,
26
+ "moe_layer_freq": 1,
27
+ "n_group": 8,
28
+ "n_routed_experts": 256,
29
+ "n_shared_experts": 1,
30
+ "norm_topk_prob": true,
31
+ "num_attention_heads": 128,
32
+ "num_experts_per_tok": 8,
33
+ "num_hidden_layers": 61,
34
+ "num_key_value_heads": 128,
35
+ "num_nextn_predict_layers": 1,
36
+ "q_lora_rank": 1536,
37
+ "qk_nope_head_dim": 128,
38
+ "qk_rope_head_dim": 64,
39
+ "quantization_config": {
40
+ "activation_scheme": "dynamic",
41
+ "fmt": "e4m3",
42
+ "quant_method": "fp8",
43
+ "ignored_layers": [
44
+ "mlp_AR.linear_1",
45
+ "mlp_AR.linear_2",
46
+ "visual.vision_model.encoder.layers.0.self_attn.k_proj",
47
+ "visual.vision_model.encoder.layers.0.self_attn.v_proj",
48
+ "visual.vision_model.encoder.layers.0.self_attn.q_proj",
49
+ "visual.vision_model.encoder.layers.0.self_attn.out_proj",
50
+ "visual.vision_model.encoder.layers.0.mlp.fc1",
51
+ "visual.vision_model.encoder.layers.0.mlp.fc2",
52
+ "visual.vision_model.encoder.layers.1.self_attn.k_proj",
53
+ "visual.vision_model.encoder.layers.1.self_attn.v_proj",
54
+ "visual.vision_model.encoder.layers.1.self_attn.q_proj",
55
+ "visual.vision_model.encoder.layers.1.self_attn.out_proj",
56
+ "visual.vision_model.encoder.layers.1.mlp.fc1",
57
+ "visual.vision_model.encoder.layers.1.mlp.fc2",
58
+ "visual.vision_model.encoder.layers.2.self_attn.k_proj",
59
+ "visual.vision_model.encoder.layers.2.self_attn.v_proj",
60
+ "visual.vision_model.encoder.layers.2.self_attn.q_proj",
61
+ "visual.vision_model.encoder.layers.2.self_attn.out_proj",
62
+ "visual.vision_model.encoder.layers.2.mlp.fc1",
63
+ "visual.vision_model.encoder.layers.2.mlp.fc2",
64
+ "visual.vision_model.encoder.layers.3.self_attn.k_proj",
65
+ "visual.vision_model.encoder.layers.3.self_attn.v_proj",
66
+ "visual.vision_model.encoder.layers.3.self_attn.q_proj",
67
+ "visual.vision_model.encoder.layers.3.self_attn.out_proj",
68
+ "visual.vision_model.encoder.layers.3.mlp.fc1",
69
+ "visual.vision_model.encoder.layers.3.mlp.fc2",
70
+ "visual.vision_model.encoder.layers.4.self_attn.k_proj",
71
+ "visual.vision_model.encoder.layers.4.self_attn.v_proj",
72
+ "visual.vision_model.encoder.layers.4.self_attn.q_proj",
73
+ "visual.vision_model.encoder.layers.4.self_attn.out_proj",
74
+ "visual.vision_model.encoder.layers.4.mlp.fc1",
75
+ "visual.vision_model.encoder.layers.4.mlp.fc2",
76
+ "visual.vision_model.encoder.layers.5.self_attn.k_proj",
77
+ "visual.vision_model.encoder.layers.5.self_attn.v_proj",
78
+ "visual.vision_model.encoder.layers.5.self_attn.q_proj",
79
+ "visual.vision_model.encoder.layers.5.self_attn.out_proj",
80
+ "visual.vision_model.encoder.layers.5.mlp.fc1",
81
+ "visual.vision_model.encoder.layers.5.mlp.fc2",
82
+ "visual.vision_model.encoder.layers.6.self_attn.k_proj",
83
+ "visual.vision_model.encoder.layers.6.self_attn.v_proj",
84
+ "visual.vision_model.encoder.layers.6.self_attn.q_proj",
85
+ "visual.vision_model.encoder.layers.6.self_attn.out_proj",
86
+ "visual.vision_model.encoder.layers.6.mlp.fc1",
87
+ "visual.vision_model.encoder.layers.6.mlp.fc2",
88
+ "visual.vision_model.encoder.layers.7.self_attn.k_proj",
89
+ "visual.vision_model.encoder.layers.7.self_attn.v_proj",
90
+ "visual.vision_model.encoder.layers.7.self_attn.q_proj",
91
+ "visual.vision_model.encoder.layers.7.self_attn.out_proj",
92
+ "visual.vision_model.encoder.layers.7.mlp.fc1",
93
+ "visual.vision_model.encoder.layers.7.mlp.fc2",
94
+ "visual.vision_model.encoder.layers.8.self_attn.k_proj",
95
+ "visual.vision_model.encoder.layers.8.self_attn.v_proj",
96
+ "visual.vision_model.encoder.layers.8.self_attn.q_proj",
97
+ "visual.vision_model.encoder.layers.8.self_attn.out_proj",
98
+ "visual.vision_model.encoder.layers.8.mlp.fc1",
99
+ "visual.vision_model.encoder.layers.8.mlp.fc2",
100
+ "visual.vision_model.encoder.layers.9.self_attn.k_proj",
101
+ "visual.vision_model.encoder.layers.9.self_attn.v_proj",
102
+ "visual.vision_model.encoder.layers.9.self_attn.q_proj",
103
+ "visual.vision_model.encoder.layers.9.self_attn.out_proj",
104
+ "visual.vision_model.encoder.layers.9.mlp.fc1",
105
+ "visual.vision_model.encoder.layers.9.mlp.fc2",
106
+ "visual.vision_model.encoder.layers.10.self_attn.k_proj",
107
+ "visual.vision_model.encoder.layers.10.self_attn.v_proj",
108
+ "visual.vision_model.encoder.layers.10.self_attn.q_proj",
109
+ "visual.vision_model.encoder.layers.10.self_attn.out_proj",
110
+ "visual.vision_model.encoder.layers.10.mlp.fc1",
111
+ "visual.vision_model.encoder.layers.10.mlp.fc2",
112
+ "visual.vision_model.encoder.layers.11.self_attn.k_proj",
113
+ "visual.vision_model.encoder.layers.11.self_attn.v_proj",
114
+ "visual.vision_model.encoder.layers.11.self_attn.q_proj",
115
+ "visual.vision_model.encoder.layers.11.self_attn.out_proj",
116
+ "visual.vision_model.encoder.layers.11.mlp.fc1",
117
+ "visual.vision_model.encoder.layers.11.mlp.fc2",
118
+ "visual.vision_model.encoder.layers.12.self_attn.k_proj",
119
+ "visual.vision_model.encoder.layers.12.self_attn.v_proj",
120
+ "visual.vision_model.encoder.layers.12.self_attn.q_proj",
121
+ "visual.vision_model.encoder.layers.12.self_attn.out_proj",
122
+ "visual.vision_model.encoder.layers.12.mlp.fc1",
123
+ "visual.vision_model.encoder.layers.12.mlp.fc2",
124
+ "visual.vision_model.encoder.layers.13.self_attn.k_proj",
125
+ "visual.vision_model.encoder.layers.13.self_attn.v_proj",
126
+ "visual.vision_model.encoder.layers.13.self_attn.q_proj",
127
+ "visual.vision_model.encoder.layers.13.self_attn.out_proj",
128
+ "visual.vision_model.encoder.layers.13.mlp.fc1",
129
+ "visual.vision_model.encoder.layers.13.mlp.fc2",
130
+ "visual.vision_model.encoder.layers.14.self_attn.k_proj",
131
+ "visual.vision_model.encoder.layers.14.self_attn.v_proj",
132
+ "visual.vision_model.encoder.layers.14.self_attn.q_proj",
133
+ "visual.vision_model.encoder.layers.14.self_attn.out_proj",
134
+ "visual.vision_model.encoder.layers.14.mlp.fc1",
135
+ "visual.vision_model.encoder.layers.14.mlp.fc2",
136
+ "visual.vision_model.encoder.layers.15.self_attn.k_proj",
137
+ "visual.vision_model.encoder.layers.15.self_attn.v_proj",
138
+ "visual.vision_model.encoder.layers.15.self_attn.q_proj",
139
+ "visual.vision_model.encoder.layers.15.self_attn.out_proj",
140
+ "visual.vision_model.encoder.layers.15.mlp.fc1",
141
+ "visual.vision_model.encoder.layers.15.mlp.fc2",
142
+ "visual.vision_model.encoder.layers.16.self_attn.k_proj",
143
+ "visual.vision_model.encoder.layers.16.self_attn.v_proj",
144
+ "visual.vision_model.encoder.layers.16.self_attn.q_proj",
145
+ "visual.vision_model.encoder.layers.16.self_attn.out_proj",
146
+ "visual.vision_model.encoder.layers.16.mlp.fc1",
147
+ "visual.vision_model.encoder.layers.16.mlp.fc2",
148
+ "visual.vision_model.encoder.layers.17.self_attn.k_proj",
149
+ "visual.vision_model.encoder.layers.17.self_attn.v_proj",
150
+ "visual.vision_model.encoder.layers.17.self_attn.q_proj",
151
+ "visual.vision_model.encoder.layers.17.self_attn.out_proj",
152
+ "visual.vision_model.encoder.layers.17.mlp.fc1",
153
+ "visual.vision_model.encoder.layers.17.mlp.fc2",
154
+ "visual.vision_model.encoder.layers.18.self_attn.k_proj",
155
+ "visual.vision_model.encoder.layers.18.self_attn.v_proj",
156
+ "visual.vision_model.encoder.layers.18.self_attn.q_proj",
157
+ "visual.vision_model.encoder.layers.18.self_attn.out_proj",
158
+ "visual.vision_model.encoder.layers.18.mlp.fc1",
159
+ "visual.vision_model.encoder.layers.18.mlp.fc2",
160
+ "visual.vision_model.encoder.layers.19.self_attn.k_proj",
161
+ "visual.vision_model.encoder.layers.19.self_attn.v_proj",
162
+ "visual.vision_model.encoder.layers.19.self_attn.q_proj",
163
+ "visual.vision_model.encoder.layers.19.self_attn.out_proj",
164
+ "visual.vision_model.encoder.layers.19.mlp.fc1",
165
+ "visual.vision_model.encoder.layers.19.mlp.fc2",
166
+ "visual.vision_model.encoder.layers.20.self_attn.k_proj",
167
+ "visual.vision_model.encoder.layers.20.self_attn.v_proj",
168
+ "visual.vision_model.encoder.layers.20.self_attn.q_proj",
169
+ "visual.vision_model.encoder.layers.20.self_attn.out_proj",
170
+ "visual.vision_model.encoder.layers.20.mlp.fc1",
171
+ "visual.vision_model.encoder.layers.20.mlp.fc2",
172
+ "visual.vision_model.encoder.layers.21.self_attn.k_proj",
173
+ "visual.vision_model.encoder.layers.21.self_attn.v_proj",
174
+ "visual.vision_model.encoder.layers.21.self_attn.q_proj",
175
+ "visual.vision_model.encoder.layers.21.self_attn.out_proj",
176
+ "visual.vision_model.encoder.layers.21.mlp.fc1",
177
+ "visual.vision_model.encoder.layers.21.mlp.fc2",
178
+ "visual.vision_model.encoder.layers.22.self_attn.k_proj",
179
+ "visual.vision_model.encoder.layers.22.self_attn.v_proj",
180
+ "visual.vision_model.encoder.layers.22.self_attn.q_proj",
181
+ "visual.vision_model.encoder.layers.22.self_attn.out_proj",
182
+ "visual.vision_model.encoder.layers.22.mlp.fc1",
183
+ "visual.vision_model.encoder.layers.22.mlp.fc2",
184
+ "visual.vision_model.encoder.layers.23.self_attn.k_proj",
185
+ "visual.vision_model.encoder.layers.23.self_attn.v_proj",
186
+ "visual.vision_model.encoder.layers.23.self_attn.q_proj",
187
+ "visual.vision_model.encoder.layers.23.self_attn.out_proj",
188
+ "visual.vision_model.encoder.layers.23.mlp.fc1",
189
+ "visual.vision_model.encoder.layers.23.mlp.fc2",
190
+ "visual.vision_model.encoder.layers.24.self_attn.k_proj",
191
+ "visual.vision_model.encoder.layers.24.self_attn.v_proj",
192
+ "visual.vision_model.encoder.layers.24.self_attn.q_proj",
193
+ "visual.vision_model.encoder.layers.24.self_attn.out_proj",
194
+ "visual.vision_model.encoder.layers.24.mlp.fc1",
195
+ "visual.vision_model.encoder.layers.24.mlp.fc2",
196
+ "visual.vision_model.encoder.layers.25.self_attn.k_proj",
197
+ "visual.vision_model.encoder.layers.25.self_attn.v_proj",
198
+ "visual.vision_model.encoder.layers.25.self_attn.q_proj",
199
+ "visual.vision_model.encoder.layers.25.self_attn.out_proj",
200
+ "visual.vision_model.encoder.layers.25.mlp.fc1",
201
+ "visual.vision_model.encoder.layers.25.mlp.fc2",
202
+ "visual.vision_model.encoder.layers.26.self_attn.k_proj",
203
+ "visual.vision_model.encoder.layers.26.self_attn.v_proj",
204
+ "visual.vision_model.encoder.layers.26.self_attn.q_proj",
205
+ "visual.vision_model.encoder.layers.26.self_attn.out_proj",
206
+ "visual.vision_model.encoder.layers.26.mlp.fc1",
207
+ "visual.vision_model.encoder.layers.26.mlp.fc2",
208
+ "lm_head"
209
+ ],
210
+ "weight_block_size": [
211
+ 128,
212
+ 128
213
+ ],
214
+ "modules_to_not_convert": [
215
+ "mlp_AR",
216
+ "visual.vision_model",
217
+ "lm_head"
218
+ ]
219
+ },
220
+ "rms_norm_eps": 1e-06,
221
+ "rope_scaling": {
222
+ "beta_fast": 32,
223
+ "beta_slow": 1,
224
+ "factor": 40,
225
+ "mscale": 1.0,
226
+ "mscale_all_dim": 1.0,
227
+ "original_max_position_embeddings": 4096,
228
+ "type": "yarn"
229
+ },
230
+ "rope_theta": 10000,
231
+ "routed_scaling_factor": 2.5,
232
+ "scoring_func": "sigmoid",
233
+ "tie_word_embeddings": false,
234
+ "topk_group": 4,
235
+ "topk_method": "noaux_tc",
236
+ "transformers_version": "4.56.2",
237
+ "use_cache": true,
238
+ "v_head_dim": 128,
239
+ "video_token_id": 128011,
240
+ "vision_config": {
241
+ "_attn_implementation_autoset": true,
242
+ "architectures": [
243
+ "SiglipVisionModel"
244
+ ],
245
+ "attention_dropout": 0.0,
246
+ "auto_map": {
247
+ "AutoConfig": "configuration_deepseek.KeyeVisionConfig",
248
+ "AutoModel": "modeling_deepseek.SiglipVisionModel"
249
+ },
250
+ "has_learnable_position_embedding": true,
251
+ "hidden_act": "gelu_pytorch_tanh",
252
+ "hidden_size": 1152,
253
+ "image_size": 384,
254
+ "intermediate_size": 4304,
255
+ "layer_norm_eps": 1e-06,
256
+ "model_type": "Keye",
257
+ "num_attention_heads": 16,
258
+ "num_channels": 3,
259
+ "num_hidden_layers": 27,
260
+ "patch_size": 14,
261
+ "rope_theta": 10000,
262
+ "spatial_merge_size": 2,
263
+ "temporal_patch_size": 2,
264
+ "tokens_per_second": 2
265
+ },
266
+ "vision_end_token_id": 128008,
267
+ "vision_start_token_id": 128007,
268
+ "vision_token_id": 128009,
269
+ "vocab_size": 129280
270
+ }
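The config describes a DeepSeek-R1-style MoE decoder (61 layers, MLA with `kv_lora_rank=512` and `q_lora_rank=1536`, 256 routed experts with 8 active per token and 3 leading dense layers) attached to a SigLIP-style vision tower, with FP8 block quantization that leaves the vision tower, the `mlp_AR` projector, and `lm_head` unquantized. A minimal loading sketch; the local path is an assumption, and `trust_remote_code=True` is required because `auto_map` routes `AutoConfig` to `configuration_deepseek.DeepseekR1Config`:

```python
from transformers import AutoConfig

# Assumed local checkout of the files in this commit.
config = AutoConfig.from_pretrained("./Keye-VL-671B-A37B", trust_remote_code=True)

print(config.model_type)                                    # "deepseek_r1"
print(config.n_routed_experts, config.num_experts_per_tok)  # 256 routed experts, 8 active per token
print(config.first_k_dense_replace)                         # 3 dense layers before the MoE blocks
print(config.vision_config.hidden_size)                     # 1152-dim SigLIP-style vision tower
```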
configuration_deepseek.py ADDED
@@ -0,0 +1,266 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.modeling_rope_utils import rope_config_validation
22
+
23
+
24
+
25
+ class KeyeVisionConfig(PretrainedConfig):
26
+ model_type = "Keye"
27
+ base_config_key = "vision_config"
28
+
29
+ def __init__(
30
+ self,
31
+ hidden_size=768,
32
+ intermediate_size=3072,
33
+ num_hidden_layers=12,
34
+ num_attention_heads=12,
35
+ num_channels=3,
36
+ image_size=224,
37
+ patch_size=14,
38
+ hidden_act="gelu_pytorch_tanh",
39
+ layer_norm_eps=1e-6,
40
+ attention_dropout=0.0,
41
+ spatial_merge_size=2,
42
+ temporal_patch_size=2,
43
+ tokens_per_second=2,
44
+ **kwargs,
45
+ ):
46
+ super().__init__(**kwargs)
47
+
48
+ self.hidden_size = hidden_size
49
+ self.intermediate_size = intermediate_size
50
+ self.num_hidden_layers = num_hidden_layers
51
+ self.num_attention_heads = num_attention_heads
52
+ self.num_channels = num_channels
53
+ self.patch_size = patch_size
54
+ self.image_size = image_size
55
+ self.attention_dropout = attention_dropout
56
+ self.layer_norm_eps = layer_norm_eps
57
+ self.hidden_act = hidden_act
58
+ self.spatial_merge_size = spatial_merge_size
59
+ self.temporal_patch_size = temporal_patch_size
60
+ self.tokens_per_second = tokens_per_second
61
+
62
+
63
+ class DeepseekR1Config(PretrainedConfig):
64
+ r"""
65
+ This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
66
+ KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
67
+
68
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
69
+ documentation from [`PretrainedConfig`] for more information.
70
+
71
+
72
+ Args:
73
+ vocab_size (`int`, *optional*, defaults to 129280):
74
+ Vocabulary size of the Keye model. Defines the number of different tokens that can be represented by the
75
+ `inputs_ids` passed when calling [`KeyeModel`]
76
+ hidden_size (`int`, *optional*, defaults to 7168):
77
+ Dimension of the hidden representations.
78
+ intermediate_size (`int`, *optional*, defaults to 18432):
79
+ Dimension of the MLP representations.
80
+ num_hidden_layers (`int`, *optional*, defaults to 61):
81
+ Number of hidden layers in the Transformer encoder.
82
+ num_attention_heads (`int`, *optional*, defaults to 128):
83
+ Number of attention heads for each attention layer in the Transformer encoder.
84
+ num_key_value_heads (`int`, *optional*, defaults to 128):
85
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
86
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
87
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
88
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
89
+ by meanpooling all the original heads within that group. For more details checkout [this
90
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `128`.
91
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
92
+ The non-linear activation function (function or string) in the decoder.
93
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
94
+ The maximum sequence length that this model might ever be used with.
95
+ initializer_range (`float`, *optional*, defaults to 0.02):
96
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
97
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
98
+ The epsilon used by the rms normalization layers.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
101
+ relevant if `config.is_decoder=True`.
102
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
103
+ Whether the model's input and output word embeddings should be tied.
104
+ rope_theta (`float`, *optional*, defaults to 10000.0):
105
+ The base period of the RoPE embeddings.
106
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
107
+ Whether to use sliding window attention.
108
+ sliding_window (`int`, *optional*, defaults to 4096):
109
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
110
+ max_window_layers (`int`, *optional*, defaults to 80):
111
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
112
+ attention_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for the attention probabilities.
114
+ vision_config (`Dict`, *optional*):
115
+ The config for the visual encoder initialization.
116
+ rope_scaling (`Dict`, *optional*):
117
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
118
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
119
+ accordingly.
120
+ Expected contents:
121
+ `rope_type` (`str`):
122
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
123
+ 'llama3'], with 'default' being the original RoPE implementation.
124
+ `factor` (`float`, *optional*):
125
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
126
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
127
+ original maximum pre-trained length.
128
+ `original_max_position_embeddings` (`int`, *optional*):
129
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
130
+ pretraining.
131
+ `attention_factor` (`float`, *optional*):
132
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
133
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
134
+ `factor` field to infer the suggested value.
135
+ `beta_fast` (`float`, *optional*):
136
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
137
+ ramp function. If unspecified, it defaults to 32.
138
+ `beta_slow` (`float`, *optional*):
139
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
140
+ ramp function. If unspecified, it defaults to 1.
141
+ `short_factor` (`List[float]`, *optional*):
142
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
143
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
144
+ size divided by the number of attention heads divided by 2
145
+ `long_factor` (`List[float]`, *optional*):
146
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
147
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
148
+ size divided by the number of attention heads divided by 2
149
+ `low_freq_factor` (`float`, *optional*):
150
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
151
+ `high_freq_factor` (`float`, *optional*):
152
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
153
+
154
+ ```python
155
+ >>> from transformers import KeyeForConditionalGeneration, KeyeConfig
156
+
157
+ >>> # Initializing a Keye style configuration
158
+ >>> configuration = KeyeConfig()
159
+
160
+ >>> # Initializing a model from the Keye-VL-671B-A37B style configuration
161
+ >>> model = KeyeForConditionalGeneration(configuration)
162
+
163
+ >>> # Accessing the model configuration
164
+ >>> configuration = model.config
165
+ ```"""
166
+
167
+ model_type = "deepseek_r1"
168
+ sub_configs = {"vision_config": KeyeVisionConfig}
169
+ keys_to_ignore_at_inference = ["past_key_values"]
170
+
171
+ def __init__(
172
+ self,
173
+ vocab_size=129280,
174
+ hidden_size=7168,
175
+ intermediate_size=18432,
176
+ moe_intermediate_size = 2048,
177
+ num_hidden_layers=61,
178
+ num_nextn_predict_layers=1,
179
+ num_attention_heads=128,
180
+ num_key_value_heads=128,
181
+ n_shared_experts = 1,
182
+ n_routed_experts = 256,
183
+ ep_size = 1,
184
+ routed_scaling_factor = 2.5,
185
+ kv_lora_rank = 512,
186
+ q_lora_rank = 1536,
187
+ qk_rope_head_dim = 64,
188
+ v_head_dim = 128,
189
+ qk_nope_head_dim = 128,
190
+ topk_method = 'noaux_tc',
191
+ n_group = 8,
192
+ topk_group = 4,
193
+ num_experts_per_tok = 8,
194
+ moe_layer_freq = 1,
195
+ first_k_dense_replace = 3,
196
+ norm_topk_prob = True,
197
+ scoring_func = 'sigmoid',
198
+ hidden_act="silu",
199
+ max_position_embeddings=4096,
200
+ initializer_range=0.02,
201
+ rms_norm_eps=1e-6,
202
+ use_cache=True,
203
+ pad_token_id=None,
204
+ bos_token_id=0,
205
+ eos_token_id=1,
206
+ tie_word_embeddings=False,
207
+ rope_theta=10000.0,
208
+ rope_scaling=None,
209
+ attention_bias=False,
210
+ attention_dropout=0.0,
211
+ vision_config=None,
212
+ **kwargs,
213
+ ):
214
+ if isinstance(vision_config, dict):
215
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
216
+ elif vision_config is None:
217
+ self.vision_config = self.sub_configs["vision_config"]()
218
+
219
+ self.vocab_size = vocab_size
220
+ self.max_position_embeddings = max_position_embeddings
221
+ self.hidden_size = hidden_size
222
+ self.intermediate_size = intermediate_size
223
+ self.moe_intermediate_size = moe_intermediate_size
224
+ self.num_hidden_layers = num_hidden_layers
225
+ self.num_nextn_predict_layers = num_nextn_predict_layers
226
+ self.num_attention_heads = num_attention_heads
227
+ self.n_shared_experts = n_shared_experts
228
+ self.n_routed_experts = n_routed_experts
229
+ self.ep_size = ep_size
230
+ self.routed_scaling_factor = routed_scaling_factor
231
+ self.kv_lora_rank = kv_lora_rank
232
+ self.q_lora_rank = q_lora_rank
233
+ self.qk_rope_head_dim = qk_rope_head_dim
234
+ self.v_head_dim = v_head_dim
235
+ self.qk_nope_head_dim = qk_nope_head_dim
236
+ self.topk_method = topk_method
237
+ self.n_group = n_group
238
+ self.topk_group = topk_group
239
+ self.num_experts_per_tok = num_experts_per_tok
240
+ self.moe_layer_freq = moe_layer_freq
241
+ self.first_k_dense_replace = first_k_dense_replace
242
+ self.norm_topk_prob = norm_topk_prob
243
+ self.scoring_func = scoring_func
244
+ # for backward compatibility
245
+ if num_key_value_heads is None:
246
+ num_key_value_heads = num_attention_heads
247
+
248
+ self.num_key_value_heads = num_key_value_heads
249
+ self.hidden_act = hidden_act
250
+ self.initializer_range = initializer_range
251
+ self.rms_norm_eps = rms_norm_eps
252
+ self.use_cache = use_cache
253
+ self.rope_theta = rope_theta
254
+ self.rope_scaling = rope_scaling
255
+ self.attention_bias = attention_bias
256
+ self.attention_dropout = attention_dropout
257
+
258
+ super().__init__(
259
+ pad_token_id=pad_token_id,
260
+ bos_token_id=bos_token_id,
261
+ eos_token_id=eos_token_id,
262
+ tie_word_embeddings=tie_word_embeddings,
263
+ **kwargs,
264
+ )
265
+
266
+ __all__ = ["DeepseekR1Config"]
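A minimal sketch of using the two classes above directly, assuming this file is importable as `configuration_deepseek`: a plain dict passed as `vision_config` is promoted to a `KeyeVisionConfig` sub-config, and everything else falls back to the signature defaults:

```python
from configuration_deepseek import DeepseekR1Config, KeyeVisionConfig

# A dict under vision_config is converted via sub_configs["vision_config"].
config = DeepseekR1Config(
    vision_config={"hidden_size": 1152, "num_hidden_layers": 27, "patch_size": 14},
    rope_scaling={"type": "yarn", "factor": 40, "original_max_position_embeddings": 4096},
)

assert isinstance(config.vision_config, KeyeVisionConfig)
print(config.kv_lora_rank, config.q_lora_rank)              # 512 1536 (MLA low-rank projections)
print(config.n_routed_experts, config.num_experts_per_tok)  # 256 8
```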
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "temperature": 0.6,
+ "top_k": 20,
+ "top_p": 0.95,
+ "transformers_version": "4.56.2"
+ }
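These defaults enable sampling at temperature 0.6 with top-k 20 and top-p 0.95, and `generate()` picks them up automatically when the file sits next to the weights. A minimal sketch of building the equivalent object by hand, with the token ids copied from the JSON above:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    bos_token_id=151643,
    eos_token_id=[151645, 151643],
    pad_token_id=151643,
)

# Typically passed as model.generate(**inputs, generation_config=gen_config)
print(gen_config.temperature, gen_config.top_p)  # 0.6 0.95
```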
image_processing_keye.py ADDED
@@ -0,0 +1,541 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Keye team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Keye-VL-671B-A37B."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+ from PIL import Image
25
+
26
+ import numpy as np
27
+ import torch
28
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
29
+ from torchvision.transforms import functional as TF
30
+ from transformers.image_transforms import (
31
+ convert_to_rgb,
32
+ resize,
33
+ to_channel_dimension_format,
34
+ )
35
+ from transformers.image_utils import (
36
+ OPENAI_CLIP_MEAN,
37
+ OPENAI_CLIP_STD,
38
+ ChannelDimension,
39
+ PILImageResampling,
40
+ get_image_size,
41
+ infer_channel_dimension_format,
42
+ is_scaled_image,
43
+ is_valid_image,
44
+ make_list_of_images,
45
+ to_numpy_array,
46
+ valid_images,
47
+ validate_preprocess_arguments,
48
+ )
49
+ from transformers.utils import TensorType, is_vision_available, logging
50
+
52
+
53
+ ImageInput = Union[
54
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"]
55
+ ] # noqa
56
+
57
+
58
+ VideoInput = Union[
59
+ list["PIL.Image.Image"],
60
+ "np.ndarray",
61
+ "torch.Tensor",
62
+ list["np.ndarray"],
63
+ list["torch.Tensor"],
64
+ list[list["PIL.Image.Image"]],
65
+ list[list["np.ndarrray"]],
66
+ list[list["torch.Tensor"]],
67
+ ] # noqa
68
+
69
+ logger = logging.get_logger(__name__)
70
+
71
+
72
+ if is_vision_available():
73
+ from PIL import Image
74
+
75
+
76
+ def make_batched_images(images) -> List[List[ImageInput]]:
77
+ """
78
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
79
+
80
+ Args:
81
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
82
+ The input image.
83
+
84
+ Returns:
85
+ list: A list of images.
86
+ """
87
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
88
+ return [img for img_list in images for img in img_list]
89
+
90
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
91
+ return images
92
+
93
+ elif is_valid_image(images):
94
+ return [images]
95
+
96
+ raise ValueError(f"Could not make batched images from {images}")
97
+
98
+
99
+ def adjust_size(size, patch_size):
100
+ num_patches = size // patch_size
101
+ if num_patches % 2 != 0:  # if odd, subtract 1 to keep the patch count even
102
+ num_patches -= 1
103
+ return num_patches * patch_size
104
+
105
+
106
+ def make_batched_videos(videos) -> List[VideoInput]:
107
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
108
+ return videos
109
+
110
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
111
+ if isinstance(videos[0], Image.Image):
112
+ return [videos]
113
+ elif len(videos[0].shape) == 4:
114
+ return [list(video) for video in videos]
115
+
116
+ elif is_valid_image(videos) and len(videos.shape) == 4:
117
+ return [list(videos)]
118
+
119
+ raise ValueError(f"Could not make batched video from {videos}")
120
+
121
+
122
+ def smart_resize(
123
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4096
124
+ ):
125
+ """Rescales the image so that the following conditions are met:
126
+
127
+ 1. Both dimensions (height and width) are divisible by 'factor'.
128
+
129
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
130
+
131
+ 3. The aspect ratio of the image is maintained as closely as possible.
132
+
133
+ """
134
+ #if height < factor or width < factor:
135
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
136
+ # if int(height < factor//4) + int(width < factor//4):
137
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
138
+
139
+ if height < factor:
140
+ print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
141
+ width = round((width * factor) / height)
142
+ height = factor
143
+
144
+ if width < factor:
145
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
146
+ height = round((height * factor) / width)
147
+ width = factor
148
+
149
+ if max(height, width) / min(height, width) > 200:
150
+ raise ValueError(
151
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
152
+ )
153
+ h_bar = round(height / factor) * factor
154
+ w_bar = round(width / factor) * factor
155
+ if h_bar * w_bar > max_pixels:
156
+ beta = math.sqrt((height * width) / max_pixels)
157
+ h_bar = math.floor(height / beta / factor) * factor
158
+ w_bar = math.floor(width / beta / factor) * factor
159
+ elif h_bar * w_bar < min_pixels:
160
+ beta = math.sqrt(min_pixels / (height * width))
161
+ h_bar = math.ceil(height * beta / factor) * factor
162
+ w_bar = math.ceil(width * beta / factor) * factor
163
+ return h_bar, w_bar
164
+
165
+
166
+ class SiglipImageProcessor(BaseImageProcessor):
167
+ r"""
168
+ Constructs a Keye-VL-671B-A37B image processor that dynamically resizes images based on the original images.
169
+
170
+ Args:
171
+ do_resize (`bool`, *optional*, defaults to `True`):
172
+ Whether to resize the image's (height, width) dimensions.
173
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
174
+ Resampling filter to use when resizing the image.
175
+ do_rescale (`bool`, *optional*, defaults to `True`):
176
+ Whether to rescale the image by the specified scale `rescale_factor`.
177
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
178
+ Scale factor to use if rescaling the image.
179
+ do_normalize (`bool`, *optional*, defaults to `True`):
180
+ Whether to normalize the image.
181
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
182
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
183
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
184
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
185
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
186
+ Whether to convert the image to RGB.
187
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
188
+ The min pixels of the image to resize the image.
189
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
190
+ The max pixels of the image to resize the image.
191
+ patch_size (`int`, *optional*, defaults to 14):
192
+ The spatial patch size of the vision encoder.
193
+ temporal_patch_size (`int`, *optional*, defaults to 2):
194
+ The temporal patch size of the vision encoder.
195
+ merge_size (`int`, *optional*, defaults to 2):
196
+ The merge size of the vision encoder to llm encoder.
197
+ """
198
+
199
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
200
+
201
+ def __init__(
202
+ self,
203
+ do_resize: bool = True,
204
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
205
+ do_rescale: bool = True,
206
+ rescale_factor: Union[int, float] = 1 / 255,
207
+ do_normalize: bool = True,
208
+ image_mean: Optional[Union[float, List[float]]] = None,
209
+ image_std: Optional[Union[float, List[float]]] = None,
210
+ do_convert_rgb: bool = True,
211
+ min_pixels: int = 56 * 56,
212
+ max_pixels: int = 28 * 28 * 1280,
213
+ patch_size: int = 14,
214
+ temporal_patch_size: int = 1,
215
+ merge_size: int = 2,
216
+ **kwargs,
217
+ ) -> None:
218
+ super().__init__(**kwargs)
219
+ self.do_resize = do_resize
220
+ self.resample = resample
221
+ self.do_rescale = do_rescale
222
+ self.rescale_factor = rescale_factor
223
+ self.do_normalize = do_normalize
224
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
225
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
226
+ self.min_pixels = min_pixels
227
+ self.max_pixels = max_pixels
228
+ self.patch_size = patch_size
229
+ self.temporal_patch_size = temporal_patch_size
230
+ self.merge_size = merge_size
231
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
232
+ self.do_convert_rgb = do_convert_rgb
233
+
234
+ def mvit_rescale(
235
+ self, image: Image.Image, merge_size: int = 2
236
+ ) -> Image.Image:
237
+ try:
238
+ w, h = image.size
239
+ except Exception:
240
+ raise ValueError(str((type(image), image)))
241
+ patch_size = self.patch_size
242
+
243
+ if (w // patch_size) * (h // patch_size) > self.in_token_limit:
244
+ scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
245
+ new_w, new_h = int(w * scale), int(h * scale)
246
+
247
+ image = image.resize((new_w, new_h), Image.Resampling.BILINEAR)
248
+ if self.pad_input:
249
+ new_w, new_h = image.size
250
+ pad_size_h = merge_size * patch_size
251
+ pad_size_w = merge_size * patch_size
252
+
253
+ pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
254
+ pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
255
+
256
+ image = TF.pad(image, (0, 0, pad_w, pad_h))
257
+ else:
258
+ new_w, new_h = image.size
259
+ new_w = new_w - new_w % patch_size
260
+ new_h = new_h - new_h % patch_size
261
+
262
+ new_w = adjust_size(new_w, patch_size)
263
+ new_h = adjust_size(new_h, patch_size)
264
+
265
+ image = TF.center_crop(image, (new_h, new_w))
266
+
267
+ w, h = image.size
268
+ if w // patch_size >= 512 or h // patch_size >= 512:
269
+ new_h = min(patch_size * 510, h)
270
+ new_w = min(patch_size * 510, w)
271
+ image = TF.center_crop(image, (new_h, new_w))
272
+ #raise ValueError("Exceed pos emb")
273
+ return image
274
+ def _preprocess(
275
+ self,
276
+ images: Union[ImageInput, VideoInput],
277
+ do_resize: bool = None,
278
+ size: Dict[str, int] = None,
279
+ resample: PILImageResampling = None,
280
+ do_rescale: bool = None,
281
+ rescale_factor: float = None,
282
+ do_normalize: bool = None,
283
+ image_mean: Optional[Union[float, List[float]]] = None,
284
+ image_std: Optional[Union[float, List[float]]] = None,
285
+ do_convert_rgb: bool = None,
286
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
287
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
288
+ ):
289
+ """
290
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
291
+
292
+ Args:
293
+ images (`ImageInput`):
294
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
295
+ vision_info (`List[Dict]`, *optional*):
296
+ Optional list of dictionaries containing additional information about vision inputs.
297
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
298
+ Whether to resize the image.
299
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
300
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
301
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
302
+ Whether to rescale the image.
303
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
304
+ Scale factor to use if rescaling the image.
305
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
306
+ Whether to normalize the image.
307
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
308
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
309
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
310
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
311
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
312
+ Whether to convert the image to RGB.
313
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
314
+ The channel dimension format for the output image. Can be one of:
315
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
316
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
317
+ - Unset: Use the channel dimension format of the input image.
318
+ input_data_format (`ChannelDimension` or `str`, *optional*):
319
+ The channel dimension format for the input image. Can be one of:
320
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
321
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
322
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
323
+ """
324
+ images = make_list_of_images(images)
325
+
326
+ if do_convert_rgb:
327
+ images = [convert_to_rgb(image) for image in images]
328
+
329
+ # All transformations expect numpy arrays.
330
+ images = [to_numpy_array(image) for image in images]
331
+
332
+ if is_scaled_image(images[0]) and do_rescale:
333
+ logger.warning_once(
334
+ "It looks like you are trying to rescale already rescaled images. If the input"
335
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
336
+ )
337
+ if input_data_format is None:
338
+ # We assume that all images have the same channel dimension format.
339
+ input_data_format = infer_channel_dimension_format(images[0])
340
+
341
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
342
+ resized_height, resized_width = height, width
343
+ processed_images = []
344
+ for image in images:
345
+ # image = self.mvit_rescale(image, merge_size=self.merge_size)
346
+ if do_resize:
347
+ if size is not None and "height" in size.keys():
348
+ resized_height, resized_width = size["height"], size["width"]
349
+ else:
350
+ resized_height, resized_width = smart_resize(
351
+ height,
352
+ width,
353
+ factor=self.patch_size * self.merge_size,
354
+ min_pixels=self.min_pixels,
355
+ max_pixels=self.max_pixels,
356
+ )
357
+ image = resize(
358
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
359
+ )
360
+
361
+ if do_rescale:
362
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
363
+
364
+ if do_normalize:
365
+ image = self.normalize(
366
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
367
+ )
368
+
369
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
370
+ processed_images.append(image)
371
+
372
+ patches = np.array(processed_images)
373
+ if data_format == ChannelDimension.LAST:
374
+ patches = patches.transpose(0, 3, 1, 2)
375
+ if patches.shape[0] == 1:
376
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
377
+ init_patches = patches
378
+ channel = patches.shape[1]
379
+ grid_t = patches.shape[0] // self.temporal_patch_size
380
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
381
+ patches = patches.reshape(
382
+ grid_t,
383
+ self.temporal_patch_size,
384
+ channel,
385
+ grid_h,
386
+ self.patch_size,
387
+ grid_w,
388
+ self.patch_size,
389
+ )
390
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
391
+ assert self.temporal_patch_size == 1
392
+ flatten_patches = patches.reshape(
393
+ grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
394
+ )
395
+ return flatten_patches, (grid_t, grid_h, grid_w)
396
+
397
+ def preprocess(
398
+ self,
399
+ images: ImageInput,
400
+ videos: VideoInput = None,
401
+ do_resize: bool = None,
402
+ size: Dict[str, int] = None,
403
+ resample: PILImageResampling = None,
404
+ do_rescale: bool = None,
405
+ rescale_factor: float = None,
406
+ do_normalize: bool = None,
407
+ image_mean: Optional[Union[float, List[float]]] = None,
408
+ image_std: Optional[Union[float, List[float]]] = None,
409
+ do_convert_rgb: bool = None,
410
+ return_tensors: Optional[Union[str, TensorType]] = None,
411
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
412
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
413
+ ):
414
+ """
415
+ Args:
416
+ images (`ImageInput`):
417
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
418
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
419
+ videos (`VideoInput`):
420
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
421
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
422
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
423
+ Whether to resize the image.
424
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
425
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
426
+ the longest edge resized to keep the input aspect ratio.
427
+ resample (`int`, *optional*, defaults to `self.resample`):
428
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
429
+ has an effect if `do_resize` is set to `True`.
430
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
431
+ Whether to rescale the image.
432
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
433
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
434
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
435
+ Whether to normalize the image.
436
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
437
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
438
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
439
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
440
+ `True`.
441
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
442
+ Whether to convert the image to RGB.
443
+ return_tensors (`str` or `TensorType`, *optional*):
444
+ The type of tensors to return. Can be one of:
445
+ - Unset: Return a list of `np.ndarray`.
446
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
447
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
448
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
449
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
450
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
451
+ The channel dimension format for the output image. Can be one of:
452
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
453
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
454
+ - Unset: Use the channel dimension format of the input image.
455
+ input_data_format (`ChannelDimension` or `str`, *optional*):
456
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
457
+ from the input image. Can be one of:
458
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
459
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
460
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
461
+
462
+ """
463
+ do_resize = do_resize if do_resize is not None else self.do_resize
464
+ size = size if size is not None else self.size
465
+ resample = resample if resample is not None else self.resample
466
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
467
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
468
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
469
+ image_mean = image_mean if image_mean is not None else self.image_mean
470
+ image_std = image_std if image_std is not None else self.image_std
471
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
472
+
473
+ if images is not None:
474
+ images = make_batched_images(images)
475
+ if videos is not None:
476
+ videos = make_batched_videos(videos)
477
+
478
+ if images is not None and not valid_images(images):
479
+ raise ValueError(
480
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
481
+ "torch.Tensor, tf.Tensor or jax.ndarray."
482
+ )
483
+
484
+ validate_preprocess_arguments(
485
+ rescale_factor=rescale_factor,
486
+ do_normalize=do_normalize,
487
+ image_mean=image_mean,
488
+ image_std=image_std,
489
+ do_resize=do_resize,
490
+ size=size,
491
+ resample=resample,
492
+ )
493
+
494
+ if images is not None:
495
+ pixel_values, vision_grid_thws = [], []
496
+ for image in images:
497
+ patches, image_grid_thw = self._preprocess(
498
+ image,
499
+ do_resize=do_resize,
500
+ size=size,
501
+ resample=resample,
502
+ do_rescale=do_rescale,
503
+ rescale_factor=rescale_factor,
504
+ do_normalize=do_normalize,
505
+ image_mean=image_mean,
506
+ image_std=image_std,
507
+ data_format=data_format,
508
+ do_convert_rgb=do_convert_rgb,
509
+ input_data_format=input_data_format,
510
+ )
511
+ pixel_values.extend(patches)
512
+ vision_grid_thws.append(image_grid_thw)
513
+ pixel_values = np.array(pixel_values)
514
+ vision_grid_thws = np.array(vision_grid_thws)
515
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
516
+
517
+ if videos is not None:
518
+ pixel_values, vision_grid_thws = [], []
519
+ for images in videos:
520
+ patches, video_grid_thw = self._preprocess(
521
+ images,
522
+ do_resize=do_resize,
523
+ size=size,
524
+ resample=resample,
525
+ do_rescale=do_rescale,
526
+ rescale_factor=rescale_factor,
527
+ do_normalize=do_normalize,
528
+ image_mean=image_mean,
529
+ image_std=image_std,
530
+ data_format=data_format,
531
+ do_convert_rgb=do_convert_rgb,
532
+ input_data_format=input_data_format,
533
+ )
534
+ pixel_values.extend(patches)
535
+ vision_grid_thws.append(video_grid_thw)
536
+ pixel_values = np.array(pixel_values)
537
+ vision_grid_thws = np.array(vision_grid_thws)
538
+ data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
539
+
540
+ return BatchFeature(data=data, tensor_type=return_tensors)
541
+
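For orientation, a minimal usage sketch of the image processor defined above, assuming a local copy of this repository at a placeholder path and `trust_remote_code=True` so that the `auto_map` entry in preprocessor_config.json resolves `image_processing_keye.SiglipImageProcessor`; the checkpoint path and image size are illustrative only.

    from PIL import Image
    from transformers import AutoImageProcessor

    image_processor = AutoImageProcessor.from_pretrained(
        "./keye-checkpoint",     # local path to this repository (placeholder)
        trust_remote_code=True,  # needed so auto_map loads image_processing_keye.py
    )

    image = Image.new("RGB", (448, 448), color=(127, 127, 127))
    out = image_processor(images=[image], return_tensors="pt")
    # "pixel_values" holds the flattened patches; "image_grid_thw" holds one (t, h, w) grid per image.
    print(out["pixel_values"].shape, out["image_grid_thw"])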
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f8def83c2332ebb0fb07399dd1ff8c643f3b22966d6f063a0aeb9d921f900be
3
+ size 4992443296
model-00003-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:783fe07320638f175a97b04bda9fa56e2481f156a5e6c3aea4298798ce09e97a
3
+ size 4992525496
model-00004-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8234d82d79b8460750644b74e2bd4102c9bb2fb63b8f9f121761db6bcaca0fd4
3
+ size 4992490496
model-00009-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a96dbf5df71bd81f91df442f518d6334a9d437d2c0766cd61492ca17eeca7e
3
+ size 4992524984
model-00021-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f307fc57b7c118e7e18fc11e9dcbe64f8ddb8057fa6766a96299f9eb40a110fd
3
+ size 4992525976
model-00035-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c308c5e569fce7b6ef03a3aa0f61a4f347f4025013c44b70bbb62de5b34fa00a
3
+ size 4992526088
model-00039-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f04643034c4910206ab5a48e5230ae632e74047e27450af4e15c025c958941b
3
+ size 4992525672
model-00041-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e3d2d9890b89ff587732193a85e58cb1604f14fd7babcaaf72a3141d57765b7
3
+ size 4992491096
model-00046-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ae4b649002abec029c5b223994db661a4d901a530e4d198d7a93042dd705971
3
+ size 4992525728
model-00048-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6edd7cb0d469274bd888e47326be1fe14db5ce58350d7493fc1cbe424ae5af
3
+ size 4992491064
model-00050-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4468368ee604420339ebf4c75abb4317136487eb72906ff81733531b57efbf9a
3
+ size 4992491248
model-00057-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b19c29ccd8b66887135e178c6313730a1e87f3f6f6c2fee5b7d6069ea068275f
3
+ size 4992491192
model-00059-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86aa5f23ec2473a8c12f072ff754d84fd03a567ffb986fd395b9bd352a056a0e
3
+ size 4992491400
model-00061-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15499f621462c19dbf63382a8ff5fd9742dd1a08b2097c58f6ccaa993427c373
3
+ size 4992491616
model-00063-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:976da05bcdd232d06594020f89bafe479433c507e9405bd54bd18feeb92ec266
3
+ size 4992526176
model-00066-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d691e93c460ec6006edac71608c8aa58b23bcb9abc0bd231f46b1a9c2f236c25
3
+ size 4992491344
model-00068-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7ead31a8c46beabe30fd8e6bb602800c406035e92927be166115ebe368499b
3
+ size 4992491552
model-00069-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddae7b8835a480518055609c234d4025ab5fcbb1cbdf6624779028bc0f4767e7
3
+ size 4992525688
model-00073-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11b561e2ab1f9e230d36015face285d6634efdb9d1a404062583640808a618f5
3
+ size 4992491288
model-00077-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeae8aeb9828d640e2fc187058b6e578c4aaa7169df668b077bba525d80a17f1
3
+ size 4992526176
model-00080-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bef40110db347f6a50a167deacb270d08164dcf147da7334380f4fdbb2690ef7
3
+ size 4992491232
model-00083-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c7d28ba65411738740041570940e5766e3bd447305c7460c096cd10f938e18f
3
+ size 4992525800
model-00086-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266fac4e673e340f10b68303ced5db2c2b1c3064bb3f0718920c14c712bea4a0
3
+ size 4992526176
model-00096-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9cf10aa7578e790c8f3cb5d3f7c83b323770ce35bae151b22367e9ae7a2c48c
3
+ size 4992491328
model-00101-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fb974116f89ff15e10d085943f5b6077c90533b3110696cdc9504cd7a1e560
3
+ size 4992491064
model-00102-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3515511abb5b831728be5228adb6effdd6cc2f6a672b5ec3008bafddd8f56cf
3
+ size 4992526176
model-00103-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1939f174dddaad1d9e4418d0c44d750c1eb040bf0e97a007d903727f975cc7f3
3
+ size 4992491272
model-00109-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5019f53bc9a05196da4b8aaf2f3cc6dbc18ce591a883549e648064a5d094b189
3
+ size 4992526176
model-00110-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:710c5dc48b6c949bcbece419791fba03f0b46715f668caf9d5304be07faf0cd6
3
+ size 4992491216
model-00113-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a687b4ef3d52be153e9aaed6e9cf3d7a9e8c098918a6225fa4d2f7b876299683
3
+ size 4992525816
model-00116-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db87372a5a4ed422df2caa91635610020081c91c25c8ddb1930f94b619c1e9b
3
+ size 4992526176
model-00124-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98972af38e95797831bf7b6fa4de8cd6e8cfc72bc73049b3142e3457e1dfd204
3
+ size 4992491104
model-00128-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dda12a925d7c1841eddc3d9874cd1d786e17723b77c05ded47468e194b85767
3
+ size 4992491520
model-00130-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a867da69417bc70179f14a09c7df5652ea348b762cbd84375b9f2e3cab8b7a59
3
+ size 4992526176
model-00131-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cc49078e6c09aaf03eeb896b7a080ab664ae14a021b27ed270e520df5f22134
3
+ size 4992491064
model-00135-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65ae76ac1753eef389e2abdb68c26cddca11f57e5e45049e8cc64258ae9c7b96
3
+ size 3336955648
model-00136-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ad10363f651e6d5923b478aa12c47ccd2daf794c1f7b3a7f0a71a10f95c371
3
+ size 1853358208
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_keye.SiglipImageProcessor",
4
+ "AutoProcessor": "processing_keye.KeyeProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "SiglipImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_pixels": 16056320,
22
+ "merge_size": 2,
23
+ "min_pixels": 3136,
24
+ "patch_size": 14,
25
+ "processor_class": "KeyeProcessor",
26
+ "resample": 2,
27
+ "rescale_factor": 0.00392156862745098,
28
+ "size": {
29
+ "max_pixels": 16056320,
30
+ "min_pixels": 3136
31
+ },
32
+ "temporal_patch_size": 1
33
+ }
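These values suggest a Qwen2-VL-style patching scheme: 14-pixel patches merged 2×2 into one vision token, with the total image area kept between min_pixels and max_pixels. The sketch below estimates the resulting token count under that assumption; the exact rounding used by the resize logic in image_processing_keye.py may differ slightly.

    import math

    PATCH_SIZE = 14        # "patch_size"
    MERGE_SIZE = 2         # "merge_size"
    MIN_PIXELS = 3136      # "min_pixels" (56 * 56)
    MAX_PIXELS = 16056320  # "max_pixels"

    def approx_num_vision_tokens(height: int, width: int) -> int:
        """Approximate token count: clamp the area into [MIN_PIXELS, MAX_PIXELS],
        then count (PATCH_SIZE * MERGE_SIZE)-sized blocks along each side."""
        unit = PATCH_SIZE * MERGE_SIZE  # 28 pixels per merged token
        area = height * width
        scale = 1.0
        if area > MAX_PIXELS:
            scale = math.sqrt(MAX_PIXELS / area)
        elif area < MIN_PIXELS:
            scale = math.sqrt(MIN_PIXELS / area)
        h = max(unit, round(height * scale / unit) * unit)
        w = max(unit, round(width * scale / unit) * unit)
        return (h // unit) * (w // unit)

    print(approx_num_vision_tokens(1080, 1920))  # about 2691 tokens for a 1080p frame under this estimate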
processing_keye.py ADDED
@@ -0,0 +1,494 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from typing import List, Union, Optional
21
+
22
+ from transformers.feature_extraction_utils import BatchFeature
23
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
24
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
25
+ from .image_processing_keye import SiglipImageProcessor
26
+ import torch
27
+ import torch.nn as nn
28
+ import numpy as np
29
+ from itertools import chain
30
+
31
+
32
+
33
+
34
+
35
+ ImageInput = Union[
36
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
37
+ ] # noqa
38
+
39
+
40
+ VideoInput = Union[
41
+ List["PIL.Image.Image"],
42
+ "np.ndarray",
43
+ "torch.Tensor",
44
+ List["np.ndarray"],
45
+ List["torch.Tensor"],
46
+ List[List["PIL.Image.Image"]],
47
+ List[List["np.ndarrray"]],
48
+ List[List["torch.Tensor"]],
49
+ ] # noqa
50
+
51
+ class KeyeVideosProcessorKwargs(VideosKwargs, total=False):
52
+ fps: Optional[Union[List[float], float]]
53
+ # Target resize width for slow frames
54
+ width: Optional[Union[List[int], int]]
55
+ # Target resize height for slow frames
56
+ height: Optional[Union[List[int], int]]
57
+ # Target resize width for fast frames
58
+ fast_width: Optional[Union[List[int], int]]
59
+ # Target resize height for fast frames
60
+ fast_height: Optional[Union[List[int], int]]
61
+ # Timestamp of each frame; length must equal the number of frames
62
+ timestamps: Optional[Union[List[torch.Tensor], torch.Tensor]]
63
+ # Type of each frame: slow = 0, fast = 1
64
+ frame_types: Optional[Union[List[torch.Tensor], torch.Tensor]]
65
+
66
+
67
+ class KeyeProcessorKwargs(ProcessingKwargs, total=False):
68
+ videos_kwargs: KeyeVideosProcessorKwargs
69
+ _defaults = {
70
+ "text_kwargs": {
71
+ "padding": False,
72
+ },
73
+ "videos_kwargs": {"fps": 2.0},
74
+ }
75
+
76
+ def select_slow_fast_frames(frames: torch.Tensor, frame_types: torch.Tensor):
77
+ """
78
+ Split frames into slow and fast groups based on a per-frame type tensor.
79
+
80
+ Args:
81
+ frames (torch.Tensor): A tensor of shape (nframes, c, h, w).
82
+ frame_types (torch.Tensor): An int tensor of shape (nframes,), where 0 marks a slow frame and 1 a fast frame.
83
+
84
+ Returns:
85
+ tuple[torch.Tensor, torch.Tensor]: A tuple containing two tensors:
86
+ - slow_frames: Frames whose type is 0.
87
+ - fast_frames: Frames whose type is 1.
88
+ """
89
+ nframes, _, _, _ = frames.shape
90
+ if frame_types.shape[-1] != nframes:
91
+ raise ValueError("Length of mask must be equal to the number of frames.")
92
+
93
+ mask = (frame_types == 0)
94
+
95
+ slow_frames = frames[mask]
96
+ fast_frames = frames[~mask]
97
+
98
+ return slow_frames, fast_frames
99
+
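+ # Example (illustrative): with frames of shape (4, 3, 224, 224) and
+ # frame_types = torch.tensor([0, 1, 0, 1]), select_slow_fast_frames returns
+ # slow_frames of shape (2, 3, 224, 224) (frames 0 and 2) and
+ # fast_frames of shape (2, 3, 224, 224) (frames 1 and 3).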
100
+ def split_thw(tensor):
101
+ """Split grid_thw in t dimension, the result tensor should like [[1, h, w],...]"""
102
+ repeats = tensor[:, 0]
103
+ new_thw = torch.cat([
104
+ torch.ones(tensor.shape[0], 1, dtype=tensor.dtype,
105
+ device=tensor.device),
106
+ tensor[:, 1:]
107
+ ], dim=1)
108
+ return torch.repeat_interleave(new_thw, repeats, dim=0)
109
+
110
+ def merge_hws(hws):
111
+ """
112
+ Merge consecutive grid entries that share the same (h, w), accumulating their temporal counts.
113
+ """
114
+ merged = []
115
+ last_hw = [-1, -1]
116
+
117
+ for hw in hws:
118
+ # Extend the previous entry when consecutive grids share the same (h, w)
119
+ if hw[1:] == last_hw:
120
+ merged[-1][0] += 1
121
+ else:
122
+ merged.append(hw)
123
+ last_hw = hw[1:]
124
+
125
+ return torch.tensor(merged)
126
+
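+ # Illustrative examples: split_thw(torch.tensor([[3, 10, 16]])) expands to
+ # three rows [[1, 10, 16], [1, 10, 16], [1, 10, 16]] (one per frame), and
+ # merge_hws([[1, 10, 16], [1, 10, 16], [1, 8, 8]]) collapses consecutive rows
+ # with equal (h, w) back into tensor([[2, 10, 16], [1, 8, 8]]).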
127
+ class KeyeProcessor(ProcessorMixin):
128
+ r"""
129
+ [`KeyeProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
130
+ [`~KeyeProcessor.__call__`] and [`~KeyeProcessor.decode`] for more information.
131
+ Args:
132
+ image_processor ([`SiglipImageProcessor`], *optional*):
133
+ The image processor is a required input.
134
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
135
+ The tokenizer is a required input.
136
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
137
+ in a chat into a tokenizable string.
138
+ """
139
+
140
+ attributes = ["image_processor", "tokenizer"]
141
+ valid_kwargs = [
142
+ "chat_template","image_std", "min_pixels", "image_mean", "merge_size", "image_processor_type",
143
+ "temporal_patch_size", "patch_size", "max_pixels"
144
+ ]
145
+
146
+ image_processor_class = "AutoImageProcessor"
147
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
148
+
149
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
150
+ self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
151
+ self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
152
+ self.frame_token = "<|frame|>" if not hasattr(tokenizer, "frame_token") else tokenizer.frame_token
153
+ self.fast_video_token = "<|fast_video_pad|>" if not hasattr(tokenizer, "fast_video_token") else tokenizer.fast_video_token
154
+ self.fast_start = "<|fast_start|>" if not hasattr(tokenizer, "fast_start") else tokenizer.fast_start
155
+ self.fast_end = "<|fast_end|>" if not hasattr(tokenizer, "fast_end") else tokenizer.fast_end
156
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
157
+
158
+ self.slowfast = True
159
+
160
+ def __call__(
161
+ self,
162
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
163
+ images: ImageInput = None,
164
+ videos: VideoInput = None,
165
+ **kwargs: Unpack[KeyeProcessorKwargs],
166
+ ) -> BatchFeature:
167
+ """
168
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
169
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
170
+ the text. To prepare the vision inputs, this method forwards the `images`/`videos` and `kwargs` arguments to
171
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `vision_infos` is not `None`.
172
+
173
+ Args:
174
+ text (`str`, `List[str]`, `List[List[str]]`):
175
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
176
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
177
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
178
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
179
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
180
+ tensor. Both channels-first and channels-last formats are supported.
181
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
182
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
183
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
184
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
185
+ If set, will return tensors of a particular framework. Acceptable values are:
186
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
187
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
188
+ - `'np'`: Return NumPy `np.ndarray` objects.
189
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
190
+
191
+ Returns:
192
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
193
+
194
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
195
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
196
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
197
+ `None`).
198
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
199
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
200
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
201
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
202
+ - **num_frames** -- Number of frames per video. Returned when `videos` is not `None`.
203
+ """
204
+ output_kwargs = self._merge_kwargs(
205
+ KeyeProcessorKwargs,
206
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
207
+ **kwargs,
208
+ )
209
+ if images is not None:
210
+ # slow_images = images
211
+ image_inputs = self.image_processor(images=images, return_tensors="pt")
212
+
213
+ image_grid_thw = image_inputs["image_grid_thw"]
214
+ else:
215
+ image_inputs = {}
216
+ image_grid_thw = None
217
+
218
+ num_frames = []
219
+ if videos is not None:
220
+ batch_slow_frames = []
221
+ batch_fast_frames = []
222
+
223
+ videos_kwargs = output_kwargs["videos_kwargs"]
224
+ num_videos = len(videos)
225
+ batch_frame_types = videos_kwargs.get("frame_types", [None] * num_videos)
226
+ batch_timestamps = videos_kwargs.get("timestamps", [None] * num_videos)
227
+ batch_width = videos_kwargs.get("width", [None] * num_videos)
228
+ batch_height = videos_kwargs.get("height", [None] * num_videos)
229
+ batch_fast_width = videos_kwargs.get("fast_width", [None] * num_videos)
230
+ batch_fast_height = videos_kwargs.get("fast_height", [None] * num_videos)
231
+
232
+ for index, frames in enumerate(videos):
233
+ if isinstance(frames, np.ndarray):
234
+ frames = torch.from_numpy(frames.copy())
235
+ nframes = frames.shape[0]
236
+ num_frames.append(nframes)
237
+ assert nframes > 0, "No frames in video"
238
+ if batch_frame_types[index] is None:
239
+ # default to all slow frames
240
+ batch_frame_types[index] = torch.Tensor([0] * nframes)
241
+ frame_types = batch_frame_types[index]
242
+ slow_frames, fast_frames = select_slow_fast_frames(frames, frame_types)
243
+ has_fast_frames = fast_frames.shape[0] > 0
244
+ # resize slow frames
245
+ resized_width = batch_width[index]
246
+ resized_height = batch_height[index]
247
+ if resized_width is not None and resized_height is not None:
248
+ slow_frames = nn.functional.interpolate(
249
+ slow_frames,
250
+ [resized_height, resized_width],
251
+ mode="bilinear",
252
+ antialias=True,
253
+ ).float()
254
+ # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
255
+ # slow_frames = list(slow_frames.split(1, dim=0))  # do not split here; it is done inside the model
256
+ slow_video_inputs = self.image_processor(
257
+ images=None, videos=[slow_frames], **output_kwargs["images_kwargs"])
258
+ slow_video_grid_thw = slow_video_inputs["video_grid_thw"]
259
+ batch_slow_frames.append(slow_video_inputs)
260
+ # # Number of tokens per frame for this video
261
+ # slow_frames_patch_nums[index] = int(slow_video_inputs["pixel_values_videos"].shape[0] / \
262
+ # slow_video_grid_thw.squeeze()[0])
263
+
264
+ if has_fast_frames:
265
+ # TODO: shrink fast_frames
266
+ fast_resized_width = batch_fast_width[index]
267
+ fast_resized_height = batch_fast_height[index]
268
+ if fast_resized_width is not None and fast_resized_height is not None:
269
+ fast_frames = nn.functional.interpolate(
270
+ fast_frames,
271
+ [fast_resized_height, fast_resized_width],
272
+ mode="bilinear",
273
+ antialias=True,
274
+ ).float()
275
+ # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
276
+ # fast_frames = list(fast_frames.split(1, dim=0))
277
+ fast_video_inputs = self.image_processor(
278
+ images=None, videos=[fast_frames], **output_kwargs["images_kwargs"])
279
+ fast_video_grid_thw = fast_video_inputs["video_grid_thw"]
280
+ batch_fast_frames.append(fast_video_inputs)
281
+ # # Total number of tokens for this video
282
+ # fast_frames_token_nums[index] = int(fast_video_inputs["pixel_values_videos"].shape[0] / \
283
+ # fast_video_grid_thw.squeeze()[0])
284
+
285
+ assert len(batch_slow_frames) > 0, "Slow frames should not be empty."
286
+ slow_pixel_values_videos_list = [
287
+ video["pixel_values_videos"] for video in batch_slow_frames if video is not None]
288
+ slow_video_grid_thw_list = [
289
+ video["video_grid_thw"] for video in batch_slow_frames if video is not None]
290
+
291
+ slow_pixel_values_videos = torch.concat(slow_pixel_values_videos_list, dim=0)
292
+ slow_video_grid_thw = torch.concat(slow_video_grid_thw_list, dim=0)
293
+
294
+ if len(batch_fast_frames) > 0:  # any video in the batch contributed fast frames, not just the last one
295
+ fast_pixel_values_videos_list = [
296
+ video["pixel_values_videos"] for video in batch_fast_frames \
297
+ if video is not None]
298
+ fast_video_grid_thw_list = [
299
+ video["video_grid_thw"] for video in batch_fast_frames \
300
+ if video is not None]
301
+
302
+ fast_pixel_values_videos = \
303
+ torch.concat(fast_pixel_values_videos_list, dim=0)
304
+ fast_video_grid_thw = \
305
+ torch.concat(fast_video_grid_thw_list, dim=0)
306
+ else:
307
+ fast_video_grid_thw = None
308
+ else:
309
+ slow_video_grid_thw = None
310
+ fast_video_grid_thw = None
311
+
312
+ if not isinstance(text, list):
313
+ text = [text]
314
+ if image_grid_thw is not None:
315
+ index = 0
316
+ for i in range(len(text)):
317
+ while self.image_token in text[i]:
318
+ # image_place_holder_tempale = "<|placeholder|>" * (
319
+ # image_grid_thw[index].prod() // self.image_processor.merge_size ** 2)
320
+ image_place_holder_tempale = ""
321
+ _, h_merged, w_merged = image_grid_thw[index]// self.image_processor.merge_size
322
+ for i_h in range(h_merged.item()):
323
+ image_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
324
+
325
+ text[i] = text[i].replace(
326
+ self.image_token,
327
+ image_place_holder_tempale,
328
+ 1,
329
+ )
330
+ index += 1
331
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
332
+ pixel_values_videos = []
333
+ video_grid_thw = []
334
+ videos_inputs = {}
335
+ if slow_video_grid_thw is not None:
336
+ slow_video_grid_thw = split_thw(slow_video_grid_thw)
337
+ if fast_video_grid_thw is not None:
338
+ fast_video_grid_thw = split_thw(fast_video_grid_thw)
339
+ index = 0
340
+ slow_index = 0
341
+ fast_index = 0
342
+ slow_pixels_index = 0
343
+ fast_pixels_index = 0
344
+ for i in range(len(text)):
345
+ while self.video_token in text[i]:
346
+ video_place_holder_tempale = ""
347
+
348
+ for j in range(batch_frame_types[index].shape[-1]):
349
+ if batch_timestamps[index] is not None:  # timestamps were provided
350
+ video_place_holder_tempale += self.frame_token + format(batch_timestamps[index][j], ".1f")
351
+ else:
352
+ video_place_holder_tempale += self.frame_token
353
+
354
+ # The current frame is a slow frame
355
+ if batch_frame_types[index][j] == 0:
356
+ num_patches = int(slow_video_grid_thw[slow_index].prod())
357
+
358
+ # video_place_holder_tempale += "<|placeholder|>" * (
359
+ # num_patches // self.image_processor.merge_size ** 2)
360
+
361
+ _, h_merged, w_merged = slow_video_grid_thw[slow_index]// self.image_processor.merge_size
362
+ for i_h in range(h_merged.item()):
363
+ video_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
364
+
365
+ pixel_values_videos.append(
366
+ slow_pixel_values_videos[slow_pixels_index:slow_pixels_index + num_patches])
367
+ slow_pixels_index = slow_pixels_index + num_patches
368
+ video_grid_thw.append(slow_video_grid_thw[slow_index].tolist())
369
+ slow_index += 1
370
+
371
+ # The current frame is a fast frame
372
+ elif batch_frame_types[index][j] == 1:
373
+ num_patches = int(fast_video_grid_thw[fast_index].prod())
374
+
375
+ # video_place_holder_tempale += self.fast_start + "<|placeholder|>" * (
376
+ # num_patches // self.image_processor.merge_size ** 2) + \
377
+ # self.fast_end
378
+
379
+ _, h_merged, w_merged = fast_video_grid_thw[fast_index] // self.image_processor.merge_size
380
+ video_place_holder_tempale += self.fast_start
381
+
382
+ for i_h in range(h_merged.item()):
383
+ video_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
384
+
385
+ video_place_holder_tempale += self.fast_end
386
+
387
+ pixel_values_videos.append(
388
+ fast_pixel_values_videos[fast_pixels_index:fast_pixels_index + num_patches])
389
+ fast_pixels_index = fast_pixels_index + num_patches
390
+ video_grid_thw.append(fast_video_grid_thw[fast_index].tolist())
391
+ fast_index += 1
392
+ text[i] = text[i].replace(
393
+ self.video_token,
394
+ video_place_holder_tempale,
395
+ 1,
396
+ )
397
+ index += 1
398
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
399
+
400
+ videos_inputs["pixel_values_videos"] = torch.cat(pixel_values_videos, dim=0)
401
+ videos_inputs["video_grid_thw"] = merge_hws(video_grid_thw)
402
+ videos_inputs["num_frames"] = torch.tensor(num_frames)
403
+
404
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
405
+
406
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
407
+
408
+ def batch_decode(self, *args, **kwargs):
409
+ """
410
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
411
+ refer to the docstring of this method for more information.
412
+ """
413
+ return self.tokenizer.batch_decode(*args, **kwargs)
414
+
415
+ def decode(self, *args, **kwargs):
416
+ """
417
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
418
+ the docstring of this method for more information.
419
+ """
420
+ return self.tokenizer.decode(*args, **kwargs)
421
+
422
+ def post_process_image_text_to_text(
423
+ self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
424
+ ):
425
+ """
426
+ Post-process the output of the model to decode the text.
427
+
428
+ Args:
429
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
430
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
431
+ or `(sequence_length,)`.
432
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
433
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
434
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
435
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
436
+ **kwargs:
437
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
438
+
439
+ Returns:
440
+ `List[str]`: The decoded text.
441
+ """
442
+ return self.tokenizer.batch_decode(
443
+ generated_outputs,
444
+ skip_special_tokens=skip_special_tokens,
445
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
446
+ **kwargs,
447
+ )
448
+
449
+ @property
450
+ def model_input_names(self):
451
+ tokenizer_input_names = self.tokenizer.model_input_names
452
+ image_processor_input_names = self.image_processor.model_input_names
453
+ names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
454
+ return names_from_processor
455
+
456
+
457
+ def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True, thinking='auto', **kwargs):
458
+ """
459
+ Convert a list of chat messages into the model input format, optionally inserting thinking markers.
460
+
461
+ Args:
462
+ messages (list): List of chat messages; each message is a dict containing "role" and "content".
463
+ tokenize (bool): Whether to tokenize the result. Defaults to True.
464
+ add_generation_prompt (bool): Whether to append a generation prompt. Defaults to True.
465
+ thinking (str or bool): Whether to enable thinking markers ('auto', True, or False). Defaults to 'auto'.
466
+
467
+ Returns:
468
+ If tokenize=True, the tokenized inputs; otherwise the formatted text string.
469
+ """
470
+ # Use the parent class's apply_chat_template to build the base prompt
471
+ formatted_text = super().apply_chat_template(
472
+ messages,
473
+ tokenize=False,
474
+ add_generation_prompt=add_generation_prompt,
475
+ thinking=thinking,
476
+ **kwargs
477
+ )
478
+
479
+ # if enable_thinking == 'auto':
480
+ # pass
481
+ # elif enable_thinking == True:
482
+ # formatted_text += "<think>"
483
+ # elif enable_thinking == False:
484
+ # formatted_text += f"</think>"
485
+ # else:
486
+ # raise RuntimeError(f"Bad enable_thinking={enable_thinking}")
487
+ # Return the tokenized inputs if requested; otherwise return the formatted text
488
+ # print(formatted_text)
489
+ return self.tokenizer(formatted_text, **kwargs) if tokenize else formatted_text
490
+
491
+
492
+
493
+ __all__ = ["KeyeProcessor"]
494
+
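Putting the pieces together, a minimal usage sketch of KeyeProcessor as defined above; the checkpoint path and the dummy image are placeholders, and `trust_remote_code=True` is assumed so the `auto_map` entries in the config files resolve the custom processor and image processor classes.

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(
        "./keye-checkpoint",     # local path to this repository (placeholder)
        trust_remote_code=True,
    )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    image = Image.new("RGB", (448, 448), color=(200, 30, 30))
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    # Expected keys: input_ids, attention_mask, pixel_values, image_grid_thw
    print(list(inputs.keys()))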
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_keye.KeyeProcessor"
4
+ },
5
+ "processor_class": "KeyeProcessor"
6
+ }
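For the slow/fast video path, a sketch of how the optional kwargs declared in KeyeVideosProcessorKwargs could be supplied, reusing the `processor` object from the sketch above; the frame count, sizes, timestamps, and frame types are made up for illustration and are not prescribed by this repository.

    import torch

    # Eight synthetic frames in (N, C, H, W) layout, pixel values in [0, 255].
    video = (torch.rand(8, 3, 336, 336) * 255).float()

    videos_kwargs = {
        "frame_types": [torch.tensor([0, 1, 0, 1, 0, 1, 0, 1])],     # slow = 0, fast = 1
        "timestamps": [torch.arange(8, dtype=torch.float32) / 2.0],  # seconds, one per frame
        "fast_height": [112],                                        # fast frames are shrunk before patching
        "fast_width": [112],
    }

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": "Summarize the clip."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], videos=[video], videos_kwargs=videos_kwargs, return_tensors="pt")
    # Expected keys include pixel_values_videos, video_grid_thw, and num_frames.
    print(list(inputs.keys()))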
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff