Kwai-Keye committed on
Commit f88c57d · verified · 1 Parent(s): af167f5

Add files using upload-large-folder tool

Files changed (50)
  1. chat_template.jinja +5 -0
  2. config.json +270 -0
  3. configuration_deepseek.py +266 -0
  4. generation_config.json +13 -0
  5. image_processing_keye.py +541 -0
  6. merges.txt +0 -0
  7. model-00001-of-00136.safetensors +3 -0
  8. model-00003-of-00136.safetensors +3 -0
  9. model-00004-of-00136.safetensors +3 -0
  10. model-00009-of-00136.safetensors +3 -0
  11. model-00021-of-00136.safetensors +3 -0
  12. model-00035-of-00136.safetensors +3 -0
  13. model-00039-of-00136.safetensors +3 -0
  14. model-00041-of-00136.safetensors +3 -0
  15. model-00046-of-00136.safetensors +3 -0
  16. model-00048-of-00136.safetensors +3 -0
  17. model-00050-of-00136.safetensors +3 -0
  18. model-00057-of-00136.safetensors +3 -0
  19. model-00059-of-00136.safetensors +3 -0
  20. model-00061-of-00136.safetensors +3 -0
  21. model-00063-of-00136.safetensors +3 -0
  22. model-00066-of-00136.safetensors +3 -0
  23. model-00068-of-00136.safetensors +3 -0
  24. model-00069-of-00136.safetensors +3 -0
  25. model-00073-of-00136.safetensors +3 -0
  26. model-00077-of-00136.safetensors +3 -0
  27. model-00080-of-00136.safetensors +3 -0
  28. model-00083-of-00136.safetensors +3 -0
  29. model-00086-of-00136.safetensors +3 -0
  30. model-00096-of-00136.safetensors +3 -0
  31. model-00101-of-00136.safetensors +3 -0
  32. model-00102-of-00136.safetensors +3 -0
  33. model-00103-of-00136.safetensors +3 -0
  34. model-00109-of-00136.safetensors +3 -0
  35. model-00110-of-00136.safetensors +3 -0
  36. model-00113-of-00136.safetensors +3 -0
  37. model-00116-of-00136.safetensors +3 -0
  38. model-00124-of-00136.safetensors +3 -0
  39. model-00128-of-00136.safetensors +3 -0
  40. model-00130-of-00136.safetensors +3 -0
  41. model-00131-of-00136.safetensors +3 -0
  42. model-00135-of-00136.safetensors +3 -0
  43. model-00136-of-00136.safetensors +3 -0
  44. model.safetensors.index.json +0 -0
  45. preprocessor_config.json +33 -0
  46. processing_keye.py +494 -0
  47. processor_config.json +6 -0
  48. tokenizer.json +0 -0
  49. tokenizer_config.json +0 -0
  50. vocab.json +0 -0
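The 50 files above cover 37 of the model's 136 safetensors shards (roughly 5 GB each, per the LFS pointers further down) plus the configuration, processor, tokenizer, and custom-code files. A minimal sketch, assuming the upload is published under a repo id like `Kwai-Keye/Keye-VL-671B-A37B`, of fetching only the lightweight files with `huggingface_hub`:

```python
# Hypothetical repo id; allow_patterns keeps the ~5 GB weight shards out of the download.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="Kwai-Keye/Keye-VL-671B-A37B",                  # assumption, not confirmed by this page
    allow_patterns=["*.json", "*.py", "*.jinja", "*.txt"],   # configs, custom code, tokenizer files only
)
print(local_dir)
```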
chat_template.jinja ADDED
@@ -0,0 +1,5 @@
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% if not add_vision_id is defined %}{% set add_vision_id = true %}{% endif %}{% if not thinking is defined %}{% set thinking = 'auto' %}{% endif %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set ns = namespace(is_first_tool_call=true, is_tool_output=false, system_prompt='', is_first_sp=true, is_last_user=false) %}{%- for message in messages -%}{%- if message['role'] == 'system' -%}{%- if not ns.is_first_sp -%}{% set ns.system_prompt = ns.system_prompt + '
+
+ ' %}{% endif -%}{%- set sys_content = message['content'] -%}{%- if sys_content is string -%}{%- set sys_text = sys_content -%}{%- elif sys_content is iterable -%}{%- set sys_text = '' -%}{%- for part in sys_content -%}{%- if part is mapping and part.get('type') == 'text' -%}{%- set sys_text = sys_text + part.get('text', '') -%}{%- endif -%}{%- endfor -%}{%- else -%}{%- set sys_text = '' -%}{%- endif -%}{%- set ns.system_prompt = ns.system_prompt + sys_text -%}{%- set ns.is_first_sp = false -%}{%- endif -%}{%- endfor -%}{% if ns.system_prompt == '' %}{% set ns.system_prompt = '' %}{% endif %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages -%}{%- set role = message['role'] -%}{%- set content = message.get('content') -%}{%- if content is string -%}{%- set content_parts = [{'type': 'text', 'text': content}] -%}{%- elif content is none -%}{%- set content_parts = [] -%}{%- else -%}{%- set content_parts = content -%}{%- endif -%}{%- set text_content = namespace(value='') -%}{%- set vision_content = namespace(value='') -%}{%- for part in content_parts -%}{%- if part.type == 'text' -%}{%- set text_content.value = text_content.value + part.text -%}{%- elif part.type == 'image' or 'image_url' in part -%}{%- set image_count.value = image_count.value + 1 -%}{%- if add_vision_id -%}{%- set vision_content.value = vision_content.value + '' -%}{%- endif -%}{%- set vision_content.value = vision_content.value + '<|vision_start|><|image_pad|><|vision_end|>' -%}{%- elif part.type == 'video' or 'video_url' in part -%}{%- set video_count.value = video_count.value + 1 -%}{%- if add_vision_id -%}{%- set vision_content.value = vision_content.value + '' -%}{%- endif -%}{%- set vision_content.value = vision_content.value + '<|vision_start|><|video_pad|><|vision_end|>' -%}{%- endif -%}{%- endfor -%}{%- if role == 'user' -%}{%- set ns.is_tool_output = false -%}{%- set ns.is_last_user = true -%}<|User|>{{ vision_content.value }}{{ text_content.value }}<|Assistant|>{%- if thinking == 'True' or thinking == True -%}{{ '<think>' }}{%- elif thinking == 'False' or thinking == False -%}{{ '</think>' }}{%- else -%}{{ '' }}{%- endif -%}{%- elif role == 'assistant' -%}{%- set ns.is_last_user = false -%}{%- if ns.is_tool_output -%}<|tool▁outputs▁end|>{% endif -%}{%- set ns.is_tool_output = false -%}{{ vision_content.value }}{{ text_content.value }}{%- if message.get('tool_calls') -%}<|tool▁calls▁begin|>{%- set ns.is_first_tool_call = true -%}{%- for tool in message['tool_calls'] -%}{%- if not ns.is_first_tool_call %}{{'
+ '}}{% endif -%}<|tool▁call▁begin|>{{ tool['type'] }}<|tool▁sep|>{{ tool['function']['name'] }}json{{ tool['function']['arguments'] }}<|tool▁call▁end|>{%- set ns.is_first_tool_call = false -%}{%- endfor -%}<|tool▁calls▁end|><|end▁of▁sentence|>{%- else -%}<|end▁of▁sentence|>{%- endif -%}{%- elif role == 'tool' -%}{%- set ns.is_last_user = false -%}{%- if not ns.is_tool_output -%}<|tool▁outputs▁begin|>{% set ns.is_tool_output = true %}{% else %}{{ '
+ ' }}{% endif -%}<|tool▁output▁begin|>{{ content }}<|tool▁output▁end|>{%- endif -%}{%- endfor -%}{%- if ns.is_tool_output -%}<|tool▁outputs▁end|>{% endif -%}{%- if add_generation_prompt and not ns.is_last_user and not ns.is_tool_output -%}<|Assistant|>{%- if thinking == 'True' or thinking == True -%}{{ '<think>' }}{%- elif thinking == 'False' or thinking == False -%}{{ '</think>' }}{%- else -%}{{ '' }}{%- endif -%}{% endif -%}
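The template above concatenates system prompts, expands image and video parts into `<|vision_start|>…<|vision_end|>` placeholders, emits DeepSeek-style `<|tool▁…|>` markers for tool calls and outputs, and opens or closes a `<think>` block depending on a `thinking` variable (`'auto'` by default). A minimal rendering sketch; the repo id is an assumption, and it relies on `apply_chat_template` forwarding extra keyword arguments such as `thinking` into the Jinja context:

```python
from transformers import AutoTokenizer

# Assumed repo id; trust_remote_code because the repo ships custom configuration/processing code.
tokenizer = AutoTokenizer.from_pretrained("Kwai-Keye/Keye-VL-671B-A37B", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "image"},  # rendered as <|vision_start|><|image_pad|><|vision_end|>
            {"type": "text", "text": "Describe this image."},
        ],
    },
]

# thinking=True makes the template append '<think>' after <|Assistant|>.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, thinking=True
)
print(prompt)
```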
config.json ADDED
@@ -0,0 +1,270 @@
1
+ {
2
+ "architectures": [
3
+ "KeyeVLMoeForConditionalGeneration"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_deepseek.DeepseekR1Config"
9
+ },
10
+ "bos_token_id": 0,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 1,
13
+ "ep_size": 1,
14
+ "fast_video_token_id": 128021,
15
+ "first_k_dense_replace": 3,
16
+ "head_dim": 128,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 7168,
19
+ "image_token_id": 128010,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 18432,
22
+ "kv_lora_rank": 512,
23
+ "max_position_embeddings": 163840,
24
+ "model_type": "deepseek_r1",
25
+ "moe_intermediate_size": 2048,
26
+ "moe_layer_freq": 1,
27
+ "n_group": 8,
28
+ "n_routed_experts": 256,
29
+ "n_shared_experts": 1,
30
+ "norm_topk_prob": true,
31
+ "num_attention_heads": 128,
32
+ "num_experts_per_tok": 8,
33
+ "num_hidden_layers": 61,
34
+ "num_key_value_heads": 128,
35
+ "num_nextn_predict_layers": 1,
36
+ "q_lora_rank": 1536,
37
+ "qk_nope_head_dim": 128,
38
+ "qk_rope_head_dim": 64,
39
+ "quantization_config": {
40
+ "activation_scheme": "dynamic",
41
+ "fmt": "e4m3",
42
+ "quant_method": "fp8",
43
+ "ignored_layers": [
44
+ "mlp_AR.linear_1",
45
+ "mlp_AR.linear_2",
46
+ "visual.vision_model.encoder.layers.0.self_attn.k_proj",
47
+ "visual.vision_model.encoder.layers.0.self_attn.v_proj",
48
+ "visual.vision_model.encoder.layers.0.self_attn.q_proj",
49
+ "visual.vision_model.encoder.layers.0.self_attn.out_proj",
50
+ "visual.vision_model.encoder.layers.0.mlp.fc1",
51
+ "visual.vision_model.encoder.layers.0.mlp.fc2",
52
+ "visual.vision_model.encoder.layers.1.self_attn.k_proj",
53
+ "visual.vision_model.encoder.layers.1.self_attn.v_proj",
54
+ "visual.vision_model.encoder.layers.1.self_attn.q_proj",
55
+ "visual.vision_model.encoder.layers.1.self_attn.out_proj",
56
+ "visual.vision_model.encoder.layers.1.mlp.fc1",
57
+ "visual.vision_model.encoder.layers.1.mlp.fc2",
58
+ "visual.vision_model.encoder.layers.2.self_attn.k_proj",
59
+ "visual.vision_model.encoder.layers.2.self_attn.v_proj",
60
+ "visual.vision_model.encoder.layers.2.self_attn.q_proj",
61
+ "visual.vision_model.encoder.layers.2.self_attn.out_proj",
62
+ "visual.vision_model.encoder.layers.2.mlp.fc1",
63
+ "visual.vision_model.encoder.layers.2.mlp.fc2",
64
+ "visual.vision_model.encoder.layers.3.self_attn.k_proj",
65
+ "visual.vision_model.encoder.layers.3.self_attn.v_proj",
66
+ "visual.vision_model.encoder.layers.3.self_attn.q_proj",
67
+ "visual.vision_model.encoder.layers.3.self_attn.out_proj",
68
+ "visual.vision_model.encoder.layers.3.mlp.fc1",
69
+ "visual.vision_model.encoder.layers.3.mlp.fc2",
70
+ "visual.vision_model.encoder.layers.4.self_attn.k_proj",
71
+ "visual.vision_model.encoder.layers.4.self_attn.v_proj",
72
+ "visual.vision_model.encoder.layers.4.self_attn.q_proj",
73
+ "visual.vision_model.encoder.layers.4.self_attn.out_proj",
74
+ "visual.vision_model.encoder.layers.4.mlp.fc1",
75
+ "visual.vision_model.encoder.layers.4.mlp.fc2",
76
+ "visual.vision_model.encoder.layers.5.self_attn.k_proj",
77
+ "visual.vision_model.encoder.layers.5.self_attn.v_proj",
78
+ "visual.vision_model.encoder.layers.5.self_attn.q_proj",
79
+ "visual.vision_model.encoder.layers.5.self_attn.out_proj",
80
+ "visual.vision_model.encoder.layers.5.mlp.fc1",
81
+ "visual.vision_model.encoder.layers.5.mlp.fc2",
82
+ "visual.vision_model.encoder.layers.6.self_attn.k_proj",
83
+ "visual.vision_model.encoder.layers.6.self_attn.v_proj",
84
+ "visual.vision_model.encoder.layers.6.self_attn.q_proj",
85
+ "visual.vision_model.encoder.layers.6.self_attn.out_proj",
86
+ "visual.vision_model.encoder.layers.6.mlp.fc1",
87
+ "visual.vision_model.encoder.layers.6.mlp.fc2",
88
+ "visual.vision_model.encoder.layers.7.self_attn.k_proj",
89
+ "visual.vision_model.encoder.layers.7.self_attn.v_proj",
90
+ "visual.vision_model.encoder.layers.7.self_attn.q_proj",
91
+ "visual.vision_model.encoder.layers.7.self_attn.out_proj",
92
+ "visual.vision_model.encoder.layers.7.mlp.fc1",
93
+ "visual.vision_model.encoder.layers.7.mlp.fc2",
94
+ "visual.vision_model.encoder.layers.8.self_attn.k_proj",
95
+ "visual.vision_model.encoder.layers.8.self_attn.v_proj",
96
+ "visual.vision_model.encoder.layers.8.self_attn.q_proj",
97
+ "visual.vision_model.encoder.layers.8.self_attn.out_proj",
98
+ "visual.vision_model.encoder.layers.8.mlp.fc1",
99
+ "visual.vision_model.encoder.layers.8.mlp.fc2",
100
+ "visual.vision_model.encoder.layers.9.self_attn.k_proj",
101
+ "visual.vision_model.encoder.layers.9.self_attn.v_proj",
102
+ "visual.vision_model.encoder.layers.9.self_attn.q_proj",
103
+ "visual.vision_model.encoder.layers.9.self_attn.out_proj",
104
+ "visual.vision_model.encoder.layers.9.mlp.fc1",
105
+ "visual.vision_model.encoder.layers.9.mlp.fc2",
106
+ "visual.vision_model.encoder.layers.10.self_attn.k_proj",
107
+ "visual.vision_model.encoder.layers.10.self_attn.v_proj",
108
+ "visual.vision_model.encoder.layers.10.self_attn.q_proj",
109
+ "visual.vision_model.encoder.layers.10.self_attn.out_proj",
110
+ "visual.vision_model.encoder.layers.10.mlp.fc1",
111
+ "visual.vision_model.encoder.layers.10.mlp.fc2",
112
+ "visual.vision_model.encoder.layers.11.self_attn.k_proj",
113
+ "visual.vision_model.encoder.layers.11.self_attn.v_proj",
114
+ "visual.vision_model.encoder.layers.11.self_attn.q_proj",
115
+ "visual.vision_model.encoder.layers.11.self_attn.out_proj",
116
+ "visual.vision_model.encoder.layers.11.mlp.fc1",
117
+ "visual.vision_model.encoder.layers.11.mlp.fc2",
118
+ "visual.vision_model.encoder.layers.12.self_attn.k_proj",
119
+ "visual.vision_model.encoder.layers.12.self_attn.v_proj",
120
+ "visual.vision_model.encoder.layers.12.self_attn.q_proj",
121
+ "visual.vision_model.encoder.layers.12.self_attn.out_proj",
122
+ "visual.vision_model.encoder.layers.12.mlp.fc1",
123
+ "visual.vision_model.encoder.layers.12.mlp.fc2",
124
+ "visual.vision_model.encoder.layers.13.self_attn.k_proj",
125
+ "visual.vision_model.encoder.layers.13.self_attn.v_proj",
126
+ "visual.vision_model.encoder.layers.13.self_attn.q_proj",
127
+ "visual.vision_model.encoder.layers.13.self_attn.out_proj",
128
+ "visual.vision_model.encoder.layers.13.mlp.fc1",
129
+ "visual.vision_model.encoder.layers.13.mlp.fc2",
130
+ "visual.vision_model.encoder.layers.14.self_attn.k_proj",
131
+ "visual.vision_model.encoder.layers.14.self_attn.v_proj",
132
+ "visual.vision_model.encoder.layers.14.self_attn.q_proj",
133
+ "visual.vision_model.encoder.layers.14.self_attn.out_proj",
134
+ "visual.vision_model.encoder.layers.14.mlp.fc1",
135
+ "visual.vision_model.encoder.layers.14.mlp.fc2",
136
+ "visual.vision_model.encoder.layers.15.self_attn.k_proj",
137
+ "visual.vision_model.encoder.layers.15.self_attn.v_proj",
138
+ "visual.vision_model.encoder.layers.15.self_attn.q_proj",
139
+ "visual.vision_model.encoder.layers.15.self_attn.out_proj",
140
+ "visual.vision_model.encoder.layers.15.mlp.fc1",
141
+ "visual.vision_model.encoder.layers.15.mlp.fc2",
142
+ "visual.vision_model.encoder.layers.16.self_attn.k_proj",
143
+ "visual.vision_model.encoder.layers.16.self_attn.v_proj",
144
+ "visual.vision_model.encoder.layers.16.self_attn.q_proj",
145
+ "visual.vision_model.encoder.layers.16.self_attn.out_proj",
146
+ "visual.vision_model.encoder.layers.16.mlp.fc1",
147
+ "visual.vision_model.encoder.layers.16.mlp.fc2",
148
+ "visual.vision_model.encoder.layers.17.self_attn.k_proj",
149
+ "visual.vision_model.encoder.layers.17.self_attn.v_proj",
150
+ "visual.vision_model.encoder.layers.17.self_attn.q_proj",
151
+ "visual.vision_model.encoder.layers.17.self_attn.out_proj",
152
+ "visual.vision_model.encoder.layers.17.mlp.fc1",
153
+ "visual.vision_model.encoder.layers.17.mlp.fc2",
154
+ "visual.vision_model.encoder.layers.18.self_attn.k_proj",
155
+ "visual.vision_model.encoder.layers.18.self_attn.v_proj",
156
+ "visual.vision_model.encoder.layers.18.self_attn.q_proj",
157
+ "visual.vision_model.encoder.layers.18.self_attn.out_proj",
158
+ "visual.vision_model.encoder.layers.18.mlp.fc1",
159
+ "visual.vision_model.encoder.layers.18.mlp.fc2",
160
+ "visual.vision_model.encoder.layers.19.self_attn.k_proj",
161
+ "visual.vision_model.encoder.layers.19.self_attn.v_proj",
162
+ "visual.vision_model.encoder.layers.19.self_attn.q_proj",
163
+ "visual.vision_model.encoder.layers.19.self_attn.out_proj",
164
+ "visual.vision_model.encoder.layers.19.mlp.fc1",
165
+ "visual.vision_model.encoder.layers.19.mlp.fc2",
166
+ "visual.vision_model.encoder.layers.20.self_attn.k_proj",
167
+ "visual.vision_model.encoder.layers.20.self_attn.v_proj",
168
+ "visual.vision_model.encoder.layers.20.self_attn.q_proj",
169
+ "visual.vision_model.encoder.layers.20.self_attn.out_proj",
170
+ "visual.vision_model.encoder.layers.20.mlp.fc1",
171
+ "visual.vision_model.encoder.layers.20.mlp.fc2",
172
+ "visual.vision_model.encoder.layers.21.self_attn.k_proj",
173
+ "visual.vision_model.encoder.layers.21.self_attn.v_proj",
174
+ "visual.vision_model.encoder.layers.21.self_attn.q_proj",
175
+ "visual.vision_model.encoder.layers.21.self_attn.out_proj",
176
+ "visual.vision_model.encoder.layers.21.mlp.fc1",
177
+ "visual.vision_model.encoder.layers.21.mlp.fc2",
178
+ "visual.vision_model.encoder.layers.22.self_attn.k_proj",
179
+ "visual.vision_model.encoder.layers.22.self_attn.v_proj",
180
+ "visual.vision_model.encoder.layers.22.self_attn.q_proj",
181
+ "visual.vision_model.encoder.layers.22.self_attn.out_proj",
182
+ "visual.vision_model.encoder.layers.22.mlp.fc1",
183
+ "visual.vision_model.encoder.layers.22.mlp.fc2",
184
+ "visual.vision_model.encoder.layers.23.self_attn.k_proj",
185
+ "visual.vision_model.encoder.layers.23.self_attn.v_proj",
186
+ "visual.vision_model.encoder.layers.23.self_attn.q_proj",
187
+ "visual.vision_model.encoder.layers.23.self_attn.out_proj",
188
+ "visual.vision_model.encoder.layers.23.mlp.fc1",
189
+ "visual.vision_model.encoder.layers.23.mlp.fc2",
190
+ "visual.vision_model.encoder.layers.24.self_attn.k_proj",
191
+ "visual.vision_model.encoder.layers.24.self_attn.v_proj",
192
+ "visual.vision_model.encoder.layers.24.self_attn.q_proj",
193
+ "visual.vision_model.encoder.layers.24.self_attn.out_proj",
194
+ "visual.vision_model.encoder.layers.24.mlp.fc1",
195
+ "visual.vision_model.encoder.layers.24.mlp.fc2",
196
+ "visual.vision_model.encoder.layers.25.self_attn.k_proj",
197
+ "visual.vision_model.encoder.layers.25.self_attn.v_proj",
198
+ "visual.vision_model.encoder.layers.25.self_attn.q_proj",
199
+ "visual.vision_model.encoder.layers.25.self_attn.out_proj",
200
+ "visual.vision_model.encoder.layers.25.mlp.fc1",
201
+ "visual.vision_model.encoder.layers.25.mlp.fc2",
202
+ "visual.vision_model.encoder.layers.26.self_attn.k_proj",
203
+ "visual.vision_model.encoder.layers.26.self_attn.v_proj",
204
+ "visual.vision_model.encoder.layers.26.self_attn.q_proj",
205
+ "visual.vision_model.encoder.layers.26.self_attn.out_proj",
206
+ "visual.vision_model.encoder.layers.26.mlp.fc1",
207
+ "visual.vision_model.encoder.layers.26.mlp.fc2",
208
+ "lm_head"
209
+ ],
210
+ "weight_block_size": [
211
+ 128,
212
+ 128
213
+ ],
214
+ "modules_to_not_convert": [
215
+ "mlp_AR",
216
+ "visual.vision_model",
217
+ "lm_head"
218
+ ]
219
+ },
220
+ "rms_norm_eps": 1e-06,
221
+ "rope_scaling": {
222
+ "beta_fast": 32,
223
+ "beta_slow": 1,
224
+ "factor": 40,
225
+ "mscale": 1.0,
226
+ "mscale_all_dim": 1.0,
227
+ "original_max_position_embeddings": 4096,
228
+ "type": "yarn"
229
+ },
230
+ "rope_theta": 10000,
231
+ "routed_scaling_factor": 2.5,
232
+ "scoring_func": "sigmoid",
233
+ "tie_word_embeddings": false,
234
+ "topk_group": 4,
235
+ "topk_method": "noaux_tc",
236
+ "transformers_version": "4.56.2",
237
+ "use_cache": true,
238
+ "v_head_dim": 128,
239
+ "video_token_id": 128011,
240
+ "vision_config": {
241
+ "_attn_implementation_autoset": true,
242
+ "architectures": [
243
+ "SiglipVisionModel"
244
+ ],
245
+ "attention_dropout": 0.0,
246
+ "auto_map": {
247
+ "AutoConfig": "configuration_deepseek.KeyeVisionConfig",
248
+ "AutoModel": "modeling_deepseek.SiglipVisionModel"
249
+ },
250
+ "has_learnable_position_embedding": true,
251
+ "hidden_act": "gelu_pytorch_tanh",
252
+ "hidden_size": 1152,
253
+ "image_size": 384,
254
+ "intermediate_size": 4304,
255
+ "layer_norm_eps": 1e-06,
256
+ "model_type": "Keye",
257
+ "num_attention_heads": 16,
258
+ "num_channels": 3,
259
+ "num_hidden_layers": 27,
260
+ "patch_size": 14,
261
+ "rope_theta": 10000,
262
+ "spatial_merge_size": 2,
263
+ "temporal_patch_size": 2,
264
+ "tokens_per_second": 2
265
+ },
266
+ "vision_end_token_id": 128008,
267
+ "vision_start_token_id": 128007,
268
+ "vision_token_id": 128009,
269
+ "vocab_size": 129280
270
+ }
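The config describes a DeepSeek-R1-style MoE decoder (61 layers, MLA with `kv_lora_rank=512` and `q_lora_rank=1536`, 256 routed experts with 8 active per token and 3 leading dense layers) attached to a SigLIP-style vision tower, with FP8 block quantization that leaves the vision tower, the `mlp_AR` projector, and `lm_head` unquantized. A minimal loading sketch; the local path is an assumption, and `trust_remote_code=True` is required because `auto_map` routes `AutoConfig` to `configuration_deepseek.DeepseekR1Config`:

```python
from transformers import AutoConfig

# Assumed local checkout of the files in this commit.
config = AutoConfig.from_pretrained("./Keye-VL-671B-A37B", trust_remote_code=True)

print(config.model_type)                                    # "deepseek_r1"
print(config.n_routed_experts, config.num_experts_per_tok)  # 256 routed experts, 8 active per token
print(config.first_k_dense_replace)                         # 3 dense layers before the MoE blocks
print(config.vision_config.hidden_size)                     # 1152-dim SigLIP-style vision tower
```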
configuration_deepseek.py ADDED
@@ -0,0 +1,266 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.modeling_rope_utils import rope_config_validation
22
+
23
+
24
+
25
+ class KeyeVisionConfig(PretrainedConfig):
26
+ model_type = "Keye"
27
+ base_config_key = "vision_config"
28
+
29
+ def __init__(
30
+ self,
31
+ hidden_size=768,
32
+ intermediate_size=3072,
33
+ num_hidden_layers=12,
34
+ num_attention_heads=12,
35
+ num_channels=3,
36
+ image_size=224,
37
+ patch_size=14,
38
+ hidden_act="gelu_pytorch_tanh",
39
+ layer_norm_eps=1e-6,
40
+ attention_dropout=0.0,
41
+ spatial_merge_size=2,
42
+ temporal_patch_size=2,
43
+ tokens_per_second=2,
44
+ **kwargs,
45
+ ):
46
+ super().__init__(**kwargs)
47
+
48
+ self.hidden_size = hidden_size
49
+ self.intermediate_size = intermediate_size
50
+ self.num_hidden_layers = num_hidden_layers
51
+ self.num_attention_heads = num_attention_heads
52
+ self.num_channels = num_channels
53
+ self.patch_size = patch_size
54
+ self.image_size = image_size
55
+ self.attention_dropout = attention_dropout
56
+ self.layer_norm_eps = layer_norm_eps
57
+ self.hidden_act = hidden_act
58
+ self.spatial_merge_size = spatial_merge_size
59
+ self.temporal_patch_size = temporal_patch_size
60
+ self.tokens_per_second = tokens_per_second
61
+
62
+
63
+ class DeepseekR1Config(PretrainedConfig):
64
+ r"""
65
+ This is the configuration class to store the configuration of a [`KeyeModel`]. It is used to instantiate a
66
+ KeyeVLMoeForConditionalGeneration model according to the specified arguments, defining the model architecture.
67
+
68
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
69
+ documentation from [`PretrainedConfig`] for more information.
70
+
71
+
72
+ Args:
73
+ vocab_size (`int`, *optional*, defaults to 129280):
74
+ Vocabulary size of the Keye model. Defines the number of different tokens that can be represented by the
75
+ `inputs_ids` passed when calling [`KeyeModel`]
76
+ hidden_size (`int`, *optional*, defaults to 7168):
77
+ Dimension of the hidden representations.
78
+ intermediate_size (`int`, *optional*, defaults to 18432):
79
+ Dimension of the MLP representations.
80
+ num_hidden_layers (`int`, *optional*, defaults to 61):
81
+ Number of hidden layers in the Transformer encoder.
82
+ num_attention_heads (`int`, *optional*, defaults to 128):
83
+ Number of attention heads for each attention layer in the Transformer encoder.
84
+ num_key_value_heads (`int`, *optional*, defaults to 128):
85
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
86
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
87
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
88
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
89
+ by meanpooling all the original heads within that group. For more details checkout [this
90
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `128`.
91
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
92
+ The non-linear activation function (function or string) in the decoder.
93
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
94
+ The maximum sequence length that this model might ever be used with.
95
+ initializer_range (`float`, *optional*, defaults to 0.02):
96
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
97
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
98
+ The epsilon used by the rms normalization layers.
99
+ use_cache (`bool`, *optional*, defaults to `True`):
100
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
101
+ relevant if `config.is_decoder=True`.
102
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
103
+ Whether the model's input and output word embeddings should be tied.
104
+ rope_theta (`float`, *optional*, defaults to 10000.0):
105
+ The base period of the RoPE embeddings.
106
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
107
+ Whether to use sliding window attention.
108
+ sliding_window (`int`, *optional*, defaults to 4096):
109
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
110
+ max_window_layers (`int`, *optional*, defaults to 80):
111
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
112
+ attention_dropout (`float`, *optional*, defaults to 0.0):
113
+ The dropout ratio for the attention probabilities.
114
+ vision_config (`Dict`, *optional*):
115
+ The config for the visual encoder initialization.
116
+ rope_scaling (`Dict`, *optional*):
117
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
118
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
119
+ accordingly.
120
+ Expected contents:
121
+ `rope_type` (`str`):
122
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
123
+ 'llama3'], with 'default' being the original RoPE implementation.
124
+ `factor` (`float`, *optional*):
125
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
126
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
127
+ original maximum pre-trained length.
128
+ `original_max_position_embeddings` (`int`, *optional*):
129
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
130
+ pretraining.
131
+ `attention_factor` (`float`, *optional*):
132
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
133
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
134
+ `factor` field to infer the suggested value.
135
+ `beta_fast` (`float`, *optional*):
136
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
137
+ ramp function. If unspecified, it defaults to 32.
138
+ `beta_slow` (`float`, *optional*):
139
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
140
+ ramp function. If unspecified, it defaults to 1.
141
+ `short_factor` (`List[float]`, *optional*):
142
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
143
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
144
+ size divided by the number of attention heads divided by 2
145
+ `long_factor` (`List[float]`, *optional*):
146
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
147
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
148
+ size divided by the number of attention heads divided by 2
149
+ `low_freq_factor` (`float`, *optional*):
150
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
151
+ `high_freq_factor` (`float`, *optional*):
152
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
153
+
154
+ ```python
155
+ >>> from transformers import KeyeForConditionalGeneration, KeyeConfig
156
+
157
+ >>> # Initializing a Keye style configuration
158
+ >>> configuration = KeyeConfig()
159
+
160
+ >>> # Initializing a model from the Keye-VL-671B-A37B style configuration
161
+ >>> model = KeyeForConditionalGeneration(configuration)
162
+
163
+ >>> # Accessing the model configuration
164
+ >>> configuration = model.config
165
+ ```"""
166
+
167
+ model_type = "deepseek_r1"
168
+ sub_configs = {"vision_config": KeyeVisionConfig}
169
+ keys_to_ignore_at_inference = ["past_key_values"]
170
+
171
+ def __init__(
172
+ self,
173
+ vocab_size=129280,
174
+ hidden_size=7168,
175
+ intermediate_size=18432,
176
+ moe_intermediate_size = 2048,
177
+ num_hidden_layers=61,
178
+ num_nextn_predict_layers=1,
179
+ num_attention_heads=128,
180
+ num_key_value_heads=128,
181
+ n_shared_experts = 1,
182
+ n_routed_experts = 256,
183
+ ep_size = 1,
184
+ routed_scaling_factor = 2.5,
185
+ kv_lora_rank = 512,
186
+ q_lora_rank = 1536,
187
+ qk_rope_head_dim = 64,
188
+ v_head_dim = 128,
189
+ qk_nope_head_dim = 128,
190
+ topk_method = 'noaux_tc',
191
+ n_group = 8,
192
+ topk_group = 4,
193
+ num_experts_per_tok = 8,
194
+ moe_layer_freq = 1,
195
+ first_k_dense_replace = 3,
196
+ norm_topk_prob = True,
197
+ scoring_func = 'sigmoid',
198
+ hidden_act="silu",
199
+ max_position_embeddings=4096,
200
+ initializer_range=0.02,
201
+ rms_norm_eps=1e-6,
202
+ use_cache=True,
203
+ pad_token_id=None,
204
+ bos_token_id=0,
205
+ eos_token_id=1,
206
+ tie_word_embeddings=False,
207
+ rope_theta=10000.0,
208
+ rope_scaling=None,
209
+ attention_bias=False,
210
+ attention_dropout=0.0,
211
+ vision_config=None,
212
+ **kwargs,
213
+ ):
214
+ if isinstance(vision_config, dict):
215
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
216
+ elif vision_config is None:
217
+ self.vision_config = self.sub_configs["vision_config"]()
218
+
219
+ self.vocab_size = vocab_size
220
+ self.max_position_embeddings = max_position_embeddings
221
+ self.hidden_size = hidden_size
222
+ self.intermediate_size = intermediate_size
223
+ self.moe_intermediate_size = moe_intermediate_size
224
+ self.num_hidden_layers = num_hidden_layers
225
+ self.num_nextn_predict_layers = num_nextn_predict_layers
226
+ self.num_attention_heads = num_attention_heads
227
+ self.n_shared_experts = n_shared_experts
228
+ self.n_routed_experts = n_routed_experts
229
+ self.ep_size = ep_size
230
+ self.routed_scaling_factor = routed_scaling_factor
231
+ self.kv_lora_rank = kv_lora_rank
232
+ self.q_lora_rank = q_lora_rank
233
+ self.qk_rope_head_dim = qk_rope_head_dim
234
+ self.v_head_dim = v_head_dim
235
+ self.qk_nope_head_dim = qk_nope_head_dim
236
+ self.topk_method = topk_method
237
+ self.n_group = n_group
238
+ self.topk_group = topk_group
239
+ self.num_experts_per_tok = num_experts_per_tok
240
+ self.moe_layer_freq = moe_layer_freq
241
+ self.first_k_dense_replace = first_k_dense_replace
242
+ self.norm_topk_prob = norm_topk_prob
243
+ self.scoring_func = scoring_func
244
+ # for backward compatibility
245
+ if num_key_value_heads is None:
246
+ num_key_value_heads = num_attention_heads
247
+
248
+ self.num_key_value_heads = num_key_value_heads
249
+ self.hidden_act = hidden_act
250
+ self.initializer_range = initializer_range
251
+ self.rms_norm_eps = rms_norm_eps
252
+ self.use_cache = use_cache
253
+ self.rope_theta = rope_theta
254
+ self.rope_scaling = rope_scaling
255
+ self.attention_bias = attention_bias
256
+ self.attention_dropout = attention_dropout
257
+
258
+ super().__init__(
259
+ pad_token_id=pad_token_id,
260
+ bos_token_id=bos_token_id,
261
+ eos_token_id=eos_token_id,
262
+ tie_word_embeddings=tie_word_embeddings,
263
+ **kwargs,
264
+ )
265
+
266
+ __all__ = ["DeepseekR1Config"]
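A minimal sketch of using the two classes above directly, assuming this file is importable as `configuration_deepseek`: a plain dict passed as `vision_config` is promoted to a `KeyeVisionConfig` sub-config, and everything else falls back to the signature defaults:

```python
from configuration_deepseek import DeepseekR1Config, KeyeVisionConfig

# A dict under vision_config is converted via sub_configs["vision_config"].
config = DeepseekR1Config(
    vision_config={"hidden_size": 1152, "num_hidden_layers": 27, "patch_size": 14},
    rope_scaling={"type": "yarn", "factor": 40, "original_max_position_embeddings": 4096},
)

assert isinstance(config.vision_config, KeyeVisionConfig)
print(config.kv_lora_rank, config.q_lora_rank)              # 512 1536 (MLA low-rank projections)
print(config.n_routed_experts, config.num_experts_per_tok)  # 256 8
```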
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "temperature": 0.6,
+ "top_k": 20,
+ "top_p": 0.95,
+ "transformers_version": "4.56.2"
+ }
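These defaults enable sampling at temperature 0.6 with top-k 20 and top-p 0.95, and `generate()` picks them up automatically when the file sits next to the weights. A minimal sketch of building the equivalent object by hand, with the token ids copied from the JSON above:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.6,
    top_k=20,
    top_p=0.95,
    bos_token_id=151643,
    eos_token_id=[151645, 151643],
    pad_token_id=151643,
)

# Typically passed as model.generate(**inputs, generation_config=gen_config)
print(gen_config.temperature, gen_config.top_p)  # 0.6 0.95
```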
image_processing_keye.py ADDED
@@ -0,0 +1,541 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Keye team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Keye-VL-671B-A37B."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+ from PIL import Image
25
+
26
+ import numpy as np
27
+ import torch
28
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
29
+ from torchvision.transforms import functional as TF
30
+ from transformers.image_transforms import (
31
+ convert_to_rgb,
32
+ resize,
33
+ to_channel_dimension_format,
34
+ )
35
+ from transformers.image_utils import (
36
+ OPENAI_CLIP_MEAN,
37
+ OPENAI_CLIP_STD,
38
+ ChannelDimension,
39
+ PILImageResampling,
40
+ get_image_size,
41
+ infer_channel_dimension_format,
42
+ is_scaled_image,
43
+ is_valid_image,
44
+ make_list_of_images,
45
+ to_numpy_array,
46
+ valid_images,
47
+ validate_preprocess_arguments,
48
+ )
49
+ from transformers.utils import TensorType, is_vision_available, logging
50
+
52
+
53
+ ImageInput = Union[
54
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", list["PIL.Image.Image"], list[np.ndarray], list["torch.Tensor"]
55
+ ] # noqa
56
+
57
+
58
+ VideoInput = Union[
59
+ list["PIL.Image.Image"],
60
+ "np.ndarray",
61
+ "torch.Tensor",
62
+ list["np.ndarray"],
63
+ list["torch.Tensor"],
64
+ list[list["PIL.Image.Image"]],
65
+ list[list["np.ndarrray"]],
66
+ list[list["torch.Tensor"]],
67
+ ] # noqa
68
+
69
+ logger = logging.get_logger(__name__)
70
+
71
+
72
+ if is_vision_available():
73
+ from PIL import Image
74
+
75
+
76
+ def make_batched_images(images) -> List[List[ImageInput]]:
77
+ """
78
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
79
+
80
+ Args:
81
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
82
+ The input image.
83
+
84
+ Returns:
85
+ list: A list of images.
86
+ """
87
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
88
+ return [img for img_list in images for img in img_list]
89
+
90
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
91
+ return images
92
+
93
+ elif is_valid_image(images):
94
+ return [images]
95
+
96
+ raise ValueError(f"Could not make batched images from {images}")
97
+
98
+
99
+ def adjust_size(size, patch_size):
100
+ num_patches = size // patch_size
101
+ if num_patches % 2 != 0:  # if odd, subtract 1 to keep the patch count even
102
+ num_patches -= 1
103
+ return num_patches * patch_size
104
+
105
+
106
+ def make_batched_videos(videos) -> List[VideoInput]:
107
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
108
+ return videos
109
+
110
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
111
+ if isinstance(videos[0], Image.Image):
112
+ return [videos]
113
+ elif len(videos[0].shape) == 4:
114
+ return [list(video) for video in videos]
115
+
116
+ elif is_valid_image(videos) and len(videos.shape) == 4:
117
+ return [list(videos)]
118
+
119
+ raise ValueError(f"Could not make batched video from {videos}")
120
+
121
+
122
+ def smart_resize(
123
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4096
124
+ ):
125
+ """Rescales the image so that the following conditions are met:
126
+
127
+ 1. Both dimensions (height and width) are divisible by 'factor'.
128
+
129
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
130
+
131
+ 3. The aspect ratio of the image is maintained as closely as possible.
132
+
133
+ """
134
+ #if height < factor or width < factor:
135
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
136
+ # if int(height < factor//4) + int(width < factor//4):
137
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
138
+
139
+ if height < factor:
140
+ print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
141
+ width = round((width * factor) / height)
142
+ height = factor
143
+
144
+ if width < factor:
145
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
146
+ height = round((height * factor) / width)
147
+ width = factor
148
+
149
+ if max(height, width) / min(height, width) > 200:
150
+ raise ValueError(
151
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
152
+ )
153
+ h_bar = round(height / factor) * factor
154
+ w_bar = round(width / factor) * factor
155
+ if h_bar * w_bar > max_pixels:
156
+ beta = math.sqrt((height * width) / max_pixels)
157
+ h_bar = math.floor(height / beta / factor) * factor
158
+ w_bar = math.floor(width / beta / factor) * factor
159
+ elif h_bar * w_bar < min_pixels:
160
+ beta = math.sqrt(min_pixels / (height * width))
161
+ h_bar = math.ceil(height * beta / factor) * factor
162
+ w_bar = math.ceil(width * beta / factor) * factor
163
+ return h_bar, w_bar
164
+
165
+
166
+ class SiglipImageProcessor(BaseImageProcessor):
167
+ r"""
168
+ Constructs a Keye-VL-671B-A37B image processor that dynamically resizes images based on the original images.
169
+
170
+ Args:
171
+ do_resize (`bool`, *optional*, defaults to `True`):
172
+ Whether to resize the image's (height, width) dimensions.
173
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
174
+ Resampling filter to use when resizing the image.
175
+ do_rescale (`bool`, *optional*, defaults to `True`):
176
+ Whether to rescale the image by the specified scale `rescale_factor`.
177
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
178
+ Scale factor to use if rescaling the image.
179
+ do_normalize (`bool`, *optional*, defaults to `True`):
180
+ Whether to normalize the image.
181
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
182
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
183
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
184
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
185
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
186
+ Whether to convert the image to RGB.
187
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
188
+ The min pixels of the image to resize the image.
189
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
190
+ The max pixels of the image to resize the image.
191
+ patch_size (`int`, *optional*, defaults to 14):
192
+ The spatial patch size of the vision encoder.
193
+ temporal_patch_size (`int`, *optional*, defaults to 2):
194
+ The temporal patch size of the vision encoder.
195
+ merge_size (`int`, *optional*, defaults to 2):
196
+ The merge size of the vision encoder to llm encoder.
197
+ """
198
+
199
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
200
+
201
+ def __init__(
202
+ self,
203
+ do_resize: bool = True,
204
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
205
+ do_rescale: bool = True,
206
+ rescale_factor: Union[int, float] = 1 / 255,
207
+ do_normalize: bool = True,
208
+ image_mean: Optional[Union[float, List[float]]] = None,
209
+ image_std: Optional[Union[float, List[float]]] = None,
210
+ do_convert_rgb: bool = True,
211
+ min_pixels: int = 56 * 56,
212
+ max_pixels: int = 28 * 28 * 1280,
213
+ patch_size: int = 14,
214
+ temporal_patch_size: int = 1,
215
+ merge_size: int = 2,
216
+ **kwargs,
217
+ ) -> None:
218
+ super().__init__(**kwargs)
219
+ self.do_resize = do_resize
220
+ self.resample = resample
221
+ self.do_rescale = do_rescale
222
+ self.rescale_factor = rescale_factor
223
+ self.do_normalize = do_normalize
224
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
225
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
226
+ self.min_pixels = min_pixels
227
+ self.max_pixels = max_pixels
228
+ self.patch_size = patch_size
229
+ self.temporal_patch_size = temporal_patch_size
230
+ self.merge_size = merge_size
231
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
232
+ self.do_convert_rgb = do_convert_rgb
233
+
234
+ def mvit_rescale(
235
+ self, image: Image.Image, merge_size: int = 2
236
+ ) -> Image.Image:
237
+ try:
238
+ w, h = image.size
239
+ except Exception:
240
+ raise ValueError(str((type(image), image)))
241
+ patch_size = self.patch_size
242
+
243
+ if (w // patch_size) * (h // patch_size) > self.in_token_limit:
244
+ scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
245
+ new_w, new_h = int(w * scale), int(h * scale)
246
+
247
+ image = image.resize((new_w, new_h), Image.Resampling.BILINEAR)
248
+ if self.pad_input:
249
+ new_w, new_h = image.size
250
+ pad_size_h = merge_size * patch_size
251
+ pad_size_w = merge_size * patch_size
252
+
253
+ pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
254
+ pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w
255
+
256
+ image = TF.pad(image, (0, 0, pad_w, pad_h))
257
+ else:
258
+ new_w, new_h = image.size
259
+ new_w = new_w - new_w % patch_size
260
+ new_h = new_h - new_h % patch_size
261
+
262
+ new_w = adjust_size(new_w, patch_size)
263
+ new_h = adjust_size(new_h, patch_size)
264
+
265
+ image = TF.center_crop(image, (new_h, new_w))
266
+
267
+ w, h = image.size
268
+ if w // patch_size >= 512 or h // patch_size >= 512:
269
+ new_h = min(patch_size * 510, h)
270
+ new_w = min(patch_size * 510, w)
271
+ image = TF.center_crop(image, (new_h, new_w))
272
+ #raise ValueError("Exceed pos emb")
273
+ return image
274
+ def _preprocess(
275
+ self,
276
+ images: Union[ImageInput, VideoInput],
277
+ do_resize: bool = None,
278
+ size: Dict[str, int] = None,
279
+ resample: PILImageResampling = None,
280
+ do_rescale: bool = None,
281
+ rescale_factor: float = None,
282
+ do_normalize: bool = None,
283
+ image_mean: Optional[Union[float, List[float]]] = None,
284
+ image_std: Optional[Union[float, List[float]]] = None,
285
+ do_convert_rgb: bool = None,
286
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
287
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
288
+ ):
289
+ """
290
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
291
+
292
+ Args:
293
+ images (`ImageInput`):
294
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
295
+ vision_info (`List[Dict]`, *optional*):
296
+ Optional list of dictionaries containing additional information about vision inputs.
297
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
298
+ Whether to resize the image.
299
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
300
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
301
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
302
+ Whether to rescale the image.
303
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
304
+ Scale factor to use if rescaling the image.
305
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
306
+ Whether to normalize the image.
307
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
308
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
309
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
310
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
311
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
312
+ Whether to convert the image to RGB.
313
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
314
+ The channel dimension format for the output image. Can be one of:
315
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
316
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
317
+ - Unset: Use the channel dimension format of the input image.
318
+ input_data_format (`ChannelDimension` or `str`, *optional*):
319
+ The channel dimension format for the input image. Can be one of:
320
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
321
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
322
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
323
+ """
324
+ images = make_list_of_images(images)
325
+
326
+ if do_convert_rgb:
327
+ images = [convert_to_rgb(image) for image in images]
328
+
329
+ # All transformations expect numpy arrays.
330
+ images = [to_numpy_array(image) for image in images]
331
+
332
+ if is_scaled_image(images[0]) and do_rescale:
333
+ logger.warning_once(
334
+ "It looks like you are trying to rescale already rescaled images. If the input"
335
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
336
+ )
337
+ if input_data_format is None:
338
+ # We assume that all images have the same channel dimension format.
339
+ input_data_format = infer_channel_dimension_format(images[0])
340
+
341
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
342
+ resized_height, resized_width = height, width
343
+ processed_images = []
344
+ for image in images:
345
+ # image = self.mvit_rescale(image, merge_size=self.merge_size)
346
+ if do_resize:
347
+ if size is not None and "height" in size.keys():
348
+ resized_height, resized_width = size["height"], size["width"]
349
+ else:
350
+ resized_height, resized_width = smart_resize(
351
+ height,
352
+ width,
353
+ factor=self.patch_size * self.merge_size,
354
+ min_pixels=self.min_pixels,
355
+ max_pixels=self.max_pixels,
356
+ )
357
+ image = resize(
358
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
359
+ )
360
+
361
+ if do_rescale:
362
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
363
+
364
+ if do_normalize:
365
+ image = self.normalize(
366
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
367
+ )
368
+
369
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
370
+ processed_images.append(image)
371
+
372
+ patches = np.array(processed_images)
373
+ if data_format == ChannelDimension.LAST:
374
+ patches = patches.transpose(0, 3, 1, 2)
375
+ if patches.shape[0] == 1:
376
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
377
+ init_patches = patches
378
+ channel = patches.shape[1]
379
+ grid_t = patches.shape[0] // self.temporal_patch_size
380
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
381
+ patches = patches.reshape(
382
+ grid_t,
383
+ self.temporal_patch_size,
384
+ channel,
385
+ grid_h,
386
+ self.patch_size,
387
+ grid_w,
388
+ self.patch_size,
389
+ )
390
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
391
+ assert self.temporal_patch_size == 1
392
+ flatten_patches = patches.reshape(
393
+ grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
394
+ )
395
+ return flatten_patches, (grid_t, grid_h, grid_w)
396
+
397
+ def preprocess(
398
+ self,
399
+ images: ImageInput,
400
+ videos: VideoInput = None,
401
+ do_resize: bool = None,
402
+ size: Dict[str, int] = None,
403
+ resample: PILImageResampling = None,
404
+ do_rescale: bool = None,
405
+ rescale_factor: float = None,
406
+ do_normalize: bool = None,
407
+ image_mean: Optional[Union[float, List[float]]] = None,
408
+ image_std: Optional[Union[float, List[float]]] = None,
409
+ do_convert_rgb: bool = None,
410
+ return_tensors: Optional[Union[str, TensorType]] = None,
411
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
412
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
413
+ ):
414
+ """
415
+ Args:
416
+ images (`ImageInput`):
417
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
418
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
419
+ videos (`VideoInput`):
420
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
421
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
422
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
423
+ Whether to resize the image.
424
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
425
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
426
+ the longest edge resized to keep the input aspect ratio.
427
+ resample (`int`, *optional*, defaults to `self.resample`):
428
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
429
+ has an effect if `do_resize` is set to `True`.
430
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
431
+ Whether to rescale the image.
432
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
433
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
434
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
435
+ Whether to normalize the image.
436
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
437
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
438
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
439
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
440
+ `True`.
441
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
442
+ Whether to convert the image to RGB.
443
+ return_tensors (`str` or `TensorType`, *optional*):
444
+ The type of tensors to return. Can be one of:
445
+ - Unset: Return a list of `np.ndarray`.
446
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
447
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
448
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
449
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
450
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
451
+ The channel dimension format for the output image. Can be one of:
452
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
453
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
454
+ - Unset: Use the channel dimension format of the input image.
455
+ input_data_format (`ChannelDimension` or `str`, *optional*):
456
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
457
+ from the input image. Can be one of:
458
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
459
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
460
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
461
+
462
+ """
463
+ do_resize = do_resize if do_resize is not None else self.do_resize
464
+ size = size if size is not None else self.size
465
+ resample = resample if resample is not None else self.resample
466
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
467
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
468
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
469
+ image_mean = image_mean if image_mean is not None else self.image_mean
470
+ image_std = image_std if image_std is not None else self.image_std
471
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
472
+
473
+ if images is not None:
474
+ images = make_batched_images(images)
475
+ if videos is not None:
476
+ videos = make_batched_videos(videos)
477
+
478
+ if images is not None and not valid_images(images):
479
+ raise ValueError(
480
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
481
+ "torch.Tensor, tf.Tensor or jax.ndarray."
482
+ )
483
+
484
+ validate_preprocess_arguments(
485
+ rescale_factor=rescale_factor,
486
+ do_normalize=do_normalize,
487
+ image_mean=image_mean,
488
+ image_std=image_std,
489
+ do_resize=do_resize,
490
+ size=size,
491
+ resample=resample,
492
+ )
493
+
494
+ if images is not None:
495
+ pixel_values, vision_grid_thws = [], []
496
+ for image in images:
497
+ patches, image_grid_thw = self._preprocess(
498
+ image,
499
+ do_resize=do_resize,
500
+ size=size,
501
+ resample=resample,
502
+ do_rescale=do_rescale,
503
+ rescale_factor=rescale_factor,
504
+ do_normalize=do_normalize,
505
+ image_mean=image_mean,
506
+ image_std=image_std,
507
+ data_format=data_format,
508
+ do_convert_rgb=do_convert_rgb,
509
+ input_data_format=input_data_format,
510
+ )
511
+ pixel_values.extend(patches)
512
+ vision_grid_thws.append(image_grid_thw)
513
+ pixel_values = np.array(pixel_values)
514
+ vision_grid_thws = np.array(vision_grid_thws)
515
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
516
+
517
+ if videos is not None:
518
+ pixel_values, vision_grid_thws = [], []
519
+ for images in videos:
520
+ patches, video_grid_thw = self._preprocess(
521
+ images,
522
+ do_resize=do_resize,
523
+ size=size,
524
+ resample=resample,
525
+ do_rescale=do_rescale,
526
+ rescale_factor=rescale_factor,
527
+ do_normalize=do_normalize,
528
+ image_mean=image_mean,
529
+ image_std=image_std,
530
+ data_format=data_format,
531
+ do_convert_rgb=do_convert_rgb,
532
+ input_data_format=input_data_format,
533
+ )
534
+ pixel_values.extend(patches)
535
+ vision_grid_thws.append(video_grid_thw)
536
+ pixel_values = np.array(pixel_values)
537
+ vision_grid_thws = np.array(vision_grid_thws)
538
+ data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
539
+
540
+ return BatchFeature(data=data, tensor_type=return_tensors)
541
+
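For orientation, a minimal usage sketch of the image processor defined above, assuming a local copy of this repository at a placeholder path and `trust_remote_code=True` so that the `auto_map` entry in preprocessor_config.json resolves `image_processing_keye.SiglipImageProcessor`; the checkpoint path and image size are illustrative only.

    from PIL import Image
    from transformers import AutoImageProcessor

    image_processor = AutoImageProcessor.from_pretrained(
        "./keye-checkpoint",     # local path to this repository (placeholder)
        trust_remote_code=True,  # needed so auto_map loads image_processing_keye.py
    )

    image = Image.new("RGB", (448, 448), color=(127, 127, 127))
    out = image_processor(images=[image], return_tensors="pt")
    # "pixel_values" holds the flattened patches; "image_grid_thw" holds one (t, h, w) grid per image.
    print(out["pixel_values"].shape, out["image_grid_thw"])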
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f8def83c2332ebb0fb07399dd1ff8c643f3b22966d6f063a0aeb9d921f900be
3
+ size 4992443296
model-00003-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:783fe07320638f175a97b04bda9fa56e2481f156a5e6c3aea4298798ce09e97a
3
+ size 4992525496
model-00004-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8234d82d79b8460750644b74e2bd4102c9bb2fb63b8f9f121761db6bcaca0fd4
3
+ size 4992490496
model-00009-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49a96dbf5df71bd81f91df442f518d6334a9d437d2c0766cd61492ca17eeca7e
3
+ size 4992524984
model-00021-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f307fc57b7c118e7e18fc11e9dcbe64f8ddb8057fa6766a96299f9eb40a110fd
3
+ size 4992525976
model-00035-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c308c5e569fce7b6ef03a3aa0f61a4f347f4025013c44b70bbb62de5b34fa00a
3
+ size 4992526088
model-00039-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f04643034c4910206ab5a48e5230ae632e74047e27450af4e15c025c958941b
3
+ size 4992525672
model-00041-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e3d2d9890b89ff587732193a85e58cb1604f14fd7babcaaf72a3141d57765b7
3
+ size 4992491096
model-00046-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ae4b649002abec029c5b223994db661a4d901a530e4d198d7a93042dd705971
3
+ size 4992525728
model-00048-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6edd7cb0d469274bd888e47326be1fe14db5ce58350d7493fc1cbe424ae5af
3
+ size 4992491064
model-00050-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4468368ee604420339ebf4c75abb4317136487eb72906ff81733531b57efbf9a
3
+ size 4992491248
model-00057-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b19c29ccd8b66887135e178c6313730a1e87f3f6f6c2fee5b7d6069ea068275f
3
+ size 4992491192
model-00059-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86aa5f23ec2473a8c12f072ff754d84fd03a567ffb986fd395b9bd352a056a0e
3
+ size 4992491400
model-00061-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15499f621462c19dbf63382a8ff5fd9742dd1a08b2097c58f6ccaa993427c373
3
+ size 4992491616
model-00063-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:976da05bcdd232d06594020f89bafe479433c507e9405bd54bd18feeb92ec266
3
+ size 4992526176
model-00066-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d691e93c460ec6006edac71608c8aa58b23bcb9abc0bd231f46b1a9c2f236c25
3
+ size 4992491344
model-00068-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7ead31a8c46beabe30fd8e6bb602800c406035e92927be166115ebe368499b
3
+ size 4992491552
model-00069-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddae7b8835a480518055609c234d4025ab5fcbb1cbdf6624779028bc0f4767e7
3
+ size 4992525688
model-00073-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11b561e2ab1f9e230d36015face285d6634efdb9d1a404062583640808a618f5
3
+ size 4992491288
model-00077-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeae8aeb9828d640e2fc187058b6e578c4aaa7169df668b077bba525d80a17f1
3
+ size 4992526176
model-00080-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bef40110db347f6a50a167deacb270d08164dcf147da7334380f4fdbb2690ef7
3
+ size 4992491232
model-00083-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c7d28ba65411738740041570940e5766e3bd447305c7460c096cd10f938e18f
3
+ size 4992525800
model-00086-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:266fac4e673e340f10b68303ced5db2c2b1c3064bb3f0718920c14c712bea4a0
3
+ size 4992526176
model-00096-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9cf10aa7578e790c8f3cb5d3f7c83b323770ce35bae151b22367e9ae7a2c48c
3
+ size 4992491328
model-00101-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16fb974116f89ff15e10d085943f5b6077c90533b3110696cdc9504cd7a1e560
3
+ size 4992491064
model-00102-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3515511abb5b831728be5228adb6effdd6cc2f6a672b5ec3008bafddd8f56cf
3
+ size 4992526176
model-00103-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1939f174dddaad1d9e4418d0c44d750c1eb040bf0e97a007d903727f975cc7f3
3
+ size 4992491272
model-00109-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5019f53bc9a05196da4b8aaf2f3cc6dbc18ce591a883549e648064a5d094b189
3
+ size 4992526176
model-00110-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:710c5dc48b6c949bcbece419791fba03f0b46715f668caf9d5304be07faf0cd6
3
+ size 4992491216
model-00113-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a687b4ef3d52be153e9aaed6e9cf3d7a9e8c098918a6225fa4d2f7b876299683
3
+ size 4992525816
model-00116-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5db87372a5a4ed422df2caa91635610020081c91c25c8ddb1930f94b619c1e9b
3
+ size 4992526176
model-00124-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98972af38e95797831bf7b6fa4de8cd6e8cfc72bc73049b3142e3457e1dfd204
3
+ size 4992491104
model-00128-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dda12a925d7c1841eddc3d9874cd1d786e17723b77c05ded47468e194b85767
3
+ size 4992491520
model-00130-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a867da69417bc70179f14a09c7df5652ea348b762cbd84375b9f2e3cab8b7a59
3
+ size 4992526176
model-00131-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cc49078e6c09aaf03eeb896b7a080ab664ae14a021b27ed270e520df5f22134
3
+ size 4992491064
model-00135-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65ae76ac1753eef389e2abdb68c26cddca11f57e5e45049e8cc64258ae9c7b96
3
+ size 3336955648
model-00136-of-00136.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ad10363f651e6d5923b478aa12c47ccd2daf794c1f7b3a7f0a71a10f95c371
3
+ size 1853358208
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_keye.SiglipImageProcessor",
4
+ "AutoProcessor": "processing_keye.KeyeProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "SiglipImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_pixels": 16056320,
22
+ "merge_size": 2,
23
+ "min_pixels": 3136,
24
+ "patch_size": 14,
25
+ "processor_class": "KeyeProcessor",
26
+ "resample": 2,
27
+ "rescale_factor": 0.00392156862745098,
28
+ "size": {
29
+ "max_pixels": 16056320,
30
+ "min_pixels": 3136
31
+ },
32
+ "temporal_patch_size": 1
33
+ }
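These values suggest a Qwen2-VL-style patching scheme: 14-pixel patches merged 2×2 into one vision token, with the total image area kept between min_pixels and max_pixels. The sketch below estimates the resulting token count under that assumption; the exact rounding used by the resize logic in image_processing_keye.py may differ slightly.

    import math

    PATCH_SIZE = 14        # "patch_size"
    MERGE_SIZE = 2         # "merge_size"
    MIN_PIXELS = 3136      # "min_pixels" (56 * 56)
    MAX_PIXELS = 16056320  # "max_pixels"

    def approx_num_vision_tokens(height: int, width: int) -> int:
        """Approximate token count: clamp the area into [MIN_PIXELS, MAX_PIXELS],
        then count (PATCH_SIZE * MERGE_SIZE)-sized blocks along each side."""
        unit = PATCH_SIZE * MERGE_SIZE  # 28 pixels per merged token
        area = height * width
        scale = 1.0
        if area > MAX_PIXELS:
            scale = math.sqrt(MAX_PIXELS / area)
        elif area < MIN_PIXELS:
            scale = math.sqrt(MIN_PIXELS / area)
        h = max(unit, round(height * scale / unit) * unit)
        w = max(unit, round(width * scale / unit) * unit)
        return (h // unit) * (w // unit)

    print(approx_num_vision_tokens(1080, 1920))  # about 2691 tokens for a 1080p frame under this estimate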
processing_keye.py ADDED
@@ -0,0 +1,494 @@
1
+ # coding=utf-8
2
+ # Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ from typing import List, Union, Optional
21
+
22
+ from transformers.feature_extraction_utils import BatchFeature
23
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
24
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
25
+ from .image_processing_keye import SiglipImageProcessor
26
+ import torch
27
+ import torch.nn as nn
28
+ import numpy as np
29
+ from itertools import chain
30
+
31
+
32
+
33
+
34
+
35
+ ImageInput = Union[
36
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
37
+ ] # noqa
38
+
39
+
40
+ VideoInput = Union[
41
+ List["PIL.Image.Image"],
42
+ "np.ndarray",
43
+ "torch.Tensor",
44
+ List["np.ndarray"],
45
+ List["torch.Tensor"],
46
+ List[List["PIL.Image.Image"]],
47
+ List[List["np.ndarrray"]],
48
+ List[List["torch.Tensor"]],
49
+ ] # noqa
50
+
51
+ class KeyeVideosProcessorKwargs(VideosKwargs, total=False):
52
+ fps: Optional[Union[List[float], float]]
53
+ # Target resize width for slow frames
54
+ width: Optional[Union[List[int], int]]
55
+ # Target resize height for slow frames
56
+ height: Optional[Union[List[int], int]]
57
+ # Target resize width for fast frames
58
+ fast_width: Optional[Union[List[int], int]]
59
+ # Target resize height for fast frames
60
+ fast_height: Optional[Union[List[int], int]]
61
+ # Timestamp of each frame; length must equal the number of frames
62
+ timestamps: Optional[Union[List[torch.Tensor], torch.Tensor]]
63
+ # Type of each frame: slow = 0, fast = 1
64
+ frame_types: Optional[Union[List[torch.Tensor], torch.Tensor]]
65
+
66
+
67
+ class KeyeProcessorKwargs(ProcessingKwargs, total=False):
68
+ videos_kwargs: KeyeVideosProcessorKwargs
69
+ _defaults = {
70
+ "text_kwargs": {
71
+ "padding": False,
72
+ },
73
+ "videos_kwargs": {"fps": 2.0},
74
+ }
75
+
76
+ def select_slow_fast_frames(frames: torch.Tensor, frame_types: torch.Tensor):
77
+ """
78
+ Split frames into slow and fast groups based on a per-frame type tensor.
79
+
80
+ Args:
81
+ frames (torch.Tensor): A tensor of shape (nframes, c, h, w).
82
+ frame_types (torch.Tensor): An int tensor of shape (nframes,), where 0 marks a slow frame and 1 a fast frame.
83
+
84
+ Returns:
85
+ tuple[torch.Tensor, torch.Tensor]: A tuple containing two tensors:
86
+ - slow_frames: Frames whose type is 0.
87
+ - fast_frames: Frames whose type is 1.
88
+ """
89
+ nframes, _, _, _ = frames.shape
90
+ if frame_types.shape[-1] != nframes:
91
+ raise ValueError("Length of mask must be equal to the number of frames.")
92
+
93
+ mask = (frame_types == 0)
94
+
95
+ slow_frames = frames[mask]
96
+ fast_frames = frames[~mask]
97
+
98
+ return slow_frames, fast_frames
99
+
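+ # Example (illustrative): with frames of shape (4, 3, 224, 224) and
+ # frame_types = torch.tensor([0, 1, 0, 1]), select_slow_fast_frames returns
+ # slow_frames of shape (2, 3, 224, 224) (frames 0 and 2) and
+ # fast_frames of shape (2, 3, 224, 224) (frames 1 and 3).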
100
+ def split_thw(tensor):
101
+ """Split grid_thw in t dimension, the result tensor should like [[1, h, w],...]"""
102
+ repeats = tensor[:, 0]
103
+ new_thw = torch.cat([
104
+ torch.ones(tensor.shape[0], 1, dtype=tensor.dtype,
105
+ device=tensor.device),
106
+ tensor[:, 1:]
107
+ ], dim=1)
108
+ return torch.repeat_interleave(new_thw, repeats, dim=0)
109
+
110
+ def merge_hws(hws):
111
+ """
112
+ Merge consecutive grid entries that share the same (h, w), accumulating their temporal counts.
113
+ """
114
+ merged = []
115
+ last_hw = [-1, -1]
116
+
117
+ for hw in hws:
118
+ # Extend the previous entry when consecutive grids share the same (h, w)
119
+ if hw[1:] == last_hw:
120
+ merged[-1][0] += 1
121
+ else:
122
+ merged.append(hw)
123
+ last_hw = hw[1:]
124
+
125
+ return torch.tensor(merged)
126
+
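+ # Illustrative examples: split_thw(torch.tensor([[3, 10, 16]])) expands to
+ # three rows [[1, 10, 16], [1, 10, 16], [1, 10, 16]] (one per frame), and
+ # merge_hws([[1, 10, 16], [1, 10, 16], [1, 8, 8]]) collapses consecutive rows
+ # with equal (h, w) back into tensor([[2, 10, 16], [1, 8, 8]]).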
127
+ class KeyeProcessor(ProcessorMixin):
128
+ r"""
129
+ [`KeyeProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`Qwen2TokenizerFast`]. See the
130
+ [`~KeyeProcessor.__call__`] and [`~KeyeProcessor.decode`] for more information.
131
+ Args:
132
+ image_processor ([`SiglipImageProcessor`], *optional*):
133
+ The image processor is a required input.
134
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
135
+ The tokenizer is a required input.
136
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
137
+ in a chat into a tokenizable string.
138
+ """
139
+
140
+ attributes = ["image_processor", "tokenizer"]
141
+ valid_kwargs = [
142
+ "chat_template","image_std", "min_pixels", "image_mean", "merge_size", "image_processor_type",
143
+ "temporal_patch_size", "patch_size", "max_pixels"
144
+ ]
145
+
146
+ image_processor_class = "AutoImageProcessor"
147
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
148
+
149
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
150
+ self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
151
+ self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
152
+ self.frame_token = "<|frame|>" if not hasattr(tokenizer, "frame_token") else tokenizer.frame_token
153
+ self.fast_video_token = "<|fast_video_pad|>" if not hasattr(tokenizer, "fast_video_token") else tokenizer.fast_video_token
154
+ self.fast_start = "<|fast_start|>" if not hasattr(tokenizer, "fast_start") else tokenizer.fast_start
155
+ self.fast_end = "<|fast_end|>" if not hasattr(tokenizer, "fast_end") else tokenizer.fast_end
156
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
157
+
158
+ self.slowfast = True
159
+
160
+ def __call__(
161
+ self,
162
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
163
+ images: ImageInput = None,
164
+ videos: VideoInput = None,
165
+ **kwargs: Unpack[KeyeProcessorKwargs],
166
+ ) -> BatchFeature:
167
+ """
168
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
169
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
170
+ the text. To prepare the vision inputs, this method forwards the `images`/`videos` and `kwargs` arguments to
171
+ SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `vision_infos` is not `None`.
172
+
173
+ Args:
174
+ text (`str`, `List[str]`, `List[List[str]]`):
175
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
176
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
177
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
178
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
179
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
180
+ tensor. Both channels-first and channels-last formats are supported.
181
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
182
+ The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
183
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
184
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
185
+ If set, will return tensors of a particular framework. Acceptable values are:
186
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
187
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
188
+ - `'np'`: Return NumPy `np.ndarray` objects.
189
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
190
+
191
+ Returns:
192
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
193
+
194
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
195
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
196
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
197
+ `None`).
198
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
199
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
200
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
201
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
202
+ - **num_frames** -- Number of frames per video. Returned when `videos` is not `None`.
203
+ """
204
+ output_kwargs = self._merge_kwargs(
205
+ KeyeProcessorKwargs,
206
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
207
+ **kwargs,
208
+ )
209
+ if images is not None:
210
+ # slow_images = images
211
+ image_inputs = self.image_processor(images=images, return_tensors="pt")
212
+
213
+ image_grid_thw = image_inputs["image_grid_thw"]
214
+ else:
215
+ image_inputs = {}
216
+ image_grid_thw = None
217
+
218
+ num_frames = []
219
+ if videos is not None:
220
+ batch_slow_frames = []
221
+ batch_fast_frames = []
222
+
223
+ videos_kwargs = output_kwargs["videos_kwargs"]
224
+ num_videos = len(videos)
225
+ batch_frame_types = videos_kwargs.get("frame_types", [None] * num_videos)
226
+ batch_timestamps = videos_kwargs.get("timestamps", [None] * num_videos)
227
+ batch_width = videos_kwargs.get("width", [None] * num_videos)
228
+ batch_height = videos_kwargs.get("height", [None] * num_videos)
229
+ batch_fast_width = videos_kwargs.get("fast_width", [None] * num_videos)
230
+ batch_fast_height = videos_kwargs.get("fast_height", [None] * num_videos)
231
+
232
+ for index, frames in enumerate(videos):
233
+ if isinstance(frames, np.ndarray):
234
+ frames = torch.from_numpy(frames.copy())
235
+ nframes = frames.shape[0]
236
+ num_frames.append(nframes)
237
+ assert nframes > 0, "No frames in video"
238
+ if batch_frame_types[index] is None:
239
+ # default to all slow frames
240
+ batch_frame_types[index] = torch.Tensor([0] * nframes)
241
+ frame_types = batch_frame_types[index]
242
+ slow_frames, fast_frames = select_slow_fast_frames(frames, frame_types)
243
+ has_fast_frames = fast_frames.shape[0] > 0
244
+ # resize slow frames
245
+ resized_width = batch_width[index]
246
+ resized_height = batch_height[index]
247
+ if resized_width is not None and resized_height is not None:
248
+ slow_frames = nn.functional.interpolate(
249
+ slow_frames,
250
+ [resized_height, resized_width],
251
+ mode="bilinear",
252
+ antialias=True,
253
+ ).float()
254
+ # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
255
+ # slow_frames = list(slow_frames.split(1, dim=0))  # do not split here; it is done inside the model
256
+ slow_video_inputs = self.image_processor(
257
+ images=None, videos=[slow_frames], **output_kwargs["images_kwargs"])
258
+ slow_video_grid_thw = slow_video_inputs["video_grid_thw"]
259
+ batch_slow_frames.append(slow_video_inputs)
260
+ # # Number of tokens per frame for this video
261
+ # slow_frames_patch_nums[index] = int(slow_video_inputs["pixel_values_videos"].shape[0] / \
262
+ # slow_video_grid_thw.squeeze()[0])
263
+
264
+ if has_fast_frames:
265
+ # TODO: shrink fast_frames
266
+ fast_resized_width = batch_fast_width[index]
267
+ fast_resized_height = batch_fast_height[index]
268
+ if fast_resized_width is not None and fast_resized_height is not None:
269
+ fast_frames = nn.functional.interpolate(
270
+ fast_frames,
271
+ [fast_resized_height, fast_resized_width],
272
+ mode="bilinear",
273
+ antialias=True,
274
+ ).float()
275
+ # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
276
+ # fast_frames = list(fast_frames.split(1, dim=0))
277
+ fast_video_inputs = self.image_processor(
278
+ images=None, videos=[fast_frames], **output_kwargs["images_kwargs"])
279
+ fast_video_grid_thw = fast_video_inputs["video_grid_thw"]
280
+ batch_fast_frames.append(fast_video_inputs)
281
+ # # Total number of tokens for this video
282
+ # fast_frames_token_nums[index] = int(fast_video_inputs["pixel_values_videos"].shape[0] / \
283
+ # fast_video_grid_thw.squeeze()[0])
284
+
285
+ assert len(batch_slow_frames) > 0, "Slow frames should not be empty."
286
+ slow_pixel_values_videos_list = [
287
+ video["pixel_values_videos"] for video in batch_slow_frames if video is not None]
288
+ slow_video_grid_thw_list = [
289
+ video["video_grid_thw"] for video in batch_slow_frames if video is not None]
290
+
291
+ slow_pixel_values_videos = torch.concat(slow_pixel_values_videos_list, dim=0)
292
+ slow_video_grid_thw = torch.concat(slow_video_grid_thw_list, dim=0)
293
+
294
+ if len(batch_fast_frames) > 0:  # any video in the batch contributed fast frames, not just the last one
295
+ fast_pixel_values_videos_list = [
296
+ video["pixel_values_videos"] for video in batch_fast_frames \
297
+ if video is not None]
298
+ fast_video_grid_thw_list = [
299
+ video["video_grid_thw"] for video in batch_fast_frames \
300
+ if video is not None]
301
+
302
+ fast_pixel_values_videos = \
303
+ torch.concat(fast_pixel_values_videos_list, dim=0)
304
+ fast_video_grid_thw = \
305
+ torch.concat(fast_video_grid_thw_list, dim=0)
306
+ else:
307
+ fast_video_grid_thw = None
308
+ else:
309
+ slow_video_grid_thw = None
310
+ fast_video_grid_thw = None
311
+
312
+ if not isinstance(text, list):
313
+ text = [text]
314
+ if image_grid_thw is not None:
315
+ index = 0
316
+ for i in range(len(text)):
317
+ while self.image_token in text[i]:
318
+ # image_place_holder_tempale = "<|placeholder|>" * (
319
+ # image_grid_thw[index].prod() // self.image_processor.merge_size ** 2)
320
+ image_place_holder_tempale = ""
321
+ _, h_merged, w_merged = image_grid_thw[index]// self.image_processor.merge_size
322
+ for i_h in range(h_merged.item()):
323
+ image_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
324
+
325
+ text[i] = text[i].replace(
326
+ self.image_token,
327
+ image_place_holder_tempale,
328
+ 1,
329
+ )
330
+ index += 1
331
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
332
+ pixel_values_videos = []
333
+ video_grid_thw = []
334
+ videos_inputs = {}
335
+ if slow_video_grid_thw is not None:
336
+ slow_video_grid_thw = split_thw(slow_video_grid_thw)
337
+ if fast_video_grid_thw is not None:
338
+ fast_video_grid_thw = split_thw(fast_video_grid_thw)
339
+ index = 0
340
+ slow_index = 0
341
+ fast_index = 0
342
+ slow_pixels_index = 0
343
+ fast_pixels_index = 0
344
+ for i in range(len(text)):
345
+ while self.video_token in text[i]:
346
+ video_place_holder_tempale = ""
347
+
348
+ for j in range(batch_frame_types[index].shape[-1]):
349
+ if batch_timestamps[index] is not None:  # timestamps were provided
350
+ video_place_holder_tempale += self.frame_token + format(batch_timestamps[index][j], ".1f")
351
+ else:
352
+ video_place_holder_tempale += self.frame_token
353
+
354
+ # The current frame is a slow frame
355
+ if batch_frame_types[index][j] == 0:
356
+ num_patches = int(slow_video_grid_thw[slow_index].prod())
357
+
358
+ # video_place_holder_tempale += "<|placeholder|>" * (
359
+ # num_patches // self.image_processor.merge_size ** 2)
360
+
361
+ _, h_merged, w_merged = slow_video_grid_thw[slow_index]// self.image_processor.merge_size
362
+ for i_h in range(h_merged.item()):
363
+ video_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
364
+
365
+ pixel_values_videos.append(
366
+ slow_pixel_values_videos[slow_pixels_index:slow_pixels_index + num_patches])
367
+ slow_pixels_index = slow_pixels_index + num_patches
368
+ video_grid_thw.append(slow_video_grid_thw[slow_index].tolist())
369
+ slow_index += 1
370
+
371
+ # The current frame is a fast frame
372
+ elif batch_frame_types[index][j] == 1:
373
+ num_patches = int(fast_video_grid_thw[fast_index].prod())
374
+
375
+ # video_place_holder_tempale += self.fast_start + "<|placeholder|>" * (
376
+ # num_patches // self.image_processor.merge_size ** 2) + \
377
+ # self.fast_end
378
+
379
+ _, h_merged, w_merged = fast_video_grid_thw[fast_index] // self.image_processor.merge_size
380
+ video_place_holder_tempale += self.fast_start
381
+
382
+ for i_h in range(h_merged.item()):
383
+ video_place_holder_tempale += "<|placeholder|>" * w_merged + "<|mm_pos_start|>" + f"{i_h},{w_merged}" + "<|mm_pos_end|>"
384
+
385
+ video_place_holder_tempale += self.fast_end
386
+
387
+ pixel_values_videos.append(
388
+ fast_pixel_values_videos[fast_pixels_index:fast_pixels_index + num_patches])
389
+ fast_pixels_index = fast_pixels_index + num_patches
390
+ video_grid_thw.append(fast_video_grid_thw[fast_index].tolist())
391
+ fast_index += 1
392
+ text[i] = text[i].replace(
393
+ self.video_token,
394
+ video_place_holder_tempale,
395
+ 1,
396
+ )
397
+ index += 1
398
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
399
+
400
+ videos_inputs["pixel_values_videos"] = torch.cat(pixel_values_videos, dim=0)
401
+ videos_inputs["video_grid_thw"] = merge_hws(video_grid_thw)
402
+ videos_inputs["num_frames"] = torch.tensor(num_frames)
403
+
404
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
405
+
406
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
407
+
408
+ def batch_decode(self, *args, **kwargs):
409
+ """
410
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
411
+ refer to the docstring of this method for more information.
412
+ """
413
+ return self.tokenizer.batch_decode(*args, **kwargs)
414
+
415
+ def decode(self, *args, **kwargs):
416
+ """
417
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
418
+ the docstring of this method for more information.
419
+ """
420
+ return self.tokenizer.decode(*args, **kwargs)
421
+
422
+ def post_process_image_text_to_text(
423
+ self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
424
+ ):
425
+ """
426
+ Post-process the output of the model to decode the text.
427
+
428
+ Args:
429
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
430
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
431
+ or `(sequence_length,)`.
432
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
433
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
434
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
435
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
436
+ **kwargs:
437
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
438
+
439
+ Returns:
440
+ `List[str]`: The decoded text.
441
+ """
442
+ return self.tokenizer.batch_decode(
443
+ generated_outputs,
444
+ skip_special_tokens=skip_special_tokens,
445
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
446
+ **kwargs,
447
+ )
448
+
449
+ @property
450
+ def model_input_names(self):
451
+ tokenizer_input_names = self.tokenizer.model_input_names
452
+ image_processor_input_names = self.image_processor.model_input_names
453
+ names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
454
+ return names_from_processor
455
+
456
+
457
+ def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True, thinking='auto', **kwargs):
458
+ """
459
+ Convert a list of chat messages into the model input format, optionally inserting thinking markers.
460
+
461
+ Args:
462
+ messages (list): List of chat messages; each message is a dict containing "role" and "content".
463
+ tokenize (bool): Whether to tokenize the result. Defaults to True.
464
+ add_generation_prompt (bool): Whether to append a generation prompt. Defaults to True.
465
+ thinking (str or bool): Whether to enable thinking markers ('auto', True, or False). Defaults to 'auto'.
466
+
467
+ Returns:
468
+ If tokenize=True, the tokenized inputs; otherwise the formatted text string.
469
+ """
470
+ # Use the parent class's apply_chat_template to build the base prompt
471
+ formatted_text = super().apply_chat_template(
472
+ messages,
473
+ tokenize=False,
474
+ add_generation_prompt=add_generation_prompt,
475
+ thinking=thinking,
476
+ **kwargs
477
+ )
478
+
479
+ # if enable_thinking == 'auto':
480
+ # pass
481
+ # elif enable_thinking == True:
482
+ # formatted_text += "<think>"
483
+ # elif enable_thinking == False:
484
+ # formatted_text += f"</think>"
485
+ # else:
486
+ # raise RuntimeError(f"Bad enable_thinking={enable_thinking}")
487
+ # Return the tokenized inputs if requested; otherwise return the formatted text
488
+ # print(formatted_text)
489
+ return self.tokenizer(formatted_text, **kwargs) if tokenize else formatted_text
490
+
491
+
492
+
493
+ __all__ = ["KeyeProcessor"]
494
+
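Putting the pieces together, a minimal usage sketch of KeyeProcessor as defined above; the checkpoint path and the dummy image are placeholders, and `trust_remote_code=True` is assumed so the `auto_map` entries in the config files resolve the custom processor and image processor classes.

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained(
        "./keye-checkpoint",     # local path to this repository (placeholder)
        trust_remote_code=True,
    )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    image = Image.new("RGB", (448, 448), color=(200, 30, 30))
    inputs = processor(text=[prompt], images=[image], return_tensors="pt")
    # Expected keys: input_ids, attention_mask, pixel_values, image_grid_thw
    print(list(inputs.keys()))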
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_keye.KeyeProcessor"
4
+ },
5
+ "processor_class": "KeyeProcessor"
6
+ }
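For the slow/fast video path, a sketch of how the optional kwargs declared in KeyeVideosProcessorKwargs could be supplied, reusing the `processor` object from the sketch above; the frame count, sizes, timestamps, and frame types are made up for illustration and are not prescribed by this repository.

    import torch

    # Eight synthetic frames in (N, C, H, W) layout, pixel values in [0, 255].
    video = (torch.rand(8, 3, 336, 336) * 255).float()

    videos_kwargs = {
        "frame_types": [torch.tensor([0, 1, 0, 1, 0, 1, 0, 1])],     # slow = 0, fast = 1
        "timestamps": [torch.arange(8, dtype=torch.float32) / 2.0],  # seconds, one per frame
        "fast_height": [112],                                        # fast frames are shrunk before patching
        "fast_width": [112],
    }

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video"},
                {"type": "text", "text": "Summarize the clip."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], videos=[video], videos_kwargs=videos_kwargs, return_tensors="pt")
    # Expected keys include pixel_values_videos, video_grid_thw, and num_frames.
    print(list(inputs.keys()))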
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff