Upload model

- README.md +26 -60
- config.json +7 -3
- tf_model.h5 +3 -0

README.md
CHANGED

@@ -1,81 +1,47 @@
 ---
-widget:
-- src: http://images.cocodataset.org/val2017/000000039769.jpg
-  candidate_labels: 고양이, 강아지, 토끼
-  example_title: cat and remote
-language: ko
 license: mit
 ---

-Korean CLIP model, trained with the approach from [Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation](https://arxiv.org/abs/2004.09813).
-
-Training code: <https://github.com/Bing-su/KoCLIP_training_code>
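
The cited paper's setup uses a frozen teacher text encoder and trains a student so that its embeddings of an English sentence and of its Korean translation both match the teacher's embedding of the English sentence. A minimal sketch under those assumptions follows; the model names, toy sentence pair, and loss wiring are placeholders, not the linked training code.

```python
# Illustrative sketch of the distillation objective from the cited paper.
# A frozen English CLIP text encoder is the teacher; the student's embeddings of the
# English sentence and its Korean translation are pulled toward the teacher's embedding
# via an MSE loss. All names and data below are placeholder assumptions.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoProcessor, AutoTokenizer

teacher = AutoModel.from_pretrained("openai/clip-vit-base-patch32").eval()
student = AutoModel.from_pretrained("Bingsu/clip-vit-base-patch32-ko")  # stand-in for the model being trained
teacher_proc = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
student_tok = AutoTokenizer.from_pretrained("Bingsu/clip-vit-base-patch32-ko")

english = ["two cats lying on a pink sofa"]
korean = ["분홍색 소파에 누워 있는 고양이 두 마리"]

with torch.no_grad():
    # Teacher target: embedding of the English sentence only.
    target = teacher.get_text_features(
        **teacher_proc(text=english, return_tensors="pt", padding=True)
    )

# Student predictions for both the English sentence and its Korean translation.
pred = student.get_text_features(
    **student_tok(english + korean, return_tensors="pt", padding=True)
)

# Both student embeddings are trained toward the same teacher target.
loss = F.mse_loss(pred, target.repeat(2, 1))
loss.backward()
```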
-
-## How to Use
-
-```python
-import requests
-import torch
-from PIL import Image
-from transformers import AutoModel, AutoProcessor
-
-model = AutoModel.from_pretrained(repo)
-processor = AutoProcessor.from_pretrained(repo)
-
-image = Image.open(requests.get(url, stream=True).raw)
-inputs = processor(text=["고양이 두 마리", "개 두 마리"], images=image, return_tensors="pt", padding=True)
-with torch.inference_mode():
-    outputs = model(**inputs)
-logits_per_image = outputs.logits_per_image
-probs = logits_per_image.softmax(dim=1)
-```
-
-```
->>> probs
-tensor([[0.9926, 0.0074]])
-```
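
The removed example references `repo` and `url`, but the lines defining them did not survive this view. A self-contained sketch, assuming `repo` is this model's id and `url` is the widget image from the old front matter:

```python
# Minimal, self-contained version of the removed "How to Use" example.
# Assumptions: repo is this repository's id, url is the widget image from the old card.
import requests
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

repo = "Bingsu/clip-vit-base-patch32-ko"                        # assumed from the model name
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed from the widget src

model = AutoModel.from_pretrained(repo)
processor = AutoProcessor.from_pretrained(repo)

image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=["고양이 두 마리", "개 두 마리"], images=image,
                   return_tensors="pt", padding=True)

with torch.inference_mode():
    outputs = model(**inputs)

# Image-text similarity logits, normalized to probabilities over the candidate texts.
probs = outputs.logits_per_image.softmax(dim=1)
print(probs)  # e.g. tensor([[0.9926, 0.0074]]) per the card
```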

-```python
-from transformers import pipeline
-
-pipe = pipeline("zero-shot-image-classification", model=repo)
-```
-
-```
->>> result
-[{'score': 0.9456236958503723, 'label': '분홍색 소파에 드러누운 고양이 친구들'},
- {'score': 0.05315302312374115, 'label': '고양이 두 마리'},
- {'score': 0.0012233294546604156, 'label': '고양이 한 마리'}]
-```
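
The call that produced `result` is likewise missing from this view. A plausible invocation, with the image URL and candidate labels assumed from the widget metadata and the printed output, might look like:

```python
# Hypothetical invocation reconstructing the printed `result`.
# The URL and candidate labels are assumptions, not recovered lines from the original card.
from transformers import pipeline

repo = "Bingsu/clip-vit-base-patch32-ko"  # assumed model id
pipe = pipeline("zero-shot-image-classification", model=repo)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed image
result = pipe(
    url,
    candidate_labels=[
        "분홍색 소파에 드러누운 고양이 친구들",
        "고양이 두 마리",
        "고양이 한 마리",
    ],
)
print(result)  # list of {'score': ..., 'label': ...} dicts, highest score first
```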

-## Tokenizer
-
-The tokenizer was trained from the original CLIP tokenizer via `.train_new_from_iterator`, using a mix of Korean and English data in a 7:3 ratio.
-
-```python
-# text_embeds.shape = [batch_size, sequence_length, transformer.width]
-# take features from the eot embedding (eot_token is the highest number in each sequence)
-# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
-pooled_output = last_hidden_state[
-    torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
-]
-```
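
A minimal sketch of how such a tokenizer could be derived with `train_new_from_iterator`; the corpus variables, batching, and mixing logic are illustrative assumptions, not the linked training code.

```python
# Illustrative sketch only: train a new tokenizer from the original CLIP tokenizer
# on a 7:3 Korean:English text mixture. Corpus loading and batching are assumptions.
from transformers import AutoTokenizer

base = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

korean_texts = ["고양이 두 마리가 소파에 앉아 있다."]   # placeholder corpus
english_texts = ["Two cats are sitting on a sofa."]      # placeholder corpus

def corpus_iterator(batch_size=1000):
    # Yield batches drawn from the 7:3 Korean/English mixture.
    mixed = korean_texts * 7 + english_texts * 3
    for i in range(0, len(mixed), batch_size):
        yield mixed[i:i + batch_size]

# train_new_from_iterator keeps the original tokenizer's algorithm and special tokens
# but learns a new vocabulary from the provided text iterator.
new_tokenizer = base.train_new_from_iterator(corpus_iterator(), vocab_size=49408)
new_tokenizer.save_pretrained("clip-vit-base-patch32-ko-tokenizer")
```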

 ---
 license: mit
+tags:
+- generated_from_keras_callback
+model-index:
+- name: clip-vit-base-patch32-ko
+  results: []
 ---

+<!-- This model card has been generated automatically according to the information Keras had access to. You should
+probably proofread and complete it, then remove this comment. -->

+# clip-vit-base-patch32-ko

+This model is a fine-tuned version of [Bingsu/clip-vit-base-patch32-ko](https://huggingface.co/Bingsu/clip-vit-base-patch32-ko) on an unknown dataset.
+It achieves the following results on the evaluation set:

+## Model description

+More information needed

+## Intended uses & limitations

+More information needed

+## Training and evaluation data

+More information needed

+## Training procedure

+### Training hyperparameters

+The following hyperparameters were used during training:
+- optimizer: None
+- training_precision: float32

+### Training results

+### Framework versions

+- Transformers 4.23.1
+- TensorFlow 2.9.2
+- Tokenizers 0.13.1

config.json
CHANGED

@@ -1,5 +1,5 @@
 {
-  "_commit_hash":
+  "_commit_hash": "6f381bab5397bf31910ecd753491b53c84383811",
   "_name_or_path": "Bingsu/clip-vit-base-patch32-ko",
   "architectures": [
     "CLIPModel"
@@ -14,6 +14,7 @@
     "architectures": null,
     "attention_dropout": 0.0,
     "bad_words_ids": null,
+    "begin_suppress_tokens": null,
     "bos_token_id": 0,
     "chunk_size_feed_forward": 0,
     "cross_attention_hidden_size": null,
@@ -67,6 +68,7 @@
     "return_dict": true,
     "return_dict_in_generate": false,
     "sep_token_id": null,
+    "suppress_tokens": null,
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
@@ -77,7 +79,7 @@
     "top_p": 1.0,
     "torch_dtype": null,
     "torchscript": false,
-    "transformers_version": "4.
+    "transformers_version": "4.23.1",
     "typical_p": 1.0,
     "use_bfloat16": false,
     "vocab_size": 49408
@@ -91,6 +93,7 @@
     "architectures": null,
     "attention_dropout": 0.0,
     "bad_words_ids": null,
+    "begin_suppress_tokens": null,
     "bos_token_id": null,
     "chunk_size_feed_forward": 0,
     "cross_attention_hidden_size": null,
@@ -146,6 +149,7 @@
     "return_dict": true,
     "return_dict_in_generate": false,
     "sep_token_id": null,
+    "suppress_tokens": null,
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
@@ -156,7 +160,7 @@
     "top_p": 1.0,
     "torch_dtype": null,
     "torchscript": false,
-    "transformers_version": "4.
+    "transformers_version": "4.23.1",
     "typical_p": 1.0,
     "use_bfloat16": false
   },
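
The added `begin_suppress_tokens` and `suppress_tokens` keys, together with the bumped `transformers_version`, appear to come from re-saving the config under a newer transformers release rather than from a semantic change. A minimal way to reproduce that kind of re-serialization (output path is an arbitrary assumption):

```python
# Sketch: re-serializing a config with a newer transformers release writes out any
# newly introduced generation-related keys (e.g. suppress_tokens) with default values.
from transformers import CLIPConfig

config = CLIPConfig.from_pretrained("Bingsu/clip-vit-base-patch32-ko")
config.save_pretrained("./clip-vit-base-patch32-ko")  # writes an updated config.json
```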

tf_model.h5
ADDED

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea376ac0b923856e999412382f09b8aab4401a99d6ceabd2cba7ac2d1b75ddd1
+size 605559544
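
Since this commit's main payload is the TensorFlow weight file, a brief sketch of loading it; the class choice follows the `CLIPModel` architecture declared in config.json and is an assumption, not an official usage snippet from the repository.

```python
# Sketch: load the newly uploaded TensorFlow weights (tf_model.h5).
# TFCLIPModel mirrors the "CLIPModel" architecture listed in config.json.
from transformers import TFCLIPModel, AutoProcessor

model = TFCLIPModel.from_pretrained("Bingsu/clip-vit-base-patch32-ko")
processor = AutoProcessor.from_pretrained("Bingsu/clip-vit-base-patch32-ko")
```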