Spaces:
Running
Running
Commit
·
808e59a
1
Parent(s):
3b6e0d3
Add tokenizer generator
Browse files- Dockerfile +6 -1
- README.md +5 -5
- public-apps/tokenizer-generator.livemd +153 -0
- public-apps/welcome.livemd +0 -46
Dockerfile
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
FROM ghcr.io/livebook-dev/livebook:latest
|
| 2 |
|
| 3 |
ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
|
| 4 |
ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
|
|
@@ -9,7 +9,12 @@ ENV LIVEBOOK_DATA_PATH "/data"
|
|
| 9 |
ENV LIVEBOOK_PORT 7860
|
| 10 |
|
| 11 |
EXPOSE 7860
|
|
|
|
| 12 |
USER root
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
COPY public-apps/ /public-apps
|
| 14 |
RUN mkdir -p /data
|
| 15 |
RUN chmod 777 /data
|
|
|
|
| 1 |
+
FROM ghcr.io/livebook-dev/livebook:latest
|
| 2 |
|
| 3 |
ENV LIVEBOOK_APP_SERVICE_NAME "🐳 Hugging Face - $SPACE_TITLE"
|
| 4 |
ENV LIVEBOOK_APP_SERVICE_URL "https://huggingface.co/spaces/$SPACE_AUTHOR_NAME/$SPACE_REPO_NAME"
|
|
|
|
| 9 |
ENV LIVEBOOK_PORT 7860
|
| 10 |
|
| 11 |
EXPOSE 7860
|
| 12 |
+
|
| 13 |
USER root
|
| 14 |
+
|
| 15 |
+
RUN apt-get update && apt-get install -y python3 python3-pip python-is-python3
|
| 16 |
+
RUN pip --no-cache-dir install transformers sentencepiece protobuf
|
| 17 |
+
|
| 18 |
COPY public-apps/ /public-apps
|
| 19 |
RUN mkdir -p /data
|
| 20 |
RUN chmod 777 /data
|
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
fullWidth: true
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Bumblebee tools
|
| 3 |
+
emoji: 🐝
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: brown
|
| 6 |
sdk: docker
|
| 7 |
fullWidth: true
|
| 8 |
---
|
| 9 |
|
| 10 |
+
Tools for [elixir-nx/bumblebee](https://github.com/elixir-nx/bumblebee).
|
public-apps/tokenizer-generator.livemd
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- livebook:{"app_settings":{"access_type":"public","auto_shutdown_ms":5000,"multi_session":true,"output_type":"rich","show_source":true,"slug":"tokenizer-generator"}} -->
|
| 2 |
+
|
| 3 |
+
# Tokenizer generator
|
| 4 |
+
|
| 5 |
+
```elixir
|
| 6 |
+
Mix.install([
|
| 7 |
+
{:kino, "~> 0.10.0"},
|
| 8 |
+
{:req, "~> 0.4.3"}
|
| 9 |
+
])
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
## Info
|
| 13 |
+
|
| 14 |
+
```elixir
|
| 15 |
+
Kino.Markdown.new("""
|
| 16 |
+
## Background
|
| 17 |
+
|
| 18 |
+
HuggingFace repositories store tokenizers in two flavours:
|
| 19 |
+
|
| 20 |
+
1. "slow tokenizer" - corresponds to a tokenizer implemented in Python
|
| 21 |
+
and stored as `tokenizer_config.json`
|
| 22 |
+
|
| 23 |
+
2. "fast tokenizer" - corresponds to a tokenizer implemented in Rust
|
| 24 |
+
and stored as `tokenizer.json`
|
| 25 |
+
|
| 26 |
+
Many repositories only include files for 1., but the `transformers` library
|
| 27 |
+
automatically converts "slow tokenizer" to "fast tokenizer" whenever possible.
|
| 28 |
+
|
| 29 |
+
Bumblebee relies on the Rust bindings and therefore always requires the
|
| 30 |
+
`tokenizer.json` file. This app generates that file for any repository with the
|
| 31 |
+
"slow tokenizer".
|
| 32 |
+
""")
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Generator
|
| 36 |
+
|
| 37 |
+
```elixir
|
| 38 |
+
Kino.Markdown.new("## Converter")
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
```elixir
|
| 42 |
+
{version, 0} =
|
| 43 |
+
System.cmd("python", ["-c", "import transformers; print(transformers.__version__, end='')"])
|
| 44 |
+
|
| 45 |
+
Kino.Markdown.new("""
|
| 46 |
+
`transformers: #{version}`
|
| 47 |
+
""")
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
```elixir
|
| 51 |
+
repo_input = Kino.Input.text("HuggingFace repo")
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
```elixir
|
| 55 |
+
repo = Kino.Input.read(repo_input)
|
| 56 |
+
|
| 57 |
+
if repo == "" do
|
| 58 |
+
Kino.interrupt!(:normal, "Enter repository.")
|
| 59 |
+
end
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
```elixir
|
| 63 |
+
response =
|
| 64 |
+
Req.post!("https://huggingface.co/api/models/#{repo}/paths-info/main",
|
| 65 |
+
json: %{paths: ["tokenizer.json"]}
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
case response do
|
| 69 |
+
%{status: 200, body: []} ->
|
| 70 |
+
:ok
|
| 71 |
+
|
| 72 |
+
%{status: 200, body: [%{"path" => "tokenizer.json"}]} ->
|
| 73 |
+
Kino.interrupt!(:error, "The tokenizer.json file already exists in the given repository.")
|
| 74 |
+
|
| 75 |
+
_ ->
|
| 76 |
+
Kino.interrupt!(:error, "The repository does not exist or requires authentication.")
|
| 77 |
+
end
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
```elixir
|
| 81 |
+
output_dir = Path.join(System.tmp_dir!(), repo)
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
````elixir
|
| 85 |
+
script = """
|
| 86 |
+
import sys
|
| 87 |
+
from transformers import AutoTokenizer
|
| 88 |
+
|
| 89 |
+
repo = sys.argv[1]
|
| 90 |
+
output_dir = sys.argv[2]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
try:
|
| 94 |
+
tokenizer = AutoTokenizer.from_pretrained(repo)
|
| 95 |
+
assert tokenizer.is_fast
|
| 96 |
+
tokenizer.save_pretrained(output_dir)
|
| 97 |
+
except Exception as error:
|
| 98 |
+
print(error)
|
| 99 |
+
exit(1)
|
| 100 |
+
"""
|
| 101 |
+
|
| 102 |
+
case System.cmd("python", ["-c", script, repo, output_dir]) do
|
| 103 |
+
{_, 0} ->
|
| 104 |
+
:ok
|
| 105 |
+
|
| 106 |
+
{output, _} ->
|
| 107 |
+
Kino.Markdown.new("""
|
| 108 |
+
```
|
| 109 |
+
#{output}
|
| 110 |
+
```
|
| 111 |
+
""")
|
| 112 |
+
|> Kino.render()
|
| 113 |
+
|
| 114 |
+
Kino.interrupt!(:error, "Tokenizer conversion failed.")
|
| 115 |
+
end
|
| 116 |
+
````
|
| 117 |
+
|
| 118 |
+
```elixir
|
| 119 |
+
tokenizer_path = Path.join(output_dir, "tokenizer.json")
|
| 120 |
+
|
| 121 |
+
Kino.Download.new(
|
| 122 |
+
fn -> File.read!(tokenizer_path) end,
|
| 123 |
+
filename: "tokenizer.json",
|
| 124 |
+
label: "tokenizer.json"
|
| 125 |
+
)
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
`````elixir
|
| 129 |
+
Kino.Markdown.new("""
|
| 130 |
+
### Next steps
|
| 131 |
+
|
| 132 |
+
1. Go to https://huggingface.co/#{repo}/upload/main.
|
| 133 |
+
|
| 134 |
+
2. Upload the `tokenizer.json` file.
|
| 135 |
+
|
| 136 |
+
3. Add the following description:
|
| 137 |
+
|
| 138 |
+
````markdown
|
| 139 |
+
Generated with:
|
| 140 |
+
|
| 141 |
+
```python
|
| 142 |
+
from transformers import AutoTokenizer
|
| 143 |
+
|
| 144 |
+
tokenizer = AutoTokenizer.from_pretrained("#{repo}")
|
| 145 |
+
assert tokenizer.is_fast
|
| 146 |
+
tokenizer.save_pretrained("...")
|
| 147 |
+
```
|
| 148 |
+
````
|
| 149 |
+
|
| 150 |
+
4. Submit the PR.
|
| 151 |
+
|
| 152 |
+
""")
|
| 153 |
+
`````
|
public-apps/welcome.livemd
DELETED
|
@@ -1,46 +0,0 @@
|
|
| 1 |
-
<!-- livebook:{"app_settings":{"access_type":"public","slug":"welcome"}} -->
|
| 2 |
-
|
| 3 |
-
# Livebook <3 Hugging Face
|
| 4 |
-
|
| 5 |
-
```elixir
|
| 6 |
-
Mix.install([
|
| 7 |
-
{:kino, "~> 0.9"}
|
| 8 |
-
])
|
| 9 |
-
```
|
| 10 |
-
|
| 11 |
-
## Section
|
| 12 |
-
|
| 13 |
-
This is the source of a deployed notebook.
|
| 14 |
-
This notebook is static and simply renders the markdown content below.
|
| 15 |
-
|
| 16 |
-
```elixir
|
| 17 |
-
Kino.Markdown.new("""
|
| 18 |
-
Welcome to Livebook in Hugging Face!
|
| 19 |
-
|
| 20 |
-
This is a deployed notebook, which is also a perfect place to teach you
|
| 21 |
-
the ropes in using Livebook with Hugging Face.
|
| 22 |
-
|
| 23 |
-
## Getting started
|
| 24 |
-
|
| 25 |
-
First off, if you want to run your own copy of Livebook,
|
| 26 |
-
[check our tutorial](https://news.livebook.dev/livebook-inside-hugging-face-spaces-3LQaRi).
|
| 27 |
-
Once you clone the space, remember to set `LIVEBOOK_PASSWORD` as
|
| 28 |
-
an environment variable on your Space Settings page (a minimum of
|
| 29 |
-
12 digits is required).
|
| 30 |
-
|
| 31 |
-
If you are new to Elixir and Livebook, [head out to the Learn page](/learn)
|
| 32 |
-
(it requires a password), there you will find resources to get started
|
| 33 |
-
with both.
|
| 34 |
-
|
| 35 |
-
## Deploying notebooks
|
| 36 |
-
|
| 37 |
-
Livebook is fully collaborative and it enables you to deploy interactive
|
| 38 |
-
and collaborative apps just as well. All of your deployable notebooks will
|
| 39 |
-
be in the "public-apps" directory of your Spaces repository.
|
| 40 |
-
|
| 41 |
-
To deploy your own notebook on Hugging Face, you must click the
|
| 42 |
-
<i class="ri-livebook-deploy"></i> icon on the notebook sidebar, set a "Slug"
|
| 43 |
-
for the notebook, mark it as public and then drop its `.livemd` file into
|
| 44 |
-
the "public-apps" directory of your Spaces repo.
|
| 45 |
-
""")
|
| 46 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|