Jellyfish042 committed
Commit 14e0ea5 · 1 Parent(s): eb54354

brand new version

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. __pycache__/data_manager.cpython-311.pyc +0 -0
  2. __pycache__/title.cpython-311.pyc +0 -0
  3. about.md +0 -10
  4. app.py +196 -258
  5. data/2024-10/7b.xlsx +0 -0
  6. data/2024-10/xb.xlsx +0 -0
  7. data/2025-12/2025-12-21_11-34-39.json +24 -0
  8. data/2025-12/2025-12-21_11-35-15.json +24 -0
  9. data/2025-12/2025-12-21_11-36-04.json +24 -0
  10. data/2025-12/2025-12-21_11-36-44.json +24 -0
  11. data/2025-12/2025-12-21_11-37-00.json +24 -0
  12. data/2025-12/2025-12-21_11-37-31.json +24 -0
  13. data/2025-12/2025-12-21_11-37-59.json +24 -0
  14. data/2025-12/2025-12-21_11-38-27.json +24 -0
  15. data/2025-12/2025-12-21_11-38-57.json +24 -0
  16. data/2025-12/2025-12-21_11-39-11.json +24 -0
  17. data/2025-12/2025-12-21_11-39-42.json +26 -0
  18. data/2025-12/2025-12-21_11-40-01.json +26 -0
  19. data/2025-12/2025-12-21_11-40-26.json +26 -0
  20. data/2025-12/2025-12-21_11-40-48.json +26 -0
  21. data/2025-12/2025-12-21_11-41-02.json +26 -0
  22. data/2025-12/2025-12-21_11-41-20.json +26 -0
  23. data/2025-12/2025-12-21_11-41-38.json +26 -0
  24. data/2025-12/2025-12-21_11-41-55.json +26 -0
  25. data/2025-12/2025-12-21_11-42-12.json +26 -0
  26. data/2025-12/2025-12-21_11-42-26.json +26 -0
  27. data/2025-12/2025-12-21_11-42-49.json +26 -0
  28. data/2025-12/2025-12-21_11-43-05.json +26 -0
  29. data/2025-12/2025-12-21_11-43-28.json +26 -0
  30. data/2025-12/2025-12-21_11-43-47.json +26 -0
  31. data/2025-12/2025-12-21_11-43-58.json +26 -0
  32. data/2025-12/2025-12-21_11-44-14.json +26 -0
  33. data/2025-12/2025-12-21_11-44-30.json +26 -0
  34. data/2025-12/2025-12-21_11-44-45.json +26 -0
  35. data/2025-12/2025-12-21_11-45-01.json +26 -0
  36. data/2025-12/2025-12-21_11-45-11.json +26 -0
  37. data/2025-12/2025-12-21_11-45-38.json +26 -0
  38. data/2025-12/2025-12-21_11-45-55.json +26 -0
  39. data/2025-12/2025-12-21_11-46-17.json +26 -0
  40. data/2025-12/2025-12-21_11-46-35.json +26 -0
  41. data/2025-12/2025-12-21_11-46-50.json +26 -0
  42. data/2025-12/2025-12-21_11-47-06.json +26 -0
  43. data/2025-12/2025-12-21_11-47-21.json +26 -0
  44. data/2025-12/2025-12-21_11-47-36.json +26 -0
  45. data/2025-12/2025-12-21_11-47-52.json +26 -0
  46. data/2025-12/2025-12-21_11-48-04.json +26 -0
  47. data/2025-12/2025-12-21_11-48-25.json +26 -0
  48. data/2025-12/2025-12-21_11-48-37.json +26 -0
  49. data/2025-12/2025-12-21_11-48-52.json +26 -0
  50. data/2025-12/2025-12-21_11-49-05.json +26 -0
__pycache__/data_manager.cpython-311.pyc ADDED
Binary file (14.8 kB)

__pycache__/title.cpython-311.pyc ADDED
Binary file (873 Bytes)

about.md CHANGED
@@ -24,13 +24,3 @@ Therefore, the compression rate of a model can be directly calculated through th

  ### Can Models Using Different Tokenizers Be Directly Compared?
  Yes. When calculating the sum of negative log probabilities, we essentially treat the model + tokenizer as a single entity or system. As long as this system has a high probability of generating real text, we consider it better. From the perspective of compression, you can choose any tokenizer. From the compression rate perspective, we don't care; we only care about whether your system can compress the text more effectively.
-
- ### Is It Really Uncheatable? Can't I train my model on a large number of arXiv papers to improve its test performance on arXiv papers?
- Uncheatable Eval's data sources currently include new arXiv papers, new GitHub projects, BBC news, AO3 fanfictions, and new Wikipedia entries, with more sources to be added in the future. If you genuinely achieve excellent results across these data by training extensively on these sources, I would consider you to have developed a genuinely good language model rather than cheating.
-
- From my test results, accurately modeling these data is very challenging. I believe Uncheatable Eval more accurately reflects the value of every bit of data and computing you invest compared to other benchmarks. Models trained with more data and computing are almost always better, and there are no shortcuts. This is a key strength of Uncheatable Eval.
-
- ### Is This Too "Random"? Why Consider Random Texts from the Internet as Ground Truth?
- This is why we choose rigorous and verified texts such as arXiv papers and news reports, which typically have better quality. Additionally, a round of Uncheatable Eval evaluates a model over millions of tokens, increasing the reliability of the results.
-
- In fact, the model rankings obtained through Uncheatable Eval are very stable. For instance, the model ranked first in January's data is highly likely to remain first in February, March, April, May, and June, indicating that the data obtained through this method is sufficiently representative.
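(The about.md text above scores a model by the summed negative log probability it assigns to fresh text. As a minimal sketch of how that sum relates to the "compression_rate" values stored in the result JSONs added under data/2025-12/ in this commit, assuming the logged sum is in nats, the arithmetic reproduces the stored field:)

```python
import math

# Values copied from data/2025-12/2025-12-21_11-34-39.json (added in this commit).
neg_log_prob_sum = 4649.08   # summed negative log-probability, assumed to be in nats
avg_bytes = 8012.242         # average UTF-8 byte count of the evaluated samples

bits = neg_log_prob_sum / math.log(2)                 # convert nats to bits
compression_rate = 100.0 * bits / (8.0 * avg_bytes)   # compressed bits vs. raw bits
print(f"{compression_rate:.3f} %")                    # ~10.464, matching the stored "compression_rate"
```

(Dividing the same bit count by the "avg character count" field instead of 8 × avg bytes would give bits per character, the leaderboard's BPC metric.)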
 
app.py CHANGED
@@ -1,36 +1,22 @@
+ from operator import is_
  import pandas as pd
  import gradio as gr
  import os
- import re
  import requests
  from dotenv import load_dotenv
  from matplotlib.colors import LinearSegmentedColormap
- import plotly.express as px
  import plotly.graph_objects as go
- # from sklearn.linear_model import LinearRegression
  import numpy as np
  from huggingface_hub import HfApi
  from huggingface_hub.hf_api import HTTPError
  from huggingface_hub.utils import GatedRepoError
  from gradio_rangeslider import RangeSlider
  import datetime
- from gradio.themes.utils.colors import slate
+ from title import css, TITLE_HTML, SUBTITLE_HTML
+ from data_manager import DataManager

  load_dotenv()
  webhook_url = os.environ.get("WEBHOOK_URL")
- file_name_list = [
-     "14b",
-     "9b",
-     "7b",
-     "3b",
-     "1b5",
-     "other",
- ]
- sheet_name_list = [
-     "cr",
-     "bpc",
-     "bpb",
- ]
  metric_list = [
      "Compression Rate (%)",
      "Bits Per Character (BPC)",
@@ -58,92 +44,54 @@ model_size_to_file_name = {
      "Other": "other",
  }

+
  def read_about_md():
-     with open('about.md', 'r', encoding='utf-8') as f:
+     with open("about.md", "r", encoding="utf-8") as f:
          return f.read()

- def rename_columns(df):
-     df.columns = [col.rsplit("_", maxsplit=1)[0] for col in df.columns]
-     return df
-
- def get_folders_matching_format(directory):
-     pattern = re.compile(r"^\d{4}-\d{2}$")
-     folders = []
-     if not os.path.exists(directory):
-         return folders
-     for item in os.listdir(directory):
-         full_path = os.path.join(directory, item)
-         if os.path.isdir(full_path) and pattern.match(item):
-             folders.append(full_path)
-     return folders
-
- def get_unique_column_names(data=None):
-     return [
-         "ao3_\u200benglish",
-         "bbc_\u200bnews",
-         "wikipedia_\u200benglish",
-         "arxiv_\u200bcomputer_\u200bscience",
-         "arxiv_\u200bphysics",
-         "github_\u200bcpp",
-         "github_\u200bpython",
-     ]
-
- def color_cell(value):
-     return "background-color: #fffdd0" if pd.notna(value) else "default"
-
- # def color_cell_themed(value):
- #     return "background-color: rgba(255, 253, 208, 1.0)" if pd.notna(value) else "default"
-
- # --- Key change 1: modify the update_table function ---
- # Add a request: gr.Request = None parameter to receive the theme-mode information
- # The default of None handles the initial load
- def update_table(period: str, models_size: list, metric: str, visible_columns: list, color_columns: list, size_range: list, midpoint: float = 0.5, sort_by: str = "Average (lower=better)", ascending: bool = True, request: gr.Request = None):
-     # Log the call and check the current mode
+
+ def update_table(
+     data_manager: DataManager,
+     period: str,
+     models_size: list,
+     metric: str,
+     visible_columns: list,
+     color_columns: list,
+     size_range: list,
+     midpoint: float = 0.5,
+     ascending: bool = True,
+     request: gr.Request = None,
+ ):
      is_dark_mode = request.is_dark if request else False
-     print(f"Updating - time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, period: {period}, models: {models_size}, metric: {metric}, visible_columns: {visible_columns}, color_columns: {color_columns}, size_range: {size_range}, sort_by: {sort_by}, ascending: {ascending}, is_dark: {is_dark_mode}\n")
+     print(
+         f"Updating - time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, period: {period}, models: {models_size}, metric: {metric}, visible_columns: {visible_columns}, color_columns: {color_columns}, size_range: {size_range}, ascending: {ascending}, is_dark: {is_dark_mode}\n"
+     )

-     if not models_size:
-         return "No data available for the selected models and period."
-
-     target_period_data = all_data[period]
      target_file_name = [model_size_to_file_name[model] for model in models_size]
-     sheet_name = metric_to_sheet[metric]
-     combined_data = pd.concat([df.dropna(axis=1, how="all") for df in [target_period_data[file_name][sheet_name] for file_name in target_file_name]], axis=0)
-
-     if len(combined_data) == 0:
-         return "No data available for the selected models and period."
-
-     combined_data = combined_data[combined_data["Parameters Count (B)"].between(size_range[0], size_range[1])]
-     combined_data.reset_index(drop=True, inplace=True)
-
-     if len(combined_data) == 0:
+     metric_code = metric_to_sheet[metric]
+
+     # Filter out column names that are not available in the current period, to avoid errors
+     if visible_columns:
+         available_columns = data_manager.get_available_columns(period)
+         visible_columns = [col for col in visible_columns if col in available_columns]
+
+     filtered_data = data_manager.query(
+         period=period,
+         metric_code=metric_code,
+         param_range=(size_range[0], size_range[1]),
+         model_groups=target_file_name,
+         visible_columns=visible_columns,
+     )
+
+     if len(filtered_data) == 0:
          return "No data available for the selected models and period."
-
-     combined_data["Name"] = combined_data["Name"].apply(lambda x: x.replace(".pth", ""))
-     ordered_columns = get_unique_column_names()
-     relevant_columns = [col for col in ordered_columns if col in visible_columns and col not in ["Name", "Parameters Count (B)", "Average (The lower the better)"]]
-
-     if len(combined_data) > 0 and relevant_columns:
-         combined_data["Average (The lower the better)"] = round(combined_data[relevant_columns].mean(axis=1), 3)
-
-     combined_data = combined_data.rename(columns={"Parameters Count (B)": "Params (B)", "Average (The lower the better)": "Average (lower=better)"})
-     sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
-     visible_columns_final = ["Name", "Params (B)", "Average (lower=better)"] + relevant_columns
-     filtered_data = sorted_data[visible_columns_final]
-     filtered_data.columns = [col.replace("_", " ") for col in filtered_data.columns]
-     formatter = {col: "{:.3f}" for col in filtered_data.columns if filtered_data[col].dtype in ["float64", "float32"]}
-
-     # --- Key change 2: pick a different color scheme depending on the theme mode ---
-     if is_dark_mode:
-         # Dark-mode palette (green -> dark gray -> red)
-         colors = ["#2ca02c", "#2b2b2b", "#d62728"]
-     else:
-         # Light-mode palette (green -> white -> red)
-         colors = ["#63be7b", "#ffffff", "#f8696b"]
-
+
+     colors = ["#2ca02c", "#2b2b2b", "#d62728"] if is_dark_mode else ["#63be7b", "#ffffff", "#f8696b"]
+
      vmin, vmax, vmid = {}, {}, {}
      for column in filtered_data.columns:
-         if column in ["Name", "Params (B)"]: continue
+         if column in ["Name", "Params (B)"]:
+             continue
          col_values = filtered_data[column].dropna()
          if len(col_values) > 1:
              sorted_values = np.sort(col_values)
@@ -152,93 +100,84 @@ def update_table(period: str, models_size: list, metric: str, visible_columns: l
              idx = int(len(sorted_values) * midpoint)
              vmid[column] = sorted_values[idx]

-     # --- Key change 3: modify the style function to force a fixed black font color ---
      def custom_background_gradient(series, cmap, vmin_val, vmax_val, vmid_val):
-         if len(series) == 0: return series
+         if len(series) == 0:
+             return series
+
          def normalize(x):
-             if pd.isna(x): return 0.5  # Neutral for NaN
-             if vmid_val == vmin_val and x <= vmid_val: return 0.0
-             if vmid_val == vmax_val and x >= vmid_val: return 1.0
-             if vmid_val == vmin_val or vmid_val == vmax_val: return 0.5
+             if pd.isna(x):
+                 return 0.5  # Neutral for NaN
+             if vmid_val == vmin_val and x <= vmid_val:
+                 return 0.0
+             if vmid_val == vmax_val and x >= vmid_val:
+                 return 1.0
+             if vmid_val == vmin_val or vmid_val == vmax_val:
+                 return 0.5
              if x <= vmid_val:
                  return 0.5 * (x - vmin_val) / (vmid_val - vmin_val)
              else:
                  return 0.5 + 0.5 * (x - vmid_val) / (vmax_val - vmid_val)
+
          normed = series.apply(normalize)
          cmap_colors = [cmap(x) for x in normed]
-         # Set the font color alongside background-color in the returned CSS
-         return [
-             "background-color: rgba({}, {}, {}, {}); color: black;".format(*[int(255 * c) for c in color[:3]], color[3])
-             for color in cmap_colors
-         ]
+         return ["background-color: rgba({}, {}, {}, {}); color: black;".format(*[int(255 * c) for c in color[:3]], color[3]) for color in cmap_colors]

      target_color_columns = []
-     if "Average" in color_columns: target_color_columns.append("Average (lower=better)")
-     if "Individual Tests" in color_columns: target_color_columns.extend([col for col in filtered_data.columns if col not in ["Name", "Params (B)", "Average (lower=better)"]])
-
+     if "Average" in color_columns:
+         target_color_columns.append("Average (lower=better)")
+     if "Individual Tests" in color_columns:
+         target_color_columns.extend([col for col in filtered_data.columns if col not in ["Name", "Params (B)", "Average (lower=better)"]])
+
      def color_params_column_dynamic(value):
          if not pd.notna(value):
              return "default"
-
-         # 2. Return a different color depending on is_dark_mode
+
          if is_dark_mode:
-             # A soft, muted dark gold for dark mode
-             # The font color is also light to keep enough contrast
              return "background-color: #4b4936; color: #f0f0f0;"
          else:
-             # A bright cream background with a black font for light mode
              return "background-color: #fffdd0; color: black;"
-
-     styler = filtered_data.style.format(formatter).map(color_params_column_dynamic, subset=["Params (B)"])
+
+     formatter = {col: "{:.3f}" for col in filtered_data.columns if filtered_data[col].dtype in ["float64", "float32"]}
+     styler = filtered_data.style.format(formatter)
+     styler = styler.map(color_params_column_dynamic, subset=["Params (B)"])
      for column in target_color_columns:
          if column in vmin:
              custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", colors)
-             styler = styler.apply(custom_background_gradient, cmap=custom_cmap, vmin_val=vmin[column], vmax_val=vmax[column], vmid_val=vmid[column], subset=[column])
-
+             styler = styler.apply(
+                 custom_background_gradient, cmap=custom_cmap, vmin_val=vmin[column], vmax_val=vmax[column], vmid_val=vmid[column], subset=[column]
+             )
+
      styler = styler.hide(axis="index")
-     widths = [300, 150, 150, 100, 100, 100, 100, 100, 100, 100, 100]
-
+     widths = [250, 80, 80, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70, 70]
+
      table_styles = []
-     table_styles.append({"selector": "th", "props": [("background-color", "var(--background-fill-secondary)"), ("color", "var(--body-text-color)"), ("padding", "8px"), ("font-weight", "bold")]})
+     table_styles.append(
+         {
+             "selector": "th",
+             "props": [
+                 ("background-color", "var(--background-fill-secondary)"),
+                 ("color", "var(--body-text-color)"),
+                 ("padding", "8px"),
+                 ("font-weight", "bold"),
+             ],
+         }
+     )
      table_styles.append({"selector": "table", "props": [("border-collapse", "collapse"), ("border", f"1px solid var(--border-color-primary)")]})
      for i, w in enumerate(widths):
-         table_styles.append({"selector": f"th.col{i}, td.col{i}", "props": [("min-width", f"{w}px"), ("max-width", f"{w}px"), ("text-align", "center"), ("border", f"1px solid var(--border-color-primary)")]})
+         table_styles.append(
+             {
+                 "selector": f"th.col{i}, td.col{i}",
+                 "props": [
+                     ("min-width", f"{w}px"),
+                     ("max-width", f"{w}px"),
+                     ("text-align", "center"),
+                     ("border", f"1px solid var(--border-color-primary)"),
+                 ],
+             }
+         )
      styler = styler.set_table_styles(table_styles)
      return styler.to_html()

- def create_world_languages_gdp_chart():
-     languages = ["English", "Chinese", "Spanish", "Japanese", "German", "French", "Arabic", "Italian", "Portuguese", "Korean", "Other"]
-     shares = [27, 18, 8, 6, 5, 4, 3, 2, 2, 2, 23]
-     colors = ["#FF7F7F", "#FFA07A", "#FFDB58", "#90EE90", "#98FB98", "#87CEFA", "#B0C4DE", "#DDA0DD", "#D8BFD8", "#F0E68C", "#E0FFFF"]
-     fig = go.Figure(
-         data=[
-             go.Pie(
-                 labels=languages,
-                 values=shares,
-                 hole=0.3,
-                 marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
-                 textinfo="label+percent",
-                 textposition="outside",
-                 insidetextorientation="radial",
-                 textfont=dict(size=12),
-             )
-         ]
-     )
-     fig.update_layout(
-         title={
-             "text": "World Languages by Share of Global GDP",
-             "y": 0.95,
-             "x": 0.5,
-             "xanchor": "center",
-             "yanchor": "top",
-             "font": dict(size=20, color="black"),
-         },
-         showlegend=False,
-         width=700,
-         height=500,
-         margin=dict(t=80, b=20, l=20, r=20),
-     )
-     return fig

  def check_model_exists(model_id):
      api = HfApi()
@@ -253,6 +192,7 @@ def check_model_exists(model_id):
      else:
          return "Error: " + str(e)

+
  def submit_model(name):
      if "Exists" not in check_model_exists(name):
          return f"# ERROR: Model {name} does not exist on Hugging Face!"
@@ -271,14 +211,24 @@ def submit_model(name):
      except Exception as e:
          print(e)
          return "ERROR: Unexpected error. Please try again later."
- def create_scaling_plot(all_data, period):
-     selected_columns = ["Name", "Parameters Count (B)", "Average (The lower the better)"]
-     target_data = all_data[period]
-     new_df = pd.DataFrame()
-     for size in target_data.keys():
-         new_df = pd.concat([new_df, target_data[size]["cr"].loc[:, selected_columns].dropna(axis=1, how="all")], axis=0)
-     x_values = new_df["Parameters Count (B)"].astype(float).tolist()
-     y_values = new_df["Average (The lower the better)"].astype(float).tolist()
+
+
+ def create_scaling_plot(data_manager: DataManager, period: str):
+     new_df = data_manager.query(
+         period=period,
+         metric_code="cr",
+         param_range=(0, 40),
+         model_groups=None,
+         visible_columns=None,
+     )
+
+     if len(new_df) == 0:
+         fig = go.Figure()
+         fig.update_layout(title={"text": "Compression Rate Scaling Law", "x": 0.5}, width=800, height=600)
+         return fig
+
+     x_values = new_df["Params (B)"].astype(float).tolist()
+     y_values = new_df["Average (lower=better)"].astype(float).tolist()
      names = new_df["Name"].tolist()
      x_min, x_max = np.log10(min(x_values)), np.log10(max(x_values))
      y_min, y_max = np.log10(min(y_values)), np.log10(max(y_values))
@@ -326,100 +276,88 @@ def create_scaling_plot(all_data, period):
      )
      return fig

- def read_all_data(folder_name):
-     all_data = {}
-     time_list = []
-     for folder in get_folders_matching_format(folder_name):
-         folder_name = os.path.basename(folder)
-         time_list.append(folder_name)
-         if all_data.get(folder) is None:
-             all_data[folder_name] = {}
-         for file_name in file_name_list:
-             if all_data.get(file_name) is None:
-                 all_data[folder_name][file_name] = {}
-             for sheet_name in sheet_name_list:
-                 final_file_name = os.path.join(folder, file_name)
-                 all_data[folder_name][file_name][sheet_name] = rename_columns(pd.read_excel(final_file_name + ".xlsx", sheet_name=sheet_name))
-     return all_data, time_list
-
- all_data, time_list = read_all_data("data")
- time_list.sort()
- last_period = time_list[-1]
- initial_fig = create_scaling_plot(all_data, last_period)
- initial_metric = metric_list[0]
- initial_columns = get_unique_column_names(all_data)
- initial_colors = ["Average", "Individual Tests"]
- initial_size_range = [0, 40]
- # On the initial call to update_table, the request parameter is the default None
- initial_data = update_table(last_period, model_size_list, initial_metric, initial_columns, initial_colors, initial_size_range)
- css = """
- .gradio-container {
-     max-width: 95% !important;
-     margin: 0 auto;
- }
- .tab-buttons button {
-     font-size: 1.3em;
- }
- .gr-dataframe th {
-     white-space: normal;
-     word-break: break-word;
- }
- table {
-     margin-left: auto !important;
-     margin-right: auto !important;
-     width: 100% !important;
- }
- """
- TITLE_HTML = '<h1 style="text-align:center"><span style="font-size:1.3em">🏆 LLM Compression Leaderboard</span></h1>'
- SUBTITLE_HTML = "<h1 style='text-align:center'><span style='font-size:0.8em'>Welcome to Uncheatable Eval LLM Compression Leaderboard, where fancy fine-tuning and cheating won't work 🚫; only compute 💻, data 📊, and real innovation 🔥 can prevail!</span></h1>"
- # theme = gr.themes.Default(primary_hue=slate, secondary_hue=slate)
- theme = gr.themes.Default()
- with gr.Blocks(theme=theme, css=css) as demo:
-     gr.HTML(TITLE_HTML)
-     gr.HTML(SUBTITLE_HTML)
-     with gr.Tabs() as tabs:
-         with gr.Tab("🏆 Leaderboard"):
-             with gr.Row():
-                 with gr.Column():
-                     period_selector = gr.Dropdown(label="Period", choices=time_list, value=last_period)
-                     model_selector = gr.CheckboxGroup(label="Model Size", choices=model_size_list, value=model_size_list)
-                     size_range_slider = RangeSlider(minimum=0, maximum=40, value=[0, 40], step=0.1, label="Model Size Range")
-                     metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=initial_metric)
-                 with gr.Column():
-                     midpoint_slider = gr.Slider(minimum=0.1, maximum=0.9, value=0.5, step=0.01, label="Color Gradient Midpoint")
-                     color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
-                     colfilter = gr.CheckboxGroup(label="Data Source", choices=get_unique_column_names(all_data), value=initial_columns)
-             table = gr.HTML(initial_data)
-
-             # --- Key change 4: update all .change() events, adding gr.Request() ---
-             # Define a shared input list to avoid repetition
-             shared_inputs = [period_selector, model_selector, metric_selector, colfilter, color_selector, size_range_slider, midpoint_slider]
-
-             period_selector.change(update_table, inputs=shared_inputs, outputs=table)
-             model_selector.change(update_table, inputs=shared_inputs, outputs=table)
-             metric_selector.change(update_table, inputs=shared_inputs, outputs=table)
-             colfilter.change(update_table, inputs=shared_inputs, outputs=table)
-             color_selector.change(update_table, inputs=shared_inputs, outputs=table)
-             size_range_slider.change(update_table, inputs=shared_inputs, outputs=table)
-             midpoint_slider.change(update_table, inputs=shared_inputs, outputs=table)
-
-         with gr.Tab("🌍 MultiLang"):
-             gr.Markdown("## Coming soon...")
-             # world_languages_plot = gr.Plot(create_world_languages_gdp_chart())
-         with gr.Tab("📈 Scaling Law"):
-             period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=last_period)
-             def update_plot(period):
-                 new_fig = create_scaling_plot(all_data, period)
-                 return new_fig
-             plot = gr.Plot(initial_fig)
-             period_selector_2.change(update_plot, inputs=period_selector_2, outputs=plot)
-         with gr.Tab("ℹ️ About"):
-             gr.Markdown(read_about_md())
-         with gr.Tab("🚀 Submit"):
-             with gr.Group():
+
+ if __name__ == "__main__":
+     data_manager = DataManager("data")
+     time_list = data_manager.get_available_periods()
+     last_period = time_list[-1]
+
+     initial_fig = create_scaling_plot(data_manager, last_period) if last_period else go.Figure()
+     initial_metric = metric_list[0]
+     initial_columns = data_manager.get_available_columns(last_period)
+     initial_colors = ["Average", "Individual Tests"]
+     initial_size_range = [0, 40]
+     initial_data = update_table(data_manager, last_period, model_size_list, initial_metric, initial_columns, initial_colors, initial_size_range)
+
+     theme = gr.themes.Default()
+     with gr.Blocks(theme=theme, css=css) as demo:
+         gr.HTML(TITLE_HTML)
+         gr.HTML(SUBTITLE_HTML)
+         with gr.Tabs() as tabs:
+             with gr.Tab("🏆 Leaderboard"):
                  with gr.Row():
-                 model_name = gr.Textbox(max_lines=1, placeholder="Enter model name...", show_label=False, scale=4)
-                 submit = gr.Button("Submit", variant="primary", scale=0)
-             output = gr.Markdown("# Enter a public HF repo id, then hit Submit to add it to the evaluation queue.")
-             submit.click(fn=submit_model, inputs=model_name, outputs=output)
- demo.launch(share=False)
+                     with gr.Column():
+                         period_selector = gr.Dropdown(label="Period", choices=time_list, value=last_period)
+                         model_selector = gr.CheckboxGroup(label="Model Size", choices=model_size_list, value=model_size_list)
+                         size_range_slider = RangeSlider(minimum=0, maximum=40, value=[0, 40], step=0.1, label="Model Size Range")
+                         metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=initial_metric)
+                     with gr.Column():
+                         midpoint_slider = gr.Slider(minimum=0.1, maximum=0.9, value=0.5, step=0.01, label="Color Gradient Midpoint")
+                         color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
+                         colfilter = gr.CheckboxGroup(label="Data Source", choices=initial_columns, value=initial_columns)
+                 table = gr.HTML(initial_data)
+
+                 def update_table_wrapper(period, models_size, metric, visible_columns, color_columns, size_range, midpoint):
+                     return update_table(data_manager, period, models_size, metric, visible_columns, color_columns, size_range, midpoint)
+
+                 def update_column_choices(period, current_selected):
+                     if not period:
+                         return gr.update(choices=[], value=[])
+                     columns = data_manager.get_available_columns(period)
+                     # Keep only the selected values that still exist in the new choices
+                     if current_selected:
+                         valid_selected = [col for col in current_selected if col in columns]
+                         # If nothing survives the filter, fall back to selecting all columns (the default behaviour)
+                         if not valid_selected:
+                             valid_selected = columns
+                     else:
+                         # If there is no current selection, select all columns by default (the default behaviour)
+                         valid_selected = columns
+                     return gr.update(choices=columns, value=valid_selected)
+
+                 shared_inputs = [period_selector, model_selector, metric_selector, colfilter, color_selector, size_range_slider, midpoint_slider]
+
+                 period_selector.change(update_column_choices, inputs=[period_selector, colfilter], outputs=colfilter)
+                 period_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 model_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 metric_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 colfilter.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 color_selector.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 size_range_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+                 midpoint_slider.change(update_table_wrapper, inputs=shared_inputs, outputs=table)
+
+             with gr.Tab("📚 Long Context"):
+                 gr.Markdown("## Coming soon...")
+
+             with gr.Tab("📈 Scaling Law"):
+                 period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=last_period)
+
+                 def update_plot(period):
+                     new_fig = create_scaling_plot(data_manager, period)
+                     return new_fig
+
+                 plot = gr.Plot(initial_fig)
+                 period_selector_2.change(update_plot, inputs=period_selector_2, outputs=plot)
+
+             with gr.Tab("ℹ️ About"):
+                 gr.Markdown(read_about_md())
+
+             with gr.Tab("🚀 Submit"):
+                 with gr.Group():
+                     with gr.Row():
+                         model_name = gr.Textbox(max_lines=1, placeholder="Enter model name...", show_label=False, scale=4)
+                         submit = gr.Button("Submit", variant="primary", scale=0)
+                 output = gr.Markdown("# Enter a public HF repo id, then hit Submit to add it to the evaluation queue.")
+                 submit.click(fn=submit_model, inputs=model_name, outputs=output)
+
+     demo.launch(share=False)
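(The new app.py delegates all data loading to DataManager from data_manager, which appears in this truncated view only as a compiled __pycache__/data_manager.cpython-311.pyc. The sketch below is a hypothetical stand-in reconstructed from the call sites above — DataManager("data"), get_available_periods(), get_available_columns(period), and query(...). The class name DataManagerSketch and the helper _records are invented for illustration; the real implementation may differ.)

```python
import json
import math
import os
import re

import pandas as pd


class DataManagerSketch:
    """Hypothetical stand-in for the DataManager interface that app.py assumes."""

    def __init__(self, folder: str):
        self.folder = folder

    def get_available_periods(self) -> list:
        # Periods are the sub-folders named like "2025-12", in ascending order.
        pattern = re.compile(r"^\d{4}-\d{2}$")
        if not os.path.isdir(self.folder):
            return []
        return sorted(d for d in os.listdir(self.folder) if pattern.match(d))

    def _records(self, period: str) -> list:
        # Every *.json file under data/<period>/ is one (model, data source) result.
        period_dir = os.path.join(self.folder, period)
        records = []
        for name in sorted(os.listdir(period_dir)):
            if name.endswith(".json"):
                with open(os.path.join(period_dir, name), "r", encoding="utf-8") as f:
                    records.append(json.load(f))
        return records

    def get_available_columns(self, period: str) -> list:
        # "Jellyfish042/UncheatableEval-2025-12-ao3_english" -> "ao3_english"
        return sorted({r["data_path"].split(f"-{period}-")[-1] for r in self._records(period)})

    def query(self, period, metric_code, param_range, model_groups=None, visible_columns=None) -> pd.DataFrame:
        # model_groups (the size buckets) is accepted but ignored in this sketch.
        rows = {}
        for r in self._records(period):
            name = os.path.basename(r["model_name_or_path"]).replace(".pth", "")
            source = r["data_path"].split(f"-{period}-")[-1]
            bits = r["neg_log_prob_sum"] / math.log(2)  # assume the logged sum is in nats
            if metric_code == "cr":       # compression rate (%)
                value = 100.0 * bits / (8.0 * r["avg bytes"])
            elif metric_code == "bpc":    # bits per character
                value = bits / r["avg character count"]
            else:                         # "bpb": bits per byte
                value = bits / r["avg bytes"]
            row = rows.setdefault(name, {"Name": name, "Params (B)": r["parameters count"]})
            row[source] = round(value, 3)
        if not rows:
            return pd.DataFrame()
        df = pd.DataFrame(list(rows.values()))
        df = df[df["Params (B)"].between(*param_range)].copy()
        sources = visible_columns or [c for c in df.columns if c not in ("Name", "Params (B)")]
        df["Average (lower=better)"] = df[sources].mean(axis=1).round(3)
        return df.sort_values("Average (lower=better)").reset_index(drop=True)
```

(The real class presumably uses model_groups to map the size buckets — "14b", "7b", "1b5", "other", ... — onto specific models; that mapping is omitted here because it is not visible in this diff.)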
data/2024-10/7b.xlsx CHANGED
Binary files a/data/2024-10/7b.xlsx and b/data/2024-10/7b.xlsx differ
 
data/2024-10/xb.xlsx DELETED
Binary file (9.4 kB)
 
data/2025-12/2025-12-21_11-34-39.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 4649.08,
+   "avg tokens": 1909.12,
+   "avg character count": 7857.404,
+   "parameters count": 1.527404544,
+   "avg bytes": 8012.242,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 10.463994754364728,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-35-15.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 4283.474,
+   "avg tokens": 2095.926,
+   "avg character count": 9964.74,
+   "parameters count": 1.527404544,
+   "avg bytes": 9994.128,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.729221971112452,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-36-04.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 4036.0446875,
+   "avg tokens": 2925.354,
+   "avg character count": 9913.284,
+   "parameters count": 1.527404544,
+   "avg bytes": 9918.674,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.338155351540054,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-36-44.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 4376.222,
+   "avg tokens": 2448.906,
+   "avg character count": 9946.974,
+   "parameters count": 1.527404544,
+   "avg bytes": 9952.8,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.9293688424729485,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-37-00.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 1719.608,
+   "avg tokens": 739.35,
+   "avg character count": 3394.84,
+   "parameters count": 1.527404544,
+   "avg bytes": 3396.996,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 9.128911006492899,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-37-31.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 1347.243,
+   "avg tokens": 1773.934,
+   "avg character count": 5773.33,
+   "parameters count": 1.527404544,
+   "avg bytes": 5853.154,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 4.150883427491335,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-37-59.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 1377.357875,
+   "avg tokens": 1654.562,
+   "avg character count": 5774.754,
+   "parameters count": 1.527404544,
+   "avg bytes": 5870.628,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 4.231036645040064,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-38-27.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 2226.4415625,
+   "avg tokens": 1598.294,
+   "avg character count": 5024.17,
+   "parameters count": 1.527404544,
+   "avg bytes": 5522.098,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.270959789757048,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-38-57.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 1621.03725,
+   "avg tokens": 1791.012,
+   "avg character count": 6339.622,
+   "parameters count": 1.527404544,
+   "avg bytes": 6497.474,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 4.49917614458958,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-39-11.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "neg_log_prob_sum": 1502.122,
+   "avg tokens": 718.362,
+   "avg character count": 3043.39,
+   "parameters count": 1.527404544,
+   "avg bytes": 3062.292,
+   "sample_count": 500,
+   "model_name_or_path": "/mnt/Public/rwkv_models/rwkv7-g1b-1.5b-20251202-ctx8192.pth",
+   "tokenizer_name": "rwkv_vocab_v20230424",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 8.845923087226053,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-39-42.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 5066.424,
+   "avg tokens": 1833.724,
+   "avg character count": 7857.404,
+   "parameters count": 1.720574976,
+   "avg bytes": 8012.242,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 11.403338759364772,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-40-01.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4186.624,
+   "avg tokens": 2071.622,
+   "avg character count": 9964.74,
+   "parameters count": 1.720574976,
+   "avg bytes": 9994.128,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.554463084306498,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-40-26.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 3646.42,
+   "avg tokens": 3000.148,
+   "avg character count": 9913.284,
+   "parameters count": 1.720574976,
+   "avg bytes": 9918.674,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 6.6297572273752685,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-40-48.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4222.864,
+   "avg tokens": 2501.464,
+   "avg character count": 9946.974,
+   "parameters count": 1.720574976,
+   "avg bytes": 9952.8,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.651496251241524,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-41-02.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1826.096,
+   "avg tokens": 720.27,
+   "avg character count": 3394.84,
+   "parameters count": 1.720574976,
+   "avg bytes": 3396.996,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 9.69422558705976,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-41-20.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1175.4235,
+   "avg tokens": 1617.712,
+   "avg character count": 5773.33,
+   "parameters count": 1.720574976,
+   "avg bytes": 5853.154,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 3.6215040096210274,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-41-38.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1212.134,
+   "avg tokens": 1498.248,
+   "avg character count": 5774.754,
+   "parameters count": 1.720574976,
+   "avg bytes": 5870.628,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 3.723493701808611,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-41-55.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 2129.001,
+   "avg tokens": 1446.138,
+   "avg character count": 5024.17,
+   "parameters count": 1.720574976,
+   "avg bytes": 5522.098,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 6.952745099660591,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-42-12.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1409.987,
+   "avg tokens": 1585.12,
+   "avg character count": 6339.622,
+   "parameters count": 1.720574976,
+   "avg bytes": 6497.474,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 3.9134078347560655,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-42-26.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1590.208,
+   "avg tokens": 750.442,
+   "avg character count": 3043.39,
+   "parameters count": 1.720574976,
+   "avg bytes": 3062.292,
+   "sample_count": 500,
+   "model_name_or_path": "Qwen/Qwen3-1.7B-Base",
+   "tokenizer_name": "Qwen/Qwen3-1.7B-Base",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 9.36465723868738,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-42-49.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4847.352,
+   "avg tokens": 1949.908,
+   "avg character count": 7857.404,
+   "parameters count": 1.711376384,
+   "avg bytes": 8012.242,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 10.910258782503071,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-43-05.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4517.79,
+   "avg tokens": 2182.888,
+   "avg character count": 9964.74,
+   "parameters count": 1.711376384,
+   "avg bytes": 9994.128,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 8.152028407052809,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-43-28.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4149.150625,
+   "avg tokens": 3143.934,
+   "avg character count": 9913.284,
+   "parameters count": 1.711376384,
+   "avg bytes": 9918.674,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 7.543799491984567,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-43-47.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 4609.628,
+   "avg tokens": 2602.328,
+   "avg character count": 9946.974,
+   "parameters count": 1.711376384,
+   "avg bytes": 9952.8,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 8.352282091400047,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-43-58.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1738.216,
+   "avg tokens": 755.956,
+   "avg character count": 3394.84,
+   "parameters count": 1.711376384,
+   "avg bytes": 3396.996,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 9.227695599265683,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-44-14.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1344.355,
+   "avg tokens": 1998.214,
+   "avg character count": 5773.33,
+   "parameters count": 1.711376384,
+   "avg bytes": 5853.154,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 4.141985440017216,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-44-30.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "neg_log_prob_sum": 1449.103,
+   "avg tokens": 1865.214,
+   "avg character count": 5774.754,
+   "parameters count": 1.711376384,
+   "avg bytes": 5870.628,
+   "sample_count": 500,
+   "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+   "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+   "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
+   "chunk_size": 4000,
+   "ensure_bos_token": true,
+   "model_args": {
+     "device_map": "auto",
+     "trust_remote_code": true,
+     "attn_implementation": "flash_attention_2",
+     "torch_dtype": "torch.bfloat16"
+   },
+   "tokenizer_args": {
+     "trust_remote_code": true
+   },
+   "requirements": [],
+   "batch_size": 1,
+   "compression_rate": 4.4514268998080775,
+   "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-44-45.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 2527.254,
+ "avg tokens": 1888.098,
+ "avg character count": 5024.17,
+ "parameters count": 1.711376384,
+ "avg bytes": 5522.098,
+ "sample_count": 500,
+ "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+ "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 8.25333236766804,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-45-01.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1684.25,
+ "avg tokens": 1931.562,
+ "avg character count": 6339.622,
+ "parameters count": 1.711376384,
+ "avg bytes": 6497.474,
+ "sample_count": 500,
+ "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+ "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 4.674622635306498,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-45-11.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1584.196,
+ "avg tokens": 779.642,
+ "avg character count": 3043.39,
+ "parameters count": 1.711376384,
+ "avg bytes": 3062.292,
+ "sample_count": 500,
+ "model_name_or_path": "HuggingFaceTB/SmolLM2-1.7B",
+ "tokenizer_name": "HuggingFaceTB/SmolLM2-1.7B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 9.32925286434202,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-45-38.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 5079.304,
+ "avg tokens": 1833.724,
+ "avg character count": 7857.404,
+ "parameters count": 1.543714304,
+ "avg bytes": 8012.242,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 11.432328635305005,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-45-55.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 4373.472,
+ "avg tokens": 2071.622,
+ "avg character count": 9964.74,
+ "parameters count": 1.543714304,
+ "avg bytes": 9994.128,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 7.891616914785782,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-46-17.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 3793.949,
+ "avg tokens": 3000.148,
+ "avg character count": 9913.284,
+ "parameters count": 1.543714304,
+ "avg bytes": 9918.674,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 6.897987835477859,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-46-35.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 4389.584,
+ "avg tokens": 2501.464,
+ "avg character count": 9946.974,
+ "parameters count": 1.543714304,
+ "avg bytes": 9952.8,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 7.9535797317909775,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-46-50.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1785.08,
+ "avg tokens": 720.27,
+ "avg character count": 3394.84,
+ "parameters count": 1.543714304,
+ "avg bytes": 3396.996,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-bbc_news",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 9.476483279602297,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-47-06.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1258.625,
+ "avg tokens": 1617.712,
+ "avg character count": 5773.33,
+ "parameters count": 1.543714304,
+ "avg bytes": 5853.154,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_cpp",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 3.877849544533749,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-47-21.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1324.075,
+ "avg tokens": 1498.248,
+ "avg character count": 5774.754,
+ "parameters count": 1.543714304,
+ "avg bytes": 5870.628,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_javascript",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 4.067359651014027,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-47-36.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 2284.521,
+ "avg tokens": 1446.14,
+ "avg character count": 5024.17,
+ "parameters count": 1.543714304,
+ "avg bytes": 5522.098,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_markdown",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 7.4606316238563135,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-47-52.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1501.532,
+ "avg tokens": 1585.12,
+ "avg character count": 6339.622,
+ "parameters count": 1.543714304,
+ "avg bytes": 6497.474,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-github_python",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 4.1674902626314605,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-48-04.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 1596.3,
+ "avg tokens": 750.442,
+ "avg character count": 3043.39,
+ "parameters count": 1.543714304,
+ "avg bytes": 3062.292,
+ "sample_count": 500,
+ "model_name_or_path": "Qwen/Qwen2.5-1.5B",
+ "tokenizer_name": "Qwen/Qwen2.5-1.5B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-wikipedia_english",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 9.400532729125164,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-48-25.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 5037.112,
+ "avg tokens": 1832.424,
+ "avg character count": 7857.404,
+ "parameters count": 1.2358144,
+ "avg bytes": 8012.242,
+ "sample_count": 500,
+ "model_name_or_path": "meta-llama/Llama-3.2-1B",
+ "tokenizer_name": "meta-llama/Llama-3.2-1B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-ao3_english",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 11.337364283933086,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-48-37.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 4519.312,
+ "avg tokens": 2045.48,
+ "avg character count": 9964.74,
+ "parameters count": 1.2358144,
+ "avg bytes": 9994.128,
+ "sample_count": 500,
+ "model_name_or_path": "meta-llama/Llama-3.2-1B",
+ "tokenizer_name": "meta-llama/Llama-3.2-1B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_cs",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 8.154774747018926,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-48-52.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 4072.908,
+ "avg tokens": 2984.08,
+ "avg character count": 9913.284,
+ "parameters count": 1.2358144,
+ "avg bytes": 9918.674,
+ "sample_count": 500,
+ "model_name_or_path": "meta-llama/Llama-3.2-1B",
+ "tokenizer_name": "meta-llama/Llama-3.2-1B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_math",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 7.405178572252937,
+ "track_byte_wise_data": false
+ }
data/2025-12/2025-12-21_11-49-05.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "neg_log_prob_sum": 4462.048,
+ "avg tokens": 2454.32,
+ "avg character count": 9946.974,
+ "parameters count": 1.2358144,
+ "avg bytes": 9952.8,
+ "sample_count": 500,
+ "model_name_or_path": "meta-llama/Llama-3.2-1B",
+ "tokenizer_name": "meta-llama/Llama-3.2-1B",
+ "data_path": "Jellyfish042/UncheatableEval-2025-12-arxiv_physics",
+ "chunk_size": 4000,
+ "ensure_bos_token": true,
+ "model_args": {
+ "device_map": "auto",
+ "trust_remote_code": true,
+ "attn_implementation": "flash_attention_2",
+ "torch_dtype": "torch.bfloat16"
+ },
+ "tokenizer_args": {
+ "trust_remote_code": true
+ },
+ "requirements": [],
+ "batch_size": 1,
+ "compression_rate": 8.084878780102732,
+ "track_byte_wise_data": false
+ }