Add scheduler
Browse files- app.py +12 -1
- requirements.txt +3 -1
- update_scheduler.py +114 -0
app.py
CHANGED
|
@@ -1,12 +1,23 @@
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
|
| 5 |
from papers import PaperList, get_df
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
paper_list = PaperList(get_df('papers.csv'))
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
with gr.Blocks(css='style.css') as demo:
|
| 12 |
gr.Markdown(DESCRIPTION)
|
|
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
import gradio as gr
|
| 6 |
|
| 7 |
from papers import PaperList, get_df
|
| 8 |
+
from update_scheduler import UpdateScheduler
|
| 9 |
+
|
| 10 |
+
# Markdown rendered at the top of the page.
DESCRIPTION = '''# list of [Daily Papers](https://huggingface.co/papers)'''

paper_list = PaperList(get_df('papers.csv'))

# The periodic updater is only started when SPACE_ID is present in the
# environment (presumably set by the Spaces runtime -- confirm); without it
# no scheduler is created.
SPACE_ID = os.getenv('SPACE_ID')
if SPACE_ID is not None:
    # Cron fields are overridable through the environment; the defaults run
    # the job at minute 0 of every fourth hour.
    CRON_HOUR = os.getenv('CRON_HOUR', '*/4')
    CRON_MINUTE = os.getenv('CRON_MINUTE', '0')
    scheduler = UpdateScheduler(
        space_id=SPACE_ID,
        cron_hour=CRON_HOUR,
        cron_minute=CRON_MINUTE,
    )
    scheduler.start()
|
| 21 |
|
| 22 |
with gr.Blocks(css='style.css') as demo:
|
| 23 |
gr.Markdown(DESCRIPTION)
|
requirements.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
|
|
| 1 |
gradio==3.39.0
|
|
|
|
| 2 |
pandas==2.0.3
|
| 3 |
requests==2.31.0
|
| 4 |
-
tqdm==4.
|
|
|
|
| 1 |
+
apscheduler==3.10.3
|
| 2 |
gradio==3.39.0
|
| 3 |
+
huggingface_hub==0.16.4
|
| 4 |
pandas==2.0.3
|
| 5 |
requests==2.31.0
|
| 6 |
+
tqdm==4.66.1
|
update_scheduler.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import pathlib
|
| 3 |
+
import re
|
| 4 |
+
import tempfile
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import requests
|
| 8 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 9 |
+
from huggingface_hub import HfApi, Repository
|
| 10 |
+
from huggingface_hub.utils import RepositoryNotFoundError
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class SpaceRestarter:
    """Restart a Hugging Face Space through the Hub API.

    Args:
        space_id: Identifier of the Space to restart.

    Raises:
        ValueError: If the token reported by ``HfApi.get_token_permission``
            is not a ``'write'`` token, or if ``HfApi.space_info`` cannot
            find ``space_id``.
    """

    def __init__(self, space_id: str):
        self.api = HfApi()
        if self.api.get_token_permission() != 'write':
            raise ValueError('The HF token must have write permission.')
        try:
            # Called only to validate that the Space exists; the returned
            # info object is not used.
            self.api.space_info(repo_id=space_id)
        except RepositoryNotFoundError as err:
            # Chain the original error so the traceback still shows what the
            # Hub API reported.
            raise ValueError('The Space ID does not exist.') from err
        self.space_id = space_id

    def restart(self) -> None:
        """Ask the Hub to restart the Space given at construction time."""
        self.api.restart_space(self.space_id)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Matches ``https://github.com/<owner>/<repo>`` with an optional
# ``/tree|blob/<ref>/<path-segment>`` suffix.  Whitespace is excluded from
# every segment so a URL followed by a newline does not swallow the next word,
# and the dots in the host name are escaped so they only match a literal ".".
_GITHUB_LINK_PATTERN = re.compile(
    r'https://github\.com/[^/\s]+/[^/)},\s]+'
    r'(?:/(?:tree|blob)/[^/\s]+/[^/)},\s]+)?')


def find_github_links(summary: str) -> str:
    """Return the single GitHub link mentioned in *summary*.

    Trailing periods (sentence punctuation) are stripped from each match, and
    repeated mentions of the same URL count as one link.

    Args:
        summary: Free text to scan, e.g. a paper abstract.

    Returns:
        The GitHub URL, or ``''`` when the text contains none.

    Raises:
        RuntimeError: If more than one distinct GitHub URL is found.
    """
    matches = _GITHUB_LINK_PATTERN.findall(summary)
    # Normalise first, then de-duplicate while keeping first-seen order, so
    # "…/repo." and "…/repo" are recognised as the same link.
    links = list(dict.fromkeys(match.rstrip('.') for match in matches))
    if not links:
        return ''
    if len(links) != 1:
        raise RuntimeError(f'Found multiple GitHub links: {links}')
    return links[0]
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class RepoUpdater:
    """Maintain ``papers.csv`` inside a local clone of a Hub repository.

    The repository is cloned into the system temporary directory on
    construction. ``update`` appends papers from the daily-papers API that are
    not yet listed, and ``push`` publishes the clone back to the Hub.

    Args:
        repo_id: Hub repository ID; its last ``/``-separated component is
            used as the name of the local clone directory.
        repo_type: Repository type forwarded to ``Repository``.
    """

    # Seconds to wait for the daily-papers API, so a stalled connection cannot
    # block the calling thread indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, repo_id: str, repo_type: str):
        api = HfApi()
        name = api.whoami()['name']

        # ``tempfile.tempdir`` stays ``None`` until something has called
        # ``tempfile.gettempdir()``, so ask for the directory explicitly
        # instead of reading the module global.
        self.repo_dir = pathlib.Path(
            tempfile.gettempdir()) / repo_id.split('/')[-1]
        self.repo = Repository(
            local_dir=self.repo_dir,
            clone_from=repo_id,
            repo_type=repo_type,
            git_user=name,
            git_email=f'{name}@users.noreply.huggingface.co')
        self.repo.git_pull()

    def _fetch_daily_papers(self, date: str) -> list:
        """Return the daily-papers API entries for *date* (``YYYY-MM-DD``).

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        response = requests.get(
            f'https://huggingface.co/api/daily_papers?date={date}',
            timeout=self.REQUEST_TIMEOUT)
        # Fail loudly rather than trying to merge an error payload.
        response.raise_for_status()
        return response.json()

    def update(self) -> None:
        """Append yesterday's and today's unseen papers to ``papers.csv``.

        Papers whose arXiv ID is already in the CSV are skipped. Papers for
        which ``find_github_links`` raises ``RuntimeError`` are reported with
        ``print`` and skipped. The CSV is rewritten in the local clone only;
        nothing is committed or pushed here.
        """
        # Take one timestamp so both dates are derived from the same instant,
        # even if the job runs across midnight.
        # NOTE(review): this is naive local time while the cron trigger is
        # configured for UTC -- confirm the host clock is UTC.
        now = datetime.datetime.now()
        dates = [
            (now - datetime.timedelta(days=1)).strftime('%Y-%m-%d'),
            now.strftime('%Y-%m-%d'),
        ]
        daily_papers = []
        for date in dates:
            daily_papers += self._fetch_daily_papers(date)

        self.repo.git_pull()
        csv_path = self.repo_dir / 'papers.csv'
        df = pd.read_csv(csv_path, dtype=str).fillna('')
        arxiv_ids = set(df['arxiv_id'])

        new_rows = []
        for paper in daily_papers:
            arxiv_id = paper['paper']['id']
            if arxiv_id in arxiv_ids:
                continue
            try:
                github = find_github_links(paper['paper']['summary'])
            except RuntimeError as e:
                print(e)
                continue
            # Record the ID right away so a paper that appears under both
            # dates is only appended once.
            arxiv_ids.add(arxiv_id)
            new_rows.append({
                'arxiv_id': arxiv_id,
                'github': github,
            })

        if new_rows:
            # Concatenating keeps the existing columns (and the header) even
            # when the CSV had no data rows yet.
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
        df.to_csv(csv_path, index=False)

    def push(self) -> None:
        """Publish the local clone via ``Repository.push_to_hub``."""
        self.repo.push_to_hub()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class UpdateScheduler:
    """Run the paper-list refresh for a Space on a cron schedule.

    Args:
        space_id: ID of the Space; used both as the Space to restart and as
            the repository to update.
        cron_hour: Hour field of the cron trigger.
        cron_minute: Minute field of the cron trigger.
    """

    def __init__(self, space_id: str, cron_hour: str, cron_minute: str):
        self.space_restarter = SpaceRestarter(space_id=space_id)
        self.repo_updater = RepoUpdater(repo_id=space_id, repo_type='space')

        # The job is registered here but does not fire until ``start`` is
        # called; the trigger is evaluated in UTC at second 0.
        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(
            self._update,
            'cron',
            hour=cron_hour,
            minute=cron_minute,
            second=0,
            timezone='UTC',
        )

    def _update(self) -> None:
        """Refresh the repository, then push changes or restart the Space."""
        updater = self.repo_updater
        updater.update()
        if not updater.repo.is_repo_clean():
            # The working tree changed: publish it.
            updater.push()
            return
        # Nothing changed on disk: restart the Space instead.
        self.space_restarter.restart()

    def start(self) -> None:
        """Start the background scheduler."""
        self.scheduler.start()
|