|
|
import ast
import operator
import re
from math import isnan

import datasets
import pandas as pd
from huggingface_hub import HfApi
|
|
|
|
|
# Module-level Hugging Face Hub client.
# NOTE(review): nothing in the visible part of this file references `api` — confirm it is
# still needed before keeping a client that is constructed at import time.
api = HfApi()
|
|
|
|
|
|
|
|
class PaperList:
    """Load the ``CVPR2024/CVPR2024-papers`` dataset and expose it as a display-ready table.

    Attributes:
        df_raw: The dataset as returned by :meth:`get_df` (includes the ``paper_page`` column).
        df_prettified: ``df_raw`` passed through :meth:`prettify`.
    """

    # [display column name, datatype label] pairs. The order here is the column order of every
    # frame built by ``prettify``; the labels are what ``get_column_datatypes`` returns.
    COLUMN_INFO = [
        ["ID", "str"],
        ["Title", "str"],
        ["Authors", "str"],
        ["Paper page", "markdown"],
        ["GitHub", "markdown"],
        ["Spaces", "markdown"],
        ["Models", "markdown"],
        ["Datasets", "markdown"],
    ]

    # Text placed in a link column when a paper has nothing to link to.
    _EMPTY_CELL = " "

    def __init__(self):
        """Load the dataset once and cache both the raw and the prettified frame."""
        self.df_raw = self.get_df()
        self.df_prettified = self.prettify(self.df_raw)

    @staticmethod
    def get_df() -> pd.DataFrame:
        """Load the train split of ``CVPR2024/CVPR2024-papers`` and add a ``paper_page`` column.

        Returns:
            The dataset as a DataFrame. ``paper_page`` holds
            ``https://huggingface.co/papers/<arxiv_id>``, or ``""`` when ``arxiv_id`` is missing.
        """
        df = datasets.load_dataset("CVPR2024/CVPR2024-papers", split="train").to_pandas()
        # NOTE(review): if `arxiv_id` is stored as a float, the f-string below renders it
        # without trailing zeros (an ID ending in 0 would yield a different URL) — TODO confirm
        # the column dtype in the dataset.
        df["paper_page"] = df["arxiv_id"].apply(
            lambda arxiv_id: (
                "" if PaperList._is_missing(arxiv_id) else f"https://huggingface.co/papers/{arxiv_id}"
            )
        )
        return df

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor showing ``text`` that opens ``url`` in a new tab."""
        return f'<a href="{url}" target="_blank">{text}</a>'

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True when ``value`` is a missing scalar (``None``, NaN, ``pd.NA``, ``NaT``).

        Strings and containers are never missing. Unlike ``math.isnan`` this does not raise
        ``TypeError`` for ``None`` or string input.
        """
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    @staticmethod
    def _as_items(cell) -> list:
        """Normalise a list-valued cell to a plain Python list.

        Accepts a real sequence/array, a stringified list such as ``"['a/b']"`` or ``"[]"``,
        a bare string (treated as a single item), or a missing value (treated as empty).
        """
        if isinstance(cell, str):
            text = cell.strip()
            if text in ("", "[]"):
                return []
            if text.startswith("["):
                try:
                    parsed = ast.literal_eval(text)
                except (ValueError, SyntaxError):
                    parsed = None
                if isinstance(parsed, (list, tuple)):
                    return list(parsed)
            # Not a list literal: keep the whole string as one item instead of
            # iterating over its characters.
            return [cell]
        if PaperList._is_missing(cell):
            return []
        return list(cell)

    @staticmethod
    def _join_links(cell, url_prefix: str = "", label: str = "") -> str:
        """Render every item of ``cell`` as a link, one per line.

        Args:
            cell: Raw cell value; normalised with :meth:`_as_items`.
            url_prefix: Text prepended to each item to build the link target.
            label: Fixed link text; when empty, the item itself is used as the text.

        Returns:
            The links joined by newlines, or the empty-cell placeholder when there are none.
        """
        items = PaperList._as_items(cell)
        if not items:
            return PaperList._EMPTY_CELL
        return "\n".join(PaperList.create_link(label or item, f"{url_prefix}{item}") for item in items)

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """Build the display table from a raw frame.

        Args:
            df: Frame with the columns ``id``, ``title``, ``authors``, ``arxiv_id``,
                ``paper_page``, ``GitHub``, ``Space``, ``Model`` and ``Dataset``.

        Returns:
            A new frame with the columns of ``COLUMN_INFO``. ID, title and authors are copied
            through; the remaining columns hold HTML links, or ``" "`` when there is nothing
            to link to.
        """
        rows = []
        for _, row in df.iterrows():
            arxiv_id = row["arxiv_id"]
            if PaperList._is_missing(arxiv_id):
                paper_page = PaperList._EMPTY_CELL
            else:
                paper_page = PaperList.create_link(arxiv_id, row["paper_page"])
            rows.append(
                {
                    "ID": row["id"],
                    "Title": row["title"],
                    "Authors": row["authors"],
                    "Paper page": paper_page,
                    "GitHub": PaperList._join_links(row["GitHub"], label="GitHub"),
                    "Spaces": PaperList._join_links(row["Space"], "https://huggingface.co/spaces/"),
                    "Models": PaperList._join_links(row["Model"], "https://huggingface.co/"),
                    "Datasets": PaperList._join_links(row["Dataset"], "https://huggingface.co/datasets/"),
                }
            )
        # Passing `columns` keeps the header (and its order) even when `rows` is empty.
        return pd.DataFrame(rows, columns=PaperList.get_column_names())

    @staticmethod
    def get_column_names():
        """Return the display column names in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
        """Return the datatype label of each requested display column, in the given order.

        Raises:
            KeyError: If a name is not listed in ``COLUMN_INFO``.
        """
        mapping = dict(self.COLUMN_INFO)
        return [mapping[name] for name in column_names]

    @staticmethod
    def _match_mask(series: pd.Series, query: str) -> pd.Series:
        """Return a boolean mask of the entries of ``series`` that contain ``query``.

        Matching is case-insensitive. ``query`` is used as a regular expression when it
        compiles; otherwise (e.g. an unbalanced ``(``) it is matched literally instead of
        raising. Missing entries never match.
        """
        try:
            re.compile(query)
        except re.error:
            as_regex = False
        else:
            as_regex = True
        return series.str.contains(query, case=False, regex=as_regex, na=False)

    def search(
        self,
        title_search_query: str,
        author_search_query: str,
    ) -> pd.DataFrame:
        """Filter the papers by title and author and return the prettified result.

        Args:
            title_search_query: Text that must occur in ``title``; empty means no title filter.
            author_search_query: Text that must occur in ``authors``; empty means no author
                filter.

        Returns:
            The prettified rows of ``df_raw`` that satisfy both filters.
        """
        # Boolean indexing returns a new frame, so `df_raw` is never modified.
        df = self.df_raw
        for column, query in (("title", title_search_query), ("authors", author_search_query)):
            if not query:
                continue
            df = df[PaperList._match_mask(df[column], query)]
        return self.prettify(df)
|
|
|