Merve Noyan
fixes
521af34
import ast
import html
import operator
import re
from math import isnan

import datasets
import pandas as pd
from huggingface_hub import HfApi
# Module-level Hugging Face Hub API client, instantiated once at import time.
# NOTE(review): not referenced by the code visible in this file -- confirm it is used elsewhere.
api = HfApi()
class PaperList:
    """Load the CVPR 2024 paper dataset and render it as a table of HTML links.

    Attributes:
        df_raw: The dataset as returned by :meth:`get_df` (original columns
            plus a derived ``paper_page`` URL column).
        df_prettified: ``df_raw`` converted by :meth:`prettify` into the
            display columns listed in ``COLUMN_INFO``.
    """

    # [display column name, datatype label] pairs, in display order.
    # NOTE(review): the labels look like Gradio Dataframe datatypes -- confirm
    # against the UI code that consumes them.
    COLUMN_INFO = [
        ["ID", "str"],
        ["Title", "str"],
        ["Authors", "str"],
        ["Paper page", "markdown"],
        ["GitHub", "markdown"],
        ["Spaces", "markdown"],
        ["Models", "markdown"],
        ["Datasets", "markdown"],
    ]

    def __init__(self):
        """Load the dataset and pre-compute the prettified table."""
        self.df_raw = self.get_df()
        self.df_prettified = self.prettify(self.df_raw)

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None/NaN/NA scalars and blank strings."""
        if isinstance(value, str):
            return not value.strip()
        # Containers (lists, arrays) are never "missing"; pd.isna would
        # return an array for them, so only test true scalars.
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    @staticmethod
    def _as_list(cell) -> list:
        """Normalize a resource cell to a list of non-empty strings.

        Accepts real sequences (list, tuple, NumPy array), a stringified list
        such as ``"['a', 'b']"`` or ``"[]"``, a single plain string, or a
        missing value.
        """
        if PaperList._is_missing(cell):
            return []
        if isinstance(cell, str):
            text = cell.strip()
            items = [text]
            if text.startswith("["):
                try:
                    parsed = ast.literal_eval(text)
                except (ValueError, SyntaxError):
                    # Not a valid literal: keep the raw text as a single item.
                    parsed = None
                if isinstance(parsed, (list, tuple)):
                    items = list(parsed)
        else:
            try:
                items = list(cell)
            except TypeError:
                # Non-iterable scalar: treat it as a single entry.
                items = [cell]
        return [str(item) for item in items if not PaperList._is_missing(item)]

    @staticmethod
    def _join_links(cell, url_prefix: str = "", label: str = "") -> str:
        """Render every entry of ``cell`` as a link, one per line.

        Args:
            cell: Raw cell value; normalized with :meth:`_as_list`.
            url_prefix: Text prepended to each entry to form its URL.
            label: Link text to use for every entry; when empty, the entry
                itself is used as the link text.

        Returns:
            Newline-joined anchors, or a single space when there are none.
        """
        links = [
            PaperList.create_link(label or item, f"{url_prefix}{item}")
            for item in PaperList._as_list(cell)
        ]
        return "\n".join(links) if links else " "

    @staticmethod
    def _paper_page_url(arxiv_id) -> str:
        """Return the Hugging Face paper page URL, or "" if the id is missing."""
        if PaperList._is_missing(arxiv_id):
            return ""
        return f"https://huggingface.co/papers/{arxiv_id}"

    @staticmethod
    def _contains(series: pd.Series, query: str) -> pd.Series:
        """Case-insensitive containment mask that never yields NaN.

        ``query`` is treated as a regular expression (the historical
        behaviour); if it is not a valid pattern it is matched literally
        instead of raising.
        """
        try:
            return series.str.contains(query, case=False, na=False)
        except re.error:
            return series.str.contains(query, case=False, regex=False, na=False)

    @staticmethod
    def get_df() -> pd.DataFrame:
        """Load the ``CVPR2024/CVPR2024-papers`` train split as a DataFrame.

        A ``paper_page`` column is added holding the Hugging Face paper URL
        for rows with an ``arxiv_id`` and an empty string otherwise.
        """
        df = datasets.load_dataset("CVPR2024/CVPR2024-papers", split="train").to_pandas()
        # NOTE(review): if ``arxiv_id`` is stored as a float, ids ending in 0
        # may already have lost that digit upstream -- confirm the column dtype.
        df["paper_page"] = df["arxiv_id"].apply(PaperList._paper_page_url)
        return df

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return an HTML anchor that opens ``url`` in a new tab.

        Both arguments are HTML-escaped because they originate from dataset
        content rather than from trusted markup.
        """
        safe_url = html.escape(str(url), quote=True)
        safe_text = html.escape(str(text), quote=False)
        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """Convert raw rows into the display table described by ``COLUMN_INFO``.

        Args:
            df: Frame with the columns ``id``, ``title``, ``authors``,
                ``arxiv_id``, ``paper_page``, ``GitHub``, ``Space``, ``Model``
                and ``Dataset``.

        Returns:
            A new DataFrame with one row per input row; link columns contain
            HTML anchors, or a single space when there is nothing to link.
        """
        rows = []
        for row in df.to_dict("records"):
            arxiv_id = row["arxiv_id"]
            if PaperList._is_missing(arxiv_id):
                paper_link = " "
            else:
                paper_link = PaperList.create_link(arxiv_id, row["paper_page"])
            rows.append(
                {
                    "ID": row["id"],
                    "Title": row["title"],
                    "Authors": row["authors"],
                    "Paper page": paper_link,
                    "GitHub": PaperList._join_links(row["GitHub"], label="GitHub"),
                    "Spaces": PaperList._join_links(row["Space"], "https://huggingface.co/spaces/"),
                    "Models": PaperList._join_links(row["Model"], "https://huggingface.co/"),
                    "Datasets": PaperList._join_links(row["Dataset"], "https://huggingface.co/datasets/"),
                }
            )
        return pd.DataFrame(rows, columns=PaperList.get_column_names())

    @staticmethod
    def get_column_names():
        """Return the display column names in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
        """Return the datatype label of each requested display column.

        Raises:
            KeyError: If a name is not listed in ``COLUMN_INFO``.
        """
        mapping = dict(self.COLUMN_INFO)
        return [mapping[name] for name in column_names]

    def search(
        self,
        title_search_query: str,
        author_search_query: str,
    ) -> pd.DataFrame:
        """Filter papers by title and author and return the prettified rows.

        Matching is case-insensitive. An empty (or ``None``) query applies no
        filter for that field; rows whose field is missing never match a
        non-empty query.
        """
        df = self.df_raw
        filters = (("title", title_search_query), ("authors", author_search_query))
        for column, query in filters:
            if not query:
                continue
            # Boolean indexing returns a new frame, so df_raw is never mutated.
            df = df[self._contains(df[column], query)]
        return self.prettify(df)