import ast
import operator
import re
from math import isnan

import datasets
import pandas as pd
from huggingface_hub import HfApi

api = HfApi()


class PaperList:
    """Table of papers loaded from the ``CVPR2024/CVPR2024-papers`` dataset.

    ``df_raw`` holds the dataset as loaded (plus a derived ``paper_page``
    column); ``df_prettified`` holds the display table whose columns are
    described by ``COLUMN_INFO``.
    """

    # [column name, datatype label] pairs for the prettified table.
    # NOTE(review): the labels look like UI dataframe datatypes - confirm
    # against the consumer of get_column_datatypes().
    COLUMN_INFO = [
        ["ID", "str"],
        ["Title", "str"],
        ["Authors", "str"],
        ["Paper page", "markdown"],
        ["GitHub", "markdown"],
        ["Spaces", "markdown"],
        ["Models", "markdown"],
        ["Datasets", "markdown"],
    ]

    def __init__(self):
        self.df_raw = self.get_df()
        self.df_prettified = self.prettify(self.df_raw)

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True when *value* is None, pd.NA or a NaN number.

        Unlike a bare ``isnan`` call this never raises for strings or other
        non-numeric values; those are simply reported as present.
        """
        if value is None or value is pd.NA:
            return True
        try:
            return isnan(value)
        except (TypeError, ValueError, OverflowError):
            return False

    @staticmethod
    def _as_items(value) -> list:
        """Normalise a link-column cell into a plain list of items.

        Accepts a real sequence (list/tuple/array), the string sentinel
        ``"[]"``, a string holding a list literal, a bare string, or a
        missing value (None/NaN).
        """
        if isinstance(value, str):
            text = value.strip()
            if text in ("", "[]"):
                return []
            if text.startswith("["):
                # Presumably a stringified list; parse it instead of
                # iterating the string character by character.
                try:
                    parsed = ast.literal_eval(text)
                except (ValueError, SyntaxError):
                    return [value]
                if isinstance(parsed, (list, tuple)):
                    return list(parsed)
            return [value]
        if value is None:
            return []
        try:
            return list(value)
        except TypeError:
            # Non-iterable scalar: NaN means "no entries".
            return [] if PaperList._is_missing(value) else [value]

    @staticmethod
    def _hub_links(value, url_prefix: str) -> str:
        """Render repo ids in *value* as newline-separated links.

        Each link points at ``url_prefix + repo_id``. Returns a single space
        when there are no entries so the cell is never empty.
        """
        repo_ids = PaperList._as_items(value)
        if not repo_ids:
            return " "
        return "\n".join(
            PaperList.create_link(repo_id, f"{url_prefix}{repo_id}")
            for repo_id in repo_ids
        )

    @staticmethod
    def get_df() -> pd.DataFrame:
        """Load the dataset's ``train`` split and add a ``paper_page`` column.

        ``paper_page`` is the Hugging Face papers URL built from ``arxiv_id``,
        or an empty string when the id is missing.
        """
        df = datasets.load_dataset("CVPR2024/CVPR2024-papers", split="train").to_pandas()
        df["paper_page"] = df["arxiv_id"].apply(
            lambda arxiv_id: ""
            if PaperList._is_missing(arxiv_id)
            else f"https://huggingface.co/papers/{arxiv_id}"
        )
        return df

    @staticmethod
    def create_link(text: str, url: str) -> str:
        """Return a Markdown link showing *text* and pointing at *url*."""
        return f"[{text}]({url})"

    @staticmethod
    def prettify(df: pd.DataFrame) -> pd.DataFrame:
        """Build the display table from a raw dataframe.

        Reads the ``id``, ``title``, ``authors``, ``arxiv_id``,
        ``paper_page``, ``GitHub``, ``Space``, ``Model`` and ``Dataset``
        columns of *df* and returns a new dataframe with the columns listed
        in ``COLUMN_INFO``. Cells with nothing to link contain a single
        space.
        """
        rows = []
        for _, row in df.iterrows():
            arxiv_id = row["arxiv_id"]
            if PaperList._is_missing(arxiv_id):
                paper_page = " "
            else:
                paper_page = PaperList.create_link(arxiv_id, row["paper_page"])

            # GitHub links all share the same label; the item itself is the URL.
            github_urls = PaperList._as_items(row["GitHub"])
            if github_urls:
                github = "\n".join(
                    PaperList.create_link("GitHub", url) for url in github_urls
                )
            else:
                github = " "

            rows.append(
                {
                    "ID": row["id"],
                    "Title": row["title"],
                    "Authors": row["authors"],
                    "Paper page": paper_page,
                    "GitHub": github,
                    "Spaces": PaperList._hub_links(
                        row["Space"], "https://huggingface.co/spaces/"
                    ),
                    "Models": PaperList._hub_links(
                        row["Model"], "https://huggingface.co/"
                    ),
                    "Datasets": PaperList._hub_links(
                        row["Dataset"], "https://huggingface.co/datasets/"
                    ),
                }
            )
        return pd.DataFrame(rows, columns=PaperList.get_column_names())

    @staticmethod
    def get_column_names():
        """Return the prettified column names in ``COLUMN_INFO`` order."""
        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
        """Return the datatype label for each name in *column_names*.

        Raises:
            KeyError: if a name is not listed in ``COLUMN_INFO``.
        """
        mapping = dict(self.COLUMN_INFO)
        return [mapping[name] for name in column_names]

    @staticmethod
    def _contains(series: pd.Series, query: str) -> pd.Series:
        """Case-insensitive containment mask that is safe for user input.

        *query* is tried as a regular expression first; if it is not a valid
        pattern (e.g. ``"C++"``) it is matched literally instead. Missing
        values never match, so the mask is always boolean.
        """
        try:
            return series.str.contains(query, case=False, na=False)
        except re.error:
            return series.str.contains(query, case=False, na=False, regex=False)

    def search(
        self,
        title_search_query: str,
        author_search_query: str,
    ) -> pd.DataFrame:
        """Filter papers by title and author and return the prettified result.

        Both queries are case-insensitive and combined with AND. An empty
        query leaves that field unfiltered.
        """
        df = self.df_raw
        if title_search_query:
            df = df[self._contains(df["title"], title_search_query)]
        if author_search_query:
            df = df[self._contains(df["authors"], author_search_query)]
        return self.prettify(df)