update-CVPR2024-papers

Paused

update-CVPR2024-papers / papers.py

Merve Noyan

fixes

521af34 over 1 year ago

3.03 kB

	import ast
	import html
	import operator
	import re
	from math import isnan

	import datasets
	import pandas as pd
	from huggingface_hub import HfApi

	# Hugging Face Hub client, instantiated when this module is imported.
	# Nothing in the visible part of this file references it.
	api = HfApi()


	class PaperList:
	    """Table of CVPR 2024 papers with links to related Hugging Face resources.

	    ``df_raw`` holds the dataset as loaded, plus a ``paper_page`` URL column.
	    ``df_prettified`` is the display version: one row per paper, with the
	    link columns rendered as HTML anchors.
	    """

	    # (display name, datatype) pairs. The order here is the column order of
	    # every prettified frame. The datatype strings are presumably consumed by
	    # the UI table that renders the frame -- that code is not visible here.
	    COLUMN_INFO = [
	        ["ID", "str"],
	        ["Title", "str"],
	        ["Authors", "str"],
	        ["Paper page", "markdown"],
	        ["GitHub", "markdown"],
	        ["Spaces", "markdown"],
	        ["Models", "markdown"],
	        ["Datasets", "markdown"],
	    ]

	    def __init__(self):
	        """Load the dataset and build its prettified view."""
	        self.df_raw = self.get_df()
	        self.df_prettified = self.prettify(self.df_raw)

	    @staticmethod
	    def get_df() -> pd.DataFrame:
	        """Load the papers dataset and add a ``paper_page`` URL column.

	        Rows without an arXiv id get an empty ``paper_page`` string.
	        """
	        df = datasets.load_dataset("CVPR2024/CVPR2024-papers", split="train").to_pandas()
	        df["paper_page"] = df["arxiv_id"].apply(
	            lambda arxiv_id: ""
	            if PaperList._is_missing(arxiv_id)
	            else f"https://huggingface.co/papers/{arxiv_id}"
	        )
	        return df

	    @staticmethod
	    def create_link(text: str, url: str) -> str:
	        """Return an HTML anchor for ``url`` that opens in a new tab.

	        Both arguments are converted to ``str`` and HTML-escaped, so values
	        coming from the dataset cannot break out of the attribute or inject
	        markup.
	        """
	        safe_url = html.escape(str(url), quote=True)
	        safe_text = html.escape(str(text))
	        return f'<a href="{safe_url}" target="_blank">{safe_text}</a>'

	    @staticmethod
	    def _is_missing(value: object) -> bool:
	        """Return True for ``None``, NaN/NA scalars and blank strings."""
	        if value is None:
	            return True
	        if isinstance(value, str):
	            return not value.strip()
	        # pd.isna returns an array for list-likes, so only ask it about scalars.
	        return pd.api.types.is_scalar(value) and bool(pd.isna(value))

	    @staticmethod
	    def _as_list(value: object) -> list:
	        """Normalize a multi-valued cell into a list of non-missing items.

	        Accepts real sequences (list, tuple, array), string-encoded lists such
	        as ``"[]"`` or ``"['a/b']"``, a bare non-empty string (treated as one
	        item) and missing values (treated as empty).

	        NOTE(review): the dataset schema is not visible here; the original
	        code compared cells to the string ``"[]"``, which suggests at least
	        some cells are string-encoded -- confirm against the dataset.
	        """
	        if isinstance(value, str):
	            text = value.strip()
	            items = [text]
	            if text.startswith("[") and text.endswith("]"):
	                try:
	                    parsed = ast.literal_eval(text)
	                except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	                    parsed = None
	                if isinstance(parsed, (list, tuple)):
	                    items = list(parsed)
	        else:
	            try:
	                items = list(value)
	            except TypeError:
	                # Non-iterable scalar (None, NaN, ...); filtered below if missing.
	                items = [value]
	        return [item for item in items if not PaperList._is_missing(item)]

	    @staticmethod
	    def _link_cell(value: object, url_prefix: str = "", label: str = "") -> str:
	        """Render a multi-valued cell as newline-separated links.

	        Each item links to ``url_prefix + item``; the link text is ``label``
	        when given, otherwise the item itself. An empty cell renders as a
	        single space, the placeholder this table uses for "no value".
	        """
	        links = [
	            PaperList.create_link(label or item, f"{url_prefix}{item}")
	            for item in PaperList._as_list(value)
	        ]
	        return "\n".join(links) if links else " "

	    @staticmethod
	    def _prettify_row(row) -> dict:
	        """Build the display dict for one raw row (a mapping of column -> value)."""
	        arxiv_id = row["arxiv_id"]
	        if PaperList._is_missing(arxiv_id):
	            paper_page = " "
	        else:
	            paper_page = PaperList.create_link(arxiv_id, row["paper_page"])
	        return {
	            "ID": row["id"],
	            "Title": row["title"],
	            "Authors": row["authors"],
	            "Paper page": paper_page,
	            "GitHub": PaperList._link_cell(row["GitHub"], label="GitHub"),
	            "Spaces": PaperList._link_cell(row["Space"], "https://huggingface.co/spaces/"),
	            "Models": PaperList._link_cell(row["Model"], "https://huggingface.co/"),
	            "Datasets": PaperList._link_cell(row["Dataset"], "https://huggingface.co/datasets/"),
	        }

	    @staticmethod
	    def prettify(df: pd.DataFrame) -> pd.DataFrame:
	        """Convert a raw frame into the display frame described by COLUMN_INFO.

	        ``df`` must contain the columns ``id``, ``title``, ``authors``,
	        ``arxiv_id``, ``paper_page``, ``GitHub``, ``Space``, ``Model`` and
	        ``Dataset``. Missing arXiv ids and empty link cells render as ``" "``.
	        """
	        rows = [PaperList._prettify_row(row) for row in df.to_dict("records")]
	        return pd.DataFrame(rows, columns=PaperList.get_column_names())

	    @staticmethod
	    def get_column_names():
	        """Return the display column names in table order."""
	        return list(map(operator.itemgetter(0), PaperList.COLUMN_INFO))

	    def get_column_datatypes(self, column_names: list[str]) -> list[str]:
	        """Return the datatype of each named display column.

	        Raises:
	            KeyError: if a name is not listed in COLUMN_INFO.
	        """
	        mapping = dict(self.COLUMN_INFO)
	        return [mapping[name] for name in column_names]

	    @staticmethod
	    def _contains(column: pd.Series, query: str) -> pd.Series:
	        """Return a boolean mask of cells that contain ``query``, ignoring case.

	        An empty query matches every row. The query is used as a regular
	        expression when it compiles and as a literal substring otherwise, so
	        input such as ``"("`` no longer raises. Missing cells never match a
	        non-empty query.
	        """
	        if not query:
	            return pd.Series(True, index=column.index)
	        try:
	            re.compile(query)
	        except re.error:
	            use_regex = False
	        else:
	            use_regex = True
	        return column.str.contains(query, case=False, na=False, regex=use_regex)

	    def search(
	        self,
	        title_search_query: str,
	        author_search_query: str,
	    ) -> pd.DataFrame:
	        """Return the prettified rows whose title and authors match the queries.

	        Both filters are case-insensitive and combined with AND; an empty
	        query leaves that field unfiltered.
	        """
	        df = self.df_raw
	        mask = self._contains(df["title"], title_search_query) & self._contains(
	            df["authors"], author_search_query
	        )
	        return self.prettify(df[mask])