Update app.py
app.py CHANGED
@@ -16,9 +16,16 @@ import torch
 PUBMED_N = 100 # Number of abstracts to retrieve initially
 TOP_ABSTRACTS = 10 # Number of top semantic abstracts to keep per claim
 NLI_MODEL_NAME = "pritamdeka/PubMedBERT-MNLI-MedNLI"
-SBERT_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
+SBERT_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
 NLI_LABELS = ['CONTRADICTION', 'NEUTRAL', 'ENTAILMENT']
 
+# --------- Summarizer model options ---------
+model_options = {
+    "Llama-3.2-1B-Instruct (Meta, gated)": "meta-llama/Llama-3.2-1B-Instruct",
+    "Gemma-3-1B-it (Google, gated)": "google/gemma-3-1b-it",
+    "TinyLlama-1.1B-Chat (Open)": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+}
+
 # --------- Indicator Phrases for Claim Extraction ---------
 indicator_phrases = [
     "found that", "findings suggest", "shows that", "showed that", "demonstrated", "demonstrates",
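The `model_options` keys double as the human-readable labels for the Gradio dropdown added at the bottom of this diff; each maps to a Hugging Face model id. A quick illustrative check of the mapping (not part of the commit):

    assert model_options["TinyLlama-1.1B-Chat (Open)"] == "TinyLlama/TinyLlama-1.1B-Chat-v1.0"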
@@ -48,16 +55,7 @@ indicator_phrases = [
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_NAME)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_NAME)
 sbert_model = SentenceTransformer(SBERT_MODEL_NAME)
-
-# --- Load fast Llama-3.2-1B-Instruct summarizer pipeline ---
-model_id = "meta-llama/Llama-3.2-1B-Instruct"
-pipe = pipeline(
-    "text-generation",
-    model=model_id,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    max_new_tokens=128,
-)
+pipe_cache = {} # cache summarization pipelines
 
 def extract_claims_pattern(article_text):
     sentences = sent_tokenize(article_text)
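The eager, import-time Llama pipeline is gone; `pipe_cache` enables lazy loading, so a model is only built when a user first selects it (see `get_summarizer` in the next hunk). A minimal sketch of the pattern, with `load_once` and `loader` as illustrative stand-ins for the real `pipeline(...)` call, not names from the commit:

    pipe_cache = {}

    def load_once(model_id, loader):
        # Build on first use, then reuse; keyed by model id, not dropdown label.
        if model_id not in pipe_cache:
            pipe_cache[model_id] = loader(model_id)
        return pipe_cache[model_id]

    p1 = load_once("TinyLlama/TinyLlama-1.1B-Chat-v1.0", lambda m: {"model": m})
    p2 = load_once("TinyLlama/TinyLlama-1.1B-Chat-v1.0", lambda m: {"model": m})
    assert p1 is p2  # second call is a cache hit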
@@ -130,11 +128,29 @@ def extract_evidence_nli(claim, title, abstract):
         })
     return evidence
 
-def summarize_evidence_llm(claim, evidence_list):
+def get_summarizer(model_choice):
+    model_id = model_options[model_choice]
+    if model_id in pipe_cache:
+        return pipe_cache[model_id]
+    kwargs = {
+        "model": model_id,
+        "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+        "device_map": "auto",
+        "max_new_tokens": 128
+    }
+    # Add token for gated models (Gemma, Llama)
+    if any(gated in model_id for gated in ["meta-llama", "gemma"]):
+        hf_token = os.environ.get("HF_TOKEN", None)
+        if hf_token:
+            kwargs["token"] = hf_token
+        else:
+            raise RuntimeError(f"Model '{model_choice}' requires a Hugging Face access token. Please set 'HF_TOKEN' as a Space secret or environment variable.")
+    pipe_cache[model_id] = pipeline("text-generation", **kwargs)
+    return pipe_cache[model_id]
+
+def summarize_evidence_llm(claim, evidence_list, model_choice):
     support = [ev['sentence'] for ev in evidence_list if ev['label'] == 'ENTAILMENT']
     contradict = [ev['sentence'] for ev in evidence_list if ev['label'] == 'CONTRADICTION']
-
-    # Compose prompt for summarization.
     messages = [
         {"role": "system", "content": "You are a helpful biomedical assistant. Summarize scientific evidence in plain English for the general public."},
         {"role": "user", "content":
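A usage sketch of the new loader, assuming HF_TOKEN is not set in the environment (the first call downloads TinyLlama). Note that `get_summarizer` relies on `os.environ`, so app.py needs an `import os` near its other imports, which is not visible in this diff:

    pipe = get_summarizer("TinyLlama-1.1B-Chat (Open)")           # open model, no token needed
    assert pipe is get_summarizer("TinyLlama-1.1B-Chat (Open)")   # served from pipe_cache

    try:
        get_summarizer("Llama-3.2-1B-Instruct (Meta, gated)")
    except RuntimeError as err:
        print(err)  # points the user at the HF_TOKEN Space secret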
@@ -145,6 +161,7 @@ def summarize_evidence_llm(claim, evidence_list):
         }
     ]
     try:
+        pipe = get_summarizer(model_choice)
         outputs = pipe(
             messages,
             max_new_tokens=96,
@@ -152,7 +169,6 @@ def summarize_evidence_llm(claim, evidence_list):
             temperature=0.1,
         )
         out = outputs[0]["generated_text"]
-        # If the model returns all messages, just take the last message (often the answer).
         if isinstance(out, list) and "content" in out[-1]:
             return out[-1]["content"].strip()
         return out.strip()
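When a text-generation pipeline is called with a list of chat messages, `outputs[0]["generated_text"]` is typically the whole conversation (a list of role/content dicts) rather than a plain string, while string prompts yield strings; the `isinstance` branch above handles both shapes. Illustrative data only:

    out = [
        {"role": "user", "content": "Summarize the evidence."},
        {"role": "assistant", "content": "Most studies support the claim."},
    ]
    if isinstance(out, list) and "content" in out[-1]:
        print(out[-1]["content"].strip())  # -> "Most studies support the claim."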
@@ -171,7 +187,7 @@ def format_evidence_html(evidence_list):
     )
     return html
 
-def factcheck_app(article_url):
+def factcheck_app(article_url, model_choice):
     try:
         art = Article(article_url)
         art.download()
@@ -204,7 +220,7 @@ def factcheck_app(article_url):
             control_ev = extract_evidence_nli(claim, titles[idx_non_top], abstracts[idx_non_top])
             evidence_results.append({"title": f"(Control) {titles[idx_non_top]}", "evidence": control_ev})
         all_evidence_sentences = [ev for abs_res in evidence_results for ev in abs_res["evidence"]]
-        summary = summarize_evidence_llm(claim, all_evidence_sentences)
+        summary = summarize_evidence_llm(claim, all_evidence_sentences, model_choice)
         results_html += f"<hr><b>Claim:</b> {claim}<br><b>Layman summary:</b> {summary}<br>"
         for abs_res in evidence_results:
             results_html += f"<br><b>Abstract:</b> {abs_res['title']}<br>{format_evidence_html(abs_res['evidence'])}"
@@ -216,8 +232,9 @@ description = """
 This app extracts key scientific claims from a news article, finds the most relevant PubMed biomedical research papers, checks which sentences in those papers support or contradict each claim, and gives you a plain-English summary verdict.<br><br>
 <b>How to use it:</b><br>
 1. Paste the link to a biomedical news article.<br>
-2. Wait for the results.<br>
-3. For each claim, you will see:<br>
+2. Choose an AI summarizer model below. If you have no special access, use 'TinyLlama' (works for everyone).<br>
+3. Wait for the results.<br>
+4. For each claim, you will see:<br>
 - A plain summary of what research says.<br>
 - Color-coded evidence sentences (green=support, red=contradict, gray=neutral).<br>
 - The titles of the most relevant PubMed articles.<br><br>
@@ -226,11 +243,18 @@ This app extracts key scientific claims from a news article, finds the most rele
 
 iface = gr.Interface(
     fn=factcheck_app,
-    inputs=gr.Textbox(lines=2, label="Paste a news article URL"),
+    inputs=[
+        gr.Textbox(lines=2, label="Paste a news article URL"),
+        gr.Dropdown(
+            choices=list(model_options.keys()),
+            value="TinyLlama-1.1B-Chat (Open)",
+            label="Choose summarizer model"
+        )
+    ],
     outputs=[gr.HTML(label="Fact-Check Results (Summary & Evidence)"), gr.JSON(label="All Results (JSON)")],
     title="BioMedical News Fact-Checking & Research Evidence Finder",
     description=description,
-    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant"]],
+    examples=[["https://www.medicalnewstoday.com/articles/omicron-what-do-we-know-about-the-stealth-variant", "TinyLlama-1.1B-Chat (Open)"]],
     allow_flagging="never"
 )
 
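Gradio passes the input components to `fn` positionally, so the [Textbox, Dropdown] order must line up with `factcheck_app(article_url, model_choice)`, and the example row gains a matching second element for the dropdown. Presumably the Space keeps the usual launch call after this point (outside the diff):

    if __name__ == "__main__":
        iface.launch()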