Spaces:
Running
Running
| # news_sentiment.py | |
| # pip install gnews nltk rapidfuzz | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import time | |
| from typing import List, Dict, Any | |
| from gnews import GNews | |
| from rapidfuzz import fuzz | |
| from nltk.sentiment import SentimentIntensityAnalyzer | |
| import nltk | |
# Look for the VADER lexicon locally first; only download it when the lookup
# fails, so importing this module repeatedly does not re-fetch the resource.
try:
    nltk.data.find("sentiment/vader_lexicon.zip")
except LookupError:
    nltk.download("vader_lexicon")

# Single module-level analyzer, reused by every scoring call in this file.
_SIA = SentimentIntensityAnalyzer()
def _sentiment_label(compound: float) -> str:
    """Map a VADER compound score to a coarse sentiment label.

    The cut-offs follow the thresholds recommended by the VADER authors:
    ``compound >= 0.05`` is positive, ``compound <= -0.05`` is negative and
    everything in between is neutral. Both bounds are inclusive so that a
    score of exactly +/-0.05 is not reported as neutral.

    Args:
        compound: Compound polarity score to classify.

    Returns:
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"


def _is_similar(title: str, seen_titles: List[str], threshold: int = 60) -> bool:
    """Report whether ``title`` fuzzy-matches any previously seen title.

    Args:
        title: Candidate headline.
        seen_titles: Headlines that have already been accepted.
        threshold: A ``fuzz.ratio`` score strictly greater than this value
            counts as a match.

    Returns:
        ``True`` as soon as one seen title scores above ``threshold``,
        ``False`` otherwise (including when ``seen_titles`` is empty).
    """
    return any(fuzz.ratio(title, earlier) > threshold for earlier in seen_titles)


def _fetch_articles(
    query: str,
    *,
    period: str,
    max_results: int,
    language: str,
    country: str,
    retries: int,
    backoff_seconds: int,
) -> List[Dict[str, Any]]:
    """Query GNews up to ``retries`` times; return the first non-empty result list."""
    for attempt in range(retries):
        try:
            client = GNews(
                language=language,
                country=country,
                period=period,
                max_results=max_results,
            )
            articles = client.get_news(query) or []
        except Exception as e:
            # Best-effort fetch: any client/network failure is reported and
            # treated like an empty response so the next attempt can run.
            print(f"[Attempt {attempt+1}] GNews error: {e}")
            articles = []
        if articles:
            return articles
        # Linear backoff between attempts only; sleeping after the final
        # attempt would just delay the empty return.
        if attempt < retries - 1:
            time.sleep(backoff_seconds * (attempt + 1))
    return []


def _article_url(article: Dict[str, Any]) -> Any:
    """Return the first non-empty URL-like field of ``article``, or ``""``."""
    source = article.get("source")
    # ``source`` may be missing, None or a non-dict value; only dig into it
    # when it is actually a mapping.
    source_url = source.get("url") if isinstance(source, dict) else None
    return article.get("url") or article.get("link") or source_url or ""


def _format_published(raw: Any) -> str:
    """Render a publication timestamp as text.

    ``datetime`` values are formatted as ``YYYY-MM-DD HH:MM`` in UTC: naive
    values are assumed to already be UTC, aware values are converted. Any
    other value is passed through ``str()`` unchanged.
    """
    if not isinstance(raw, datetime):
        return str(raw)
    if raw.utcoffset() is None:
        as_utc = raw.replace(tzinfo=timezone.utc)
    else:
        as_utc = raw.astimezone(timezone.utc)
    return as_utc.strftime("%Y-%m-%d %H:%M")


def get_latest_news_with_sentiment(
    query: str,
    *,
    period: str = "1d",
    max_results: int = 20,
    language: str = "en",
    country: str = "US",
    retries: int = 3,
    backoff_seconds: int = 3,
    similarity_threshold: int = 60,
) -> Dict[str, Any]:
    """Fetch recent headlines for ``query`` and score their sentiment.

    Headlines are fetched through GNews, near-duplicate titles are dropped,
    and each remaining title is scored with the module-level VADER analyzer.

    Args:
        query: Search term passed to ``GNews.get_news``.
        period: Look-back window forwarded to ``GNews``.
        max_results: Maximum number of articles requested from ``GNews``.
        language: Language code forwarded to ``GNews``.
        country: Country code forwarded to ``GNews``.
        retries: Number of fetch attempts; ``0`` skips fetching entirely.
        backoff_seconds: Base delay; the wait before attempt ``k + 1`` is
            ``backoff_seconds * k``.
        similarity_threshold: Titles whose ``fuzz.ratio`` against an already
            accepted title exceeds this value are treated as duplicates.

    Returns:
        A dict with three keys:

        * ``"overall_news_score"``: mean compound score rescaled with
          ``(mean + 1) * 2.5`` and rounded to 2 decimals (0-5 for compound
          scores in [-1, 1]). It is ``0.0`` when no article was scored, so
          check ``"count"`` to tell "no data" apart from "most negative".
        * ``"count"``: number of items kept after de-duplication.
        * ``"items"``: list of dicts with ``title``, ``url``, ``published``,
          ``sentiment`` and ``compound`` (rounded to 3 decimals).
    """
    results = _fetch_articles(
        query,
        period=period,
        max_results=max_results,
        language=language,
        country=country,
        retries=retries,
        backoff_seconds=backoff_seconds,
    )
    if not results:
        return {"overall_news_score": 0.0, "count": 0, "items": []}

    seen_titles: List[str] = []
    items: List[Dict[str, Any]] = []
    total_compound = 0.0

    for art in results:
        title = (art.get("title") or "").strip()
        if not title:
            continue
        if _is_similar(title, seen_titles, threshold=similarity_threshold):
            continue
        seen_titles.append(title)

        published_raw = (
            art.get("published date")
            or art.get("publishedDate")
            or art.get("datetime")
            or ""
        )
        compound = _SIA.polarity_scores(title)["compound"]
        items.append({
            "title": title,
            "url": _article_url(art),
            "published": _format_published(published_raw),
            "sentiment": _sentiment_label(compound),
            "compound": round(compound, 3),
        })
        # Accumulate the unrounded score so rounding error does not compound.
        total_compound += compound

    n = len(items)
    overall = round(((total_compound / n) + 1) * 2.5, 2) if n else 0.0
    return {"overall_news_score": overall, "count": n, "items": items}