Spaces:
Build error
Build error
| from huggingface_hub import InferenceClient | |
| from config import BASE_MODEL, MY_MODEL, HF_TOKEN | |
| import pandas as pd | |
| import json | |
| import re | |
| from difflib import get_close_matches | |
class SchoolChatbot:
    """Boston School Chatbot integrating structured data, vector context, and model completion.

    The bot answers a question in one of three ways (see ``get_response``):

    1. If the question names a known school, facts for that school are read
       from the ``bps_data.csv`` table and handed to the model as context.
    2. Otherwise, if the question mentions a known feature keyword, the bot
       returns the list of schools whose value for that feature looks
       positive (or not positive, for negated questions) without calling
       the model.
    3. Otherwise the question is sent to the model as-is.
    """

    # Column holding the canonical school name used for lookups and output.
    _NAME_COLUMN = "BPS_School_Name"
    # Columns whose values are accepted as alternative spellings of a school.
    _ALIAS_COLUMNS = (
        "BPS_School_Name",
        "BPS_Historical_Name",
        "SMMA_Abbreviated_Name",
    )
    # Minimum difflib similarity for a query token to count as a keyword.
    _KEYWORD_CUTOFF = 0.85
    # Whole-word patterns: plain substring matching would treat "inadequate"
    # as "adequate" and "not accessible" as "accessible".
    _POSITIVE_PATTERN = r"\b(?:yes|accessible|adequate|good|excellent|present)\b"
    _NEGATIVE_PATTERN = r"\b(?:no|not|inaccessible|inadequate|poor|bad|limited)\b"
    # Query words that flip a feature search to "schools lacking the feature".
    _INVERSE_QUERY_TERMS = frozenset({"not", "inaccessible", "bad", "poor", "lacking"})

    def __init__(self):
        """Create the inference client and load the school table and keyword map.

        Reads ``bps_data.csv`` and ``keyword_to_column_map.json`` from the
        current working directory and builds ``school_name_map``, a mapping
        from lower-cased school aliases to the canonical school name.
        """
        model_id = MY_MODEL or BASE_MODEL
        self.client = InferenceClient(model=model_id, token=HF_TOKEN)
        self.df = pd.read_csv("bps_data.csv")
        with open("keyword_to_column_map.json", encoding="utf-8") as f:
            self.keyword_map = json.load(f)

        # Build name variants for school matching.
        self.school_name_map = {}
        for _, row in self.df.iterrows():
            primary = row.get(self._NAME_COLUMN)
            # Without a canonical name there is nothing to map aliases to;
            # skipping avoids storing NaN as a lookup result.
            if not isinstance(primary, str) or not primary.strip():
                continue
            for column in self._ALIAS_COLUMNS:
                alias = row.get(column)
                # isinstance also rejects NaN and numeric cells, which have
                # no .lower().
                if isinstance(alias, str) and alias.strip():
                    self.school_name_map[alias.strip().lower()] = primary

        # Hand-maintained shorthand that does not appear in the table.
        self.school_name_map.update({
            "acc": "Another Course to College*",
            "baldwin": "Baldwin Early Learning Pilot Academy",
            "adams elementary": "Adams, Samuel Elementary",
            "alighieri montessori": "Alighieri, Dante Montessori School",
            "phineas bates": "Bates, Phineas Elementary",
        })

    def format_prompt(self, user_input):
        """Return the chat-template prompt for a question with no extra context."""
        return (
            "<|system|>You are a helpful assistant that specializes in Boston public school enrollment.<|end|>\n"
            f"<|user|>{user_input}<|end|>\n"
            "<|assistant|>"
        )

    def _match_keywords(self, text):
        """Return the set of ``keyword_map`` keys that closely match a word in *text*."""
        tokens = re.findall(r"\b\w+\b", text.lower())
        vocabulary = list(self.keyword_map)
        matched = set()
        for token in tokens:
            matched.update(
                get_close_matches(token, vocabulary, cutoff=self._KEYWORD_CUTOFF)
            )
        return matched

    def match_school_name(self, query):
        """Return the canonical name of the school mentioned in *query*, or None.

        Aliases are matched case-insensitively as whole words, so a short
        alias such as "acc" does not fire inside "accessible". Longer aliases
        are tried first so the most specific name wins.
        """
        lowered = query.lower()
        for alias in sorted(self.school_name_map, key=len, reverse=True):
            # Cheap substring test first; the regex only runs on candidates.
            if not alias or alias not in lowered:
                continue
            # Lookarounds instead of \b because aliases may start or end
            # with punctuation (e.g. a trailing "*").
            if re.search(r"(?<!\w)" + re.escape(alias) + r"(?!\w)", lowered):
                return self.school_name_map[alias]
        return None

    def extract_context_with_keywords(self, prompt, school_name=None):
        """Return fact sentences for the keywords in *prompt*, for one school.

        Args:
            prompt: The user's question; its words are matched against
                ``keyword_map``.
            school_name: Name to look up in the school-name column
                (case-insensitive literal substring). When omitted, the
                first row of the table is used.

        Returns:
            A list of sentences of the form "The school's <keyword> is
            <value>.", ordered by keyword; empty when no row or no non-null
            value matches.
        """
        matched_keywords = self._match_keywords(prompt)

        candidates = self.df
        if school_name:
            # regex=False: school names contain characters such as "*" and
            # "(" that must be matched literally.
            name_hits = self.df[self._NAME_COLUMN].str.contains(
                school_name, case=False, na=False, regex=False
            )
            candidates = self.df[name_hits]
        if candidates.empty:
            return []

        # NOTE(review): with no school_name this reads the first row of the
        # whole table; kept as in the original, confirm that is intended.
        row = candidates.iloc[0]
        context_items = []
        # Sorted so the generated context is stable from run to run.
        for kw in sorted(matched_keywords):
            col = self.keyword_map.get(kw)
            if not col:
                continue
            val = row.get(col)
            if pd.notna(val):
                # str() first: numeric cells have no .lower().
                context_items.append(f"The school's {kw} is {str(val).lower()}.")
        return context_items

    def query_schools_by_feature(self, query):
        """Return a bulleted list of schools matching the feature asked about.

        A school matches a feature when its cell contains a positive term
        and no negative term. If the query contains a negating word
        ("not", "inaccessible", "bad", "poor", "lacking"), schools with a
        non-missing value that is not positive are returned instead.
        Missing values are never reported either way.

        Returns:
            The formatted list as a string, or None when no keyword in the
            query maps to a column or no school matches.
        """
        tokens = re.findall(r"\b\w+\b", query.lower())
        # Whole-token test: a substring test would fire on "another"/"notable".
        inverse = any(token in self._INVERSE_QUERY_TERMS for token in tokens)

        matching_schools = set()
        for keyword in self._match_keywords(query):
            col = self.keyword_map.get(keyword)
            if not col or col not in self.df.columns:
                continue
            values = self.df[col].astype(str).str.lower()
            is_positive = (
                values.str.contains(self._POSITIVE_PATTERN, na=False)
                & ~values.str.contains(self._NEGATIVE_PATTERN, na=False)
            )
            # astype(str) turns missing cells into "nan"; mask them out
            # explicitly so unknown values are not reported as negatives.
            known = self.df[col].notna()
            mask = (known & ~is_positive) if inverse else is_positive
            schools = self.df.loc[mask, self._NAME_COLUMN].dropna().unique().tolist()
            matching_schools.update(schools)

        if not matching_schools:
            return None
        return (
            "The following schools match your criteria:\n"
            + "\n".join(f"- {s}" for s in sorted(matching_schools))
        )

    def get_response(self, user_input):
        """Answer *user_input*, using table data where possible.

        A question that names a known school is answered by the model with
        that school's facts as context. Only when no school is named is the
        school-wide feature filter tried; its result is returned directly
        without calling the model. Everything else goes to the model with
        the plain prompt.

        Returns:
            The answer text with surrounding whitespace stripped.
        """
        matched_school = self.match_school_name(user_input)

        # School-wide filter query. Skipped when a school is named, because
        # otherwise any feature keyword would turn a question about one
        # school into a listing of all schools.
        if matched_school is None:
            school_filter = self.query_schools_by_feature(user_input)
            if school_filter:
                return school_filter

        # Per-school context query.
        structured_facts = self.extract_context_with_keywords(user_input, matched_school)
        if structured_facts:
            natural_context = (
                f"You know the following facts about {matched_school or 'a Boston public school'}:\n"
                + "\n".join(f"- {fact}" for fact in structured_facts)
            )
            prompt = (
                "<|system|>You are a helpful assistant that specializes in Boston public school enrollment. "
                "Use any known facts about the school to answer helpfully.<|end|>\n"
                f"<|user|>{user_input}<|end|>\n"
                f"<|context|>{natural_context}<|end|>\n"
                "<|assistant|>"
            )
        else:
            prompt = self.format_prompt(user_input)

        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            stop_sequences=["<|end|>"],
        )
        return response.strip()