topic-prediction

Running

App Files Files Community

avsolatorio committed on Feb 6, 2025

Commit

96070b5

1 Parent(s): 2b1e4b7

Add wbgtopic

Browse files

Signed-off-by: avsolatorio <avsolatorio@users.noreply.huggingface.co>

Files changed (3) hide show

app.py +8 -3
requirements.txt +2 -0
wbgtopic.py +98 -0

app.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()

 import gradio as gr
+import wbgtopic
# Build the topic-classifier ensemble once at start-up so that every request
# served by the interface reuses the same loaded models.
clf = wbgtopic.WBGDocTopic()


def fn(inputs):
    """Return the topic suggestions for the text submitted through the UI.

    ``inputs`` is passed straight to ``clf.suggest_topics`` and that method's
    return value is handed back unchanged for the JSON output component.
    """
    return clf.suggest_topics(inputs)


# Route the interface through ``fn`` so the wrapper above is the single entry
# point for requests instead of being dead code next to a direct method
# reference.
demo = gr.Interface(
    fn=fn,
    inputs="text",
    outputs=gr.JSON(label="Suggested topics"),
)
demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ nltk
2	+ transformers[torch]

wbgtopic.py ADDED Viewed

	@@ -0,0 +1,98 @@

+from transformers import pipeline
+from tqdm.auto import tqdm
+import pandas as pd
+from transformers import AutoTokenizer
+import nltk
# Fetch the NLTK tokenizer data at import time if it is not already present.
for _resource in ("punkt_tab", "punkt"):
    nltk.download(_resource)
+class WBGDocTopic:
+    """
+    A class to handle document topic suggestion using multiple pre-trained text classification models.
+    This class loads a set of text classification models from Hugging Face's model hub and
+    provides a method to suggest topics for input documents based on the aggregated classification
+    results from all the models.
+    Attributes:
+    -----------
+    classifiers : dict
+        A dictionary mapping model names to corresponding classification pipelines. It holds
+        instances of Hugging Face's `pipeline` used for text classification.
+    Methods:
+    --------
+    __init__(classifiers: dict = None)
+        Initializes the `WBGDocTopic` instance. If no classifiers are provided, it loads a default
+        set of classifiers by calling `load_classifiers`.
+    load_classifiers()
+        Loads a predefined set of document topic classifiers into the `classifiers` dictionary.
+        It uses `tqdm` to display progress as the classifiers are loaded.
+    suggest_topics(input_docs: str | list[str]) -> list
+        Suggests topics for the given document or list of documents. It runs each document
+        through all classifiers, averages their scores, and returns a list of dictionaries where each
+        dictionary contains the mean and standard deviation of the topic scores per document.
+        Parameters:
+        -----------
+        input_docs : str or list of str
+            A single document or a list of documents for which to suggest topics.
+        Returns:
+        --------
+        list
+            A list of dictionaries, where each dictionary represents the suggested topics for
+            each document, along with the mean and standard deviation of the topic classification scores.
+    """
+    def __init__(self, classifiers: dict = None, device: str = None):
+        self.classifiers = classifiers or {}
+        self.device = device
+        if classifiers is None:
+            self.load_classifiers()
+    def load_classifiers(self):
+        num_evals = 5
+        num_train = 5
+        tokenizer = AutoTokenizer.from_pretrained("avsolatorio/doc-topic-model_eval-04_train-03")
+        for i in tqdm(range(num_evals)):
+            for j in tqdm(range(num_train)):
+                if i == j:
+                    continue
+                model_name = f"avsolatorio/doc-topic-model_eval-{i:02}_train-{j:02}"
+                classifier = pipeline("text-classification", model=model_name, tokenizer=tokenizer, top_k=None, device=self.device)
+                self.classifiers[model_name] = classifier
+    def suggest_topics(self, input_docs: str | list[str]):
+        if isinstance(input_docs, str):
+            input_docs = [input_docs]
+        doc_outs = {i: [] for i in range(len(input_docs))}
+        topics = []
+        for _, classifier in self.classifiers.items():
+            for doc_idx, doc in enumerate(classifier(input_docs)):
+                doc_outs[doc_idx].append(pd.DataFrame.from_records(doc, index="label"))
+        for doc_idx, outs in doc_outs.items():
+            all_scores = pd.concat(outs, axis=1)
+            mean_probs = all_scores.mean(axis=1).sort_values(ascending=False)
+            std_probs = all_scores.std(axis=1).loc[mean_probs.index]
+            output = pd.DataFrame({"score_mean": mean_probs, "score_std": std_probs})
+            output["doc_idx"] = doc_idx
+            output.reset_index(inplace=True)
+            topics.append(output.to_dict(orient="records"))
+        return topics