Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,8 +3,6 @@ import chromadb
|
|
| 3 |
import torch
|
| 4 |
from langchain.vectorstores import Chroma
|
| 5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 6 |
-
from langchain.retrievers import ContextualCompressionRetriever
|
| 7 |
-
from langchain.retrievers.document_compressors import CrossEncoderReranker
|
| 8 |
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
|
| 9 |
import gradio as gr
|
| 10 |
|
|
@@ -31,11 +29,6 @@ reranker = HuggingFaceCrossEncoder(
|
|
| 31 |
model_name="BAAI/bge-reranker-base",
|
| 32 |
model_kwargs={"device": device}
|
| 33 |
)
|
| 34 |
-
compressor = CrossEncoderReranker(model=reranker, top_n=5)
|
| 35 |
-
retriever = vectorstore.as_retriever(search_kwargs={"k": 10}) # Retrieve 2k initially
|
| 36 |
-
compression_retriever = ContextualCompressionRetriever(
|
| 37 |
-
base_compressor=compressor, base_retriever=retriever
|
| 38 |
-
)
|
| 39 |
|
| 40 |
def add_text_to_db(text):
|
| 41 |
"""
|
|
@@ -84,20 +77,30 @@ def search_similar_texts(query, k, threshold):
|
|
| 84 |
if not isinstance(threshold, (int, float)) or threshold < 0 or threshold > 1:
|
| 85 |
return "Error: Threshold must be a number between 0 and 1."
|
| 86 |
|
| 87 |
-
# Retrieve
|
| 88 |
-
retriever.search_kwargs
|
| 89 |
-
|
| 90 |
-
docs = compression_retriever.get_relevant_documents(query)
|
| 91 |
|
| 92 |
if not docs:
|
| 93 |
return "No such record."
|
| 94 |
|
| 95 |
-
#
|
| 96 |
-
|
| 97 |
-
for
|
| 98 |
text = doc.metadata.get("text", "No text available")
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
if score >= threshold:
|
|
|
|
| 101 |
results.append(f"Result {i+1}:\nText: {text}\nScore: {score:.4f}\n")
|
| 102 |
|
| 103 |
if not results:
|
|
|
|
| 3 |
import torch
|
| 4 |
from langchain.vectorstores import Chroma
|
| 5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
|
|
|
|
| 6 |
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
|
| 7 |
import gradio as gr
|
| 8 |
|
|
|
|
| 29 |
model_name="BAAI/bge-reranker-base",
|
| 30 |
model_kwargs={"device": device}
|
| 31 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def add_text_to_db(text):
|
| 34 |
"""
|
|
|
|
| 77 |
if not isinstance(threshold, (int, float)) or threshold < 0 or threshold > 1:
|
| 78 |
return "Error: Threshold must be a number between 0 and 1."
|
| 79 |
|
| 80 |
+
# Retrieve initial documents
|
| 81 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": max(k * 2, 10)})
|
| 82 |
+
docs = retriever.get_relevant_documents(query)
|
|
|
|
| 83 |
|
| 84 |
if not docs:
|
| 85 |
return "No such record."
|
| 86 |
|
| 87 |
+
# Compute reranker scores
|
| 88 |
+
scored_docs = []
|
| 89 |
+
for doc in docs:
|
| 90 |
text = doc.metadata.get("text", "No text available")
|
| 91 |
+
# Prepare input for reranker: list of [query, document] pairs
|
| 92 |
+
score = reranker.predict([[query, text]])[0]
|
| 93 |
+
doc.metadata["score"] = float(score)
|
| 94 |
+
scored_docs.append((doc, score))
|
| 95 |
+
|
| 96 |
+
# Sort by score in descending order
|
| 97 |
+
scored_docs.sort(key=lambda x: x[1], reverse=True)
|
| 98 |
+
|
| 99 |
+
# Filter by threshold and limit to k
|
| 100 |
+
results = []
|
| 101 |
+
for i, (doc, score) in enumerate(scored_docs[:k]):
|
| 102 |
if score >= threshold:
|
| 103 |
+
text = doc.metadata.get("text", "No text available")
|
| 104 |
results.append(f"Result {i+1}:\nText: {text}\nScore: {score:.4f}\n")
|
| 105 |
|
| 106 |
if not results:
|