Spaces:
Sleeping
Sleeping
File size: 1,612 Bytes
01d5a5d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from typing import List
from lpm_kernel.L1.bio import Chunk
import traceback
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
class DocumentChunker:
    """Turn a document's text into a list of ``Chunk`` objects.

    The actual splitting is delegated to a ``RecursiveCharacterTextSplitter``
    that is built once in ``__init__`` and reused for every ``split`` call.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        """Remember the sizing parameters and build the underlying splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

        # NOTE(review): "!" and "?" each appear twice in the separator list;
        # possibly full-width punctuation was intended for one pair (the list
        # also contains the CJK full stop) -- confirm against the upstream file.
        splitter_options = {
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.overlap,
            "length_function": len,  # lengths are measured with len()
            "separators": ["\n\n", "\n", "。", "!", "?", ".", "!", "?", " ", ""],
        }
        self.text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split(self, content: str) -> List[Chunk]:
        """Split ``content`` and wrap every resulting piece in a ``Chunk``.

        Args:
            content: The text to split.

        Returns:
            One ``Chunk`` per piece returned by the splitter, with ``content``
            set to that piece and ``id``, ``document_id``, ``embedding``,
            ``tags`` and ``topic`` all set to ``None``. A falsy ``content``
            yields an empty list after logging a warning.

        Raises:
            Exception: Any error raised while splitting or building chunks is
                logged (message and traceback) and then re-raised unchanged.
        """
        try:
            # Nothing to do for empty input; warn and bail out early.
            if not content:
                logger.warning("Empty content provided")
                return []

            logger.info(f"Starting to split content of length {len(content)}")

            # Delegate the splitting to the LangChain splitter, then wrap each
            # piece in a Chunk whose other fields are left unset.
            result = []
            for piece in self.text_splitter.split_text(content):
                result.append(
                    Chunk(
                        id=None,
                        document_id=None,
                        content=piece,
                        embedding=None,
                        tags=None,
                        topic=None,
                    )
                )

            logger.info(f"Split completed, created {len(result)} chunks")
            return result
        except Exception as exc:
            # Log the message and the full traceback, then let the caller
            # see the original exception.
            logger.error(f"Error in split method: {str(exc)}")
            logger.error(traceback.format_exc())
            raise
|