chunk size + chunk overlap in settings

This commit is contained in:
Niklas Mueller 2024-07-04 18:01:44 +02:00
parent e3b62b76f9
commit c7d748f7fd
3 changed files with 18 additions and 3 deletions

View file

@ -17,5 +17,9 @@ class Settings:
LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME")
LLM_LANGUAGE: str = os.getenv("LLM_LANGUAGE")
# Chunking
# os.getenv() yields a string (or None when the variable is unset), so the
# values are converted here to actually match the `int` annotation. An unset
# or empty variable falls back to the sizes that used to be hard-coded in the
# text splitter (100 / 10) instead of failing later on int(None).
CHUNKING_CHUNK_SIZE: int = int(os.getenv("CHUNKING_CHUNK_SIZE") or 100)
CHUNKING_CHUNK_OVERLAP: int = int(os.getenv("CHUNKING_CHUNK_OVERLAP") or 10)
settings = Settings()

View file

@ -4,6 +4,7 @@ import os
import io
import PyPDF2
from core.config import settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from common_packages import logging
@ -39,9 +40,10 @@ class PDFProcessor:
Returns:
list: List containing tuples of the preprocessed text and their page numbers from the PDF.
"""
logger.debug("Text Chunking | Chunk Size: %s \t Chunk Overlap: %s", str(settings.CHUNKING_CHUNK_SIZE), str(settings.CHUNKING_CHUNK_OVERLAP))
splitter = RecursiveCharacterTextSplitter(
chunk_size=100,
chunk_overlap=10,
chunk_size=int(settings.CHUNKING_CHUNK_SIZE),
chunk_overlap=int(settings.CHUNKING_CHUNK_OVERLAP),
length_function=self._calculate_spaces_length,
)
@ -81,6 +83,9 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
pdf_processor = PDFProcessor()
processed_chunks = pdf_processor.chunk_text(text_pages)
logger.debug("text_pages: %s", str(text_pages))
logger.debug("processed_chunks: %s", str(processed_chunks))
chunks = [chunk for chunk, _ in processed_chunks]
pages = [page for _, page in processed_chunks]
logger.info("PDF processed. Number of chunks: %s", len(chunks))