From c7d748f7fdb2ee49d21f73c6ce6ec24befb1bcc3 Mon Sep 17 00:00:00 2001
From: Niklas Mueller
Date: Thu, 4 Jul 2024 18:01:44 +0200
Subject: [PATCH] chunk size + chunk overlap in settings

---
 .env                                      | 8 +++++++-
 rag-chat-backend/src/core/config.py       | 4 ++++
 rag-chat-backend/src/preprocessing/pdf.py | 9 +++++++--
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.env b/.env
index 31d7180..d01a2f2 100644
--- a/.env
+++ b/.env
@@ -14,6 +14,7 @@ S3_ENDPOINT=http://localhost:9000
 S3_ACCESS_KEY=admin
 S3_SECRET_KEY=secret-admin-key
 BUCKET_NAME=rag-bucket
+BUCKET_FILE_PATH=my-project-name
 
 
 ################################
@@ -34,7 +35,8 @@ OPENSEARCH_USE_SSL=True
 LLM_OPTION=ollamallm
 
 # LLM_API_ENDPOINT=http://:11434
-LLM_API_ENDPOINT=http://127.0.0.1:11434
+# LLM_API_ENDPOINT=http://127.0.0.1:11434
+LLM_API_ENDPOINT=http://18.192.122.66:11434
 
 # if required for you llm setup (for self hosted ollama not needed)
 LLM_API_KEY=placeholder
@@ -44,3 +46,7 @@ LLM_LANGUAGE=de
 
 # LLM Model to be used - "mistral", "phi3", "llama3", etc. (Check available with :11434/api/tags)
 LLM_MODEL_NAME=mistral
+
+# CHUNKING
+CHUNKING_CHUNK_SIZE=100
+CHUNKING_CHUNK_OVERLAP=10
diff --git a/rag-chat-backend/src/core/config.py b/rag-chat-backend/src/core/config.py
index be02525..6ebef36 100644
--- a/rag-chat-backend/src/core/config.py
+++ b/rag-chat-backend/src/core/config.py
@@ -17,5 +17,9 @@ class Settings:
     LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME")
     LLM_LANGUAGE: str = os.getenv("LLM_LANGUAGE")
 
+    # Chunking
+    CHUNKING_CHUNK_SIZE: int = os.getenv("CHUNKING_CHUNK_SIZE")
+    CHUNKING_CHUNK_OVERLAP: int = os.getenv("CHUNKING_CHUNK_OVERLAP")
+
 
 settings = Settings()
diff --git a/rag-chat-backend/src/preprocessing/pdf.py b/rag-chat-backend/src/preprocessing/pdf.py
index 75ac3ba..b367cce 100644
--- a/rag-chat-backend/src/preprocessing/pdf.py
+++ b/rag-chat-backend/src/preprocessing/pdf.py
@@ -4,6 +4,7 @@ import os
 import io
 
 import PyPDF2
+from core.config import settings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 from common_packages import logging
@@ -39,9 +40,10 @@ class PDFProcessor:
         Returns:
             list: List containing tuples of the preprocessed text and their page numbers from the PDF.
         """
+        logger.debug("Text Chunking | Chunk Size: %s \t Chunk Overlap: %s", str(settings.CHUNKING_CHUNK_SIZE), str(settings.CHUNKING_CHUNK_OVERLAP))
         splitter = RecursiveCharacterTextSplitter(
-            chunk_size=100,
-            chunk_overlap=10,
+            chunk_size=int(settings.CHUNKING_CHUNK_SIZE),
+            chunk_overlap=int(settings.CHUNKING_CHUNK_OVERLAP),
             length_function=self._calculate_spaces_length,
         )
 
@@ -81,6 +83,9 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
     pdf_processor = PDFProcessor()
     processed_chunks = pdf_processor.chunk_text(text_pages)
 
+    logger.debug("text_pages: %s", str(text_pages))
+    logger.debug("processed_chunks: %s", str(processed_chunks))
+
     chunks = [chunk for chunk, _ in processed_chunks]
     pages = [page for _, page in processed_chunks]
     logger.info("PDF processed. Number of chunks: %s", len(chunks))
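
A minimal sketch (not part of the patch) of how the two new chunking settings could be cast and defaulted once inside Settings, so callers such as pdf.py would not need the int() conversions; the 100/10 fallbacks mirror the previously hard-coded splitter arguments, and the surrounding class layout is assumed from the patched config.py:

    import os

    class Settings:
        # Chunking: os.getenv returns str (or None), so cast here and fall back
        # to the old hard-coded values when the variables are not set.
        CHUNKING_CHUNK_SIZE: int = int(os.getenv("CHUNKING_CHUNK_SIZE", "100"))
        CHUNKING_CHUNK_OVERLAP: int = int(os.getenv("CHUNKING_CHUNK_OVERLAP", "10"))

    settings = Settings()

With that in place, the splitter call in pdf.py could pass settings.CHUNKING_CHUNK_SIZE and settings.CHUNKING_CHUNK_OVERLAP directly to RecursiveCharacterTextSplitter.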