From c7d748f7fdb2ee49d21f73c6ce6ec24befb1bcc3 Mon Sep 17 00:00:00 2001
From: Niklas Mueller
Date: Thu, 4 Jul 2024 18:01:44 +0200
Subject: [PATCH] chunk size + chunk overlap in settings

---
 .env                                      | 8 +++++++-
 rag-chat-backend/src/core/config.py       | 4 ++++
 rag-chat-backend/src/preprocessing/pdf.py | 9 +++++++--
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.env b/.env
index 31d7180..d01a2f2 100644
--- a/.env
+++ b/.env
@@ -14,6 +14,7 @@ S3_ENDPOINT=http://localhost:9000
 S3_ACCESS_KEY=admin
 S3_SECRET_KEY=secret-admin-key
 BUCKET_NAME=rag-bucket
+BUCKET_FILE_PATH=my-project-name
 
 
 ################################
@@ -34,7 +35,8 @@ OPENSEARCH_USE_SSL=True
 LLM_OPTION=ollamallm
 
 # LLM_API_ENDPOINT=http://:11434
-LLM_API_ENDPOINT=http://127.0.0.1:11434
+# LLM_API_ENDPOINT=http://127.0.0.1:11434
+LLM_API_ENDPOINT=http://18.192.122.66:11434
 
 # if required for you llm setup (for self hosted ollama not needed)
 LLM_API_KEY=placeholder
@@ -44,3 +46,7 @@ LLM_LANGUAGE=de
 
 # LLM Model to be used - "mistral", "phi3", "llama3", etc. (Check available with :11434/api/tags)
 LLM_MODEL_NAME=mistral
+
+# CHUNKING
+CHUNKING_CHUNK_SIZE=100
+CHUNKING_CHUNK_OVERLAP=10
diff --git a/rag-chat-backend/src/core/config.py b/rag-chat-backend/src/core/config.py
index be02525..6ebef36 100644
--- a/rag-chat-backend/src/core/config.py
+++ b/rag-chat-backend/src/core/config.py
@@ -17,5 +17,9 @@ class Settings:
     LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME")
     LLM_LANGUAGE: str = os.getenv("LLM_LANGUAGE")
 
+    # Chunking
+    CHUNKING_CHUNK_SIZE: int = os.getenv("CHUNKING_CHUNK_SIZE")
+    CHUNKING_CHUNK_OVERLAP: int = os.getenv("CHUNKING_CHUNK_OVERLAP")
+
 
 settings = Settings()
diff --git a/rag-chat-backend/src/preprocessing/pdf.py b/rag-chat-backend/src/preprocessing/pdf.py
index 75ac3ba..b367cce 100644
--- a/rag-chat-backend/src/preprocessing/pdf.py
+++ b/rag-chat-backend/src/preprocessing/pdf.py
@@ -4,6 +4,7 @@ import os
 import io
 
 import PyPDF2
+from core.config import settings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 from common_packages import logging
@@ -39,9 +40,10 @@ class PDFProcessor:
         Returns:
             list: List containing tuples of the preprocessed text and their page numbers from the PDF.
         """
+        logger.debug("Text Chunking | Chunk Size: %s \t Chunk Overlap: %s", str(settings.CHUNKING_CHUNK_SIZE), str(settings.CHUNKING_CHUNK_OVERLAP))
         splitter = RecursiveCharacterTextSplitter(
-            chunk_size=100,
-            chunk_overlap=10,
+            chunk_size=int(settings.CHUNKING_CHUNK_SIZE),
+            chunk_overlap=int(settings.CHUNKING_CHUNK_OVERLAP),
             length_function=self._calculate_spaces_length,
         )
 
@@ -81,6 +83,9 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
     pdf_processor = PDFProcessor()
     processed_chunks = pdf_processor.chunk_text(text_pages)
 
+    logger.debug("text_pages: %s", str(text_pages))
+    logger.debug("processed_chunks: %s", str(processed_chunks))
+
     chunks = [chunk for chunk, _ in processed_chunks]
     pages = [page for _, page in processed_chunks]
     logger.info("PDF processed. Number of chunks: %s", len(chunks))
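
A minimal sketch (not part of the patch) of how the two new chunking settings could be cast and defaulted once inside Settings, so callers such as pdf.py would not need the int() conversions; the 100/10 fallbacks mirror the previously hard-coded splitter arguments, and the surrounding class layout is assumed from the patched config.py:

    import os

    class Settings:
        # Chunking: os.getenv returns str (or None), so cast here and fall back
        # to the old hard-coded values when the variables are not set.
        CHUNKING_CHUNK_SIZE: int = int(os.getenv("CHUNKING_CHUNK_SIZE", "100"))
        CHUNKING_CHUNK_OVERLAP: int = int(os.getenv("CHUNKING_CHUNK_OVERLAP", "10"))

    settings = Settings()

With that in place, the splitter call in pdf.py could pass settings.CHUNKING_CHUNK_SIZE and settings.CHUNKING_CHUNK_OVERLAP directly to RecursiveCharacterTextSplitter.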