chunk size + chunk overlap in settings
This commit is contained in:
parent
e3b62b76f9
commit
c7d748f7fd
3 changed files with 18 additions and 3 deletions
8
.env
8
.env
|
|
@ -14,6 +14,7 @@ S3_ENDPOINT=http://localhost:9000
|
||||||
S3_ACCESS_KEY=admin
|
S3_ACCESS_KEY=admin
|
||||||
S3_SECRET_KEY=secret-admin-key
|
S3_SECRET_KEY=secret-admin-key
|
||||||
BUCKET_NAME=rag-bucket
|
BUCKET_NAME=rag-bucket
|
||||||
|
BUCKET_FILE_PATH=my-project-name
|
||||||
|
|
||||||
|
|
||||||
################################
|
################################
|
||||||
|
|
@ -34,7 +35,8 @@ OPENSEARCH_USE_SSL=True
|
||||||
LLM_OPTION=ollamallm
|
LLM_OPTION=ollamallm
|
||||||
|
|
||||||
# LLM_API_ENDPOINT=http://<ip-of-ollama-instance>:11434
|
# LLM_API_ENDPOINT=http://<ip-of-ollama-instance>:11434
|
||||||
LLM_API_ENDPOINT=http://127.0.0.1:11434
|
# LLM_API_ENDPOINT=http://127.0.0.1:11434
|
||||||
|
LLM_API_ENDPOINT=http://18.192.122.66:11434
|
||||||
|
|
||||||
# if required for your LLM setup (for self-hosted ollama not needed)
|
# if required for your LLM setup (for self-hosted ollama not needed)
|
||||||
LLM_API_KEY=placeholder
|
LLM_API_KEY=placeholder
|
||||||
|
|
@ -44,3 +46,7 @@ LLM_LANGUAGE=de
|
||||||
|
|
||||||
# LLM Model to be used - "mistral", "phi3", "llama3", etc. (Check available with <ip_of_ollama_instance>:11434/api/tags)
|
# LLM Model to be used - "mistral", "phi3", "llama3", etc. (Check available with <ip_of_ollama_instance>:11434/api/tags)
|
||||||
LLM_MODEL_NAME=mistral
|
LLM_MODEL_NAME=mistral
|
||||||
|
|
||||||
|
# CHUNKING
|
||||||
|
CHUNKING_CHUNK_SIZE=100
|
||||||
|
CHUNKING_CHUNK_OVERLAP=10
|
||||||
|
|
|
||||||
|
|
@ -17,5 +17,9 @@ class Settings:
|
||||||
LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME")
|
LLM_MODEL_NAME: str = os.getenv("LLM_MODEL_NAME")
|
||||||
LLM_LANGUAGE: str = os.getenv("LLM_LANGUAGE")
|
LLM_LANGUAGE: str = os.getenv("LLM_LANGUAGE")
|
||||||
|
|
||||||
|
# Chunking
|
||||||
|
CHUNKING_CHUNK_SIZE: int = os.getenv("CHUNKING_CHUNK_SIZE")
|
||||||
|
CHUNKING_CHUNK_OVERLAP: int = os.getenv("CHUNKING_CHUNK_OVERLAP")
|
||||||
|
|
||||||
|
|
||||||
settings = Settings()
|
settings = Settings()
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import os
|
||||||
import io
|
import io
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
|
|
||||||
|
from core.config import settings
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
|
|
||||||
from common_packages import logging
|
from common_packages import logging
|
||||||
|
|
@ -39,9 +40,10 @@ class PDFProcessor:
|
||||||
Returns:
|
Returns:
|
||||||
list: List containing tuples of the preprocessed text and their page numbers from the PDF.
|
list: List containing tuples of the preprocessed text and their page numbers from the PDF.
|
||||||
"""
|
"""
|
||||||
|
logger.debug("Text Chunking | Chunk Size: %s \t Chunk Overlap: %s", str(settings.CHUNKING_CHUNK_SIZE), str(settings.CHUNKING_CHUNK_OVERLAP))
|
||||||
splitter = RecursiveCharacterTextSplitter(
|
splitter = RecursiveCharacterTextSplitter(
|
||||||
chunk_size=100,
|
chunk_size=int(settings.CHUNKING_CHUNK_SIZE),
|
||||||
chunk_overlap=10,
|
chunk_overlap=int(settings.CHUNKING_CHUNK_OVERLAP),
|
||||||
length_function=self._calculate_spaces_length,
|
length_function=self._calculate_spaces_length,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -81,6 +83,9 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
|
||||||
pdf_processor = PDFProcessor()
|
pdf_processor = PDFProcessor()
|
||||||
processed_chunks = pdf_processor.chunk_text(text_pages)
|
processed_chunks = pdf_processor.chunk_text(text_pages)
|
||||||
|
|
||||||
|
logger.debug("text_pages: %s", str(text_pages))
|
||||||
|
logger.debug("processed_chunks: %s", str(processed_chunks))
|
||||||
|
|
||||||
chunks = [chunk for chunk, _ in processed_chunks]
|
chunks = [chunk for chunk, _ in processed_chunks]
|
||||||
pages = [page for _, page in processed_chunks]
|
pages = [page for _, page in processed_chunks]
|
||||||
logger.info("PDF processed. Number of chunks: %s", len(chunks))
|
logger.info("PDF processed. Number of chunks: %s", len(chunks))
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue