generalize logging

This commit is contained in:
Niklas Mueller 2024-07-03 10:14:23 +02:00
parent a47fe71bce
commit e737c3895c
2 changed files with 10 additions and 19 deletions

View file

@ -35,7 +35,7 @@ def create_logger(log_level: str, logger_name: str = "custom_logger"):
# Create a formatter and set it for the console handler
formatter = logging.Formatter(
"%(asctime)s - %(levelname)s [%(name)s] - %(message)s",
"%(asctime)s [%(name)s] | %(levelname)s\t - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
console_handler.setFormatter(formatter)

View file

@ -1,26 +1,17 @@
"""Module for tools to process PDF documents"""
import os
import sys
import io
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging
import datetime as dt
from datetime import datetime
import traceback
from common_packages import logging
# Set up logging
logging.basicConfig(
level=logging.DEBUG,
# level=logging.INFO,
format="Start: " + str(dt.datetime.now()).replace(" ", "_") + " | %(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("/<path>-_" + str(datetime.today().strftime('%Y-%m-%d')) + "_-_debug.log"),
logging.StreamHandler(sys.stdout)
]
# instantiate logger
logger = logging.create_logger(
log_level=os.getenv("LOGGING_LEVEL", "INFO"),
logger_name=__name__,
)
@ -73,11 +64,11 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
Returns:
tuple of lists: (List of chunked text, List of corresponding page numbers).
"""
logging.info("Reading PDF document")
logger.info("Reading PDF document")
pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
num_pages = len(pdf_reader.pages)
logging.info("Read PDF document with '%s' pages", num_pages)
logger.info("Read PDF document with '%s' pages", num_pages)
text_pages = []
for i in range(num_pages):
@ -86,12 +77,12 @@ def read_pdf(pdf_bytes: io.BytesIO) -> tuple:
if text:
text_pages.append((text, i + 1))
logging.info("Processing PDF content")
logger.info("Processing PDF content")
pdf_processor = PDFProcessor()
processed_chunks = pdf_processor.chunk_text(text_pages)
chunks = [chunk for chunk, _ in processed_chunks]
pages = [page for _, page in processed_chunks]
logging.info("PDF processed. Number of chunks: %s", len(chunks))
logger.info("PDF processed. Number of chunks: %s", len(chunks))
return chunks, pages