added API-Endpoints for files

This commit is contained in:
Niklas Mueller 2024-06-18 20:21:04 +02:00
parent 95f8488227
commit 16e5004228
7 changed files with 207 additions and 1 deletions

View file

@ -7,9 +7,13 @@ from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from dotenv import load_dotenv
from endpoints import files
from core.config import settings
# load env and disable several warnings
load_dotenv()
app = FastAPI()
app.add_middleware(
@ -20,6 +24,9 @@ app.add_middleware(
allow_headers=["*"],
)
# Mount the file endpoints router under the configured API prefix.
app.include_router(files.router, prefix=settings.API_V1_STR) # , tags=["files"]
# Debug output: a label line followed by the current value of the
# OPENSEARCH_USE_SSL environment variable (prints None when it is unset).
print('OPENSEARCH_USE_SSL')
print(os.getenv('OPENSEARCH_USE_SSL'))
# NOTE(review): this prints the literal text 'settings.API_V1_STR', not the
# setting's value; presumably a following line outside this view prints the
# value itself - confirm.
print('settings.API_V1_STR')

View file

@ -1,7 +1,14 @@
# config.py
import os
class Settings:
    """Application configuration, partly read from the environment.

    The environment-backed values are looked up once, when this class body is
    executed, and are ``None`` when the corresponding variable is not set.
    """

    # URL prefix for the API routes.
    API_V1_STR: str = "/api"
    # OS_INTERFACE: str = ""
    # Name of the object storage bucket (environment variable BUCKET_NAME).
    BUCKET: str = os.environ.get("BUCKET_NAME")
    # Key prefix for stored files (environment variable BUCKET_FILE_PATH).
    BUCKET_FILE_PATH: str = os.environ.get("BUCKET_FILE_PATH")


# Shared settings instance imported by the rest of the application.
settings = Settings()

View file

@ -0,0 +1,183 @@
"""API-Endpoints to interaction with the pdfs."""
import os
import sys
from typing import List
from io import BytesIO
import boto3
from fastapi import APIRouter, File, UploadFile, Form, HTTPException
from fastapi.responses import StreamingResponse
from core.config import settings
import logging
import datetime as dt
from datetime import datetime
import traceback
# Configure the root logger: DEBUG level, every record prefixed with the
# timestamp taken when this module was imported, written both to a dated
# debug log file and to stdout.
logging.basicConfig(
    level=logging.DEBUG,
    # level=logging.INFO,
    format=f"Start: {str(dt.datetime.now()).replace(' ', '_')} | %(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(f"/<path>-_{datetime.today().strftime('%Y-%m-%d')}_-_debug.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
# Router that collects the endpoints defined in this module.
router = APIRouter()

# S3-compatible client for the MinIO object storage. Endpoint and credentials
# are read from the environment; SSL is switched off for the connection.
minio_client = boto3.client(
    service_name="s3",
    endpoint_url=os.environ.get("S3_ENDPOINT"),
    aws_access_key_id=os.environ.get("S3_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("S3_SECRET_KEY"),
    use_ssl=False,
)
@router.get("/list-all-buckets", tags=["Object Storage"])
def list_buckets():
    """List the object storage buckets, creating the configured one if missing.

    When ``settings.BUCKET`` is not among the existing buckets it is created,
    and its name is added to the returned list so the response reflects the
    state of the storage after the call.

    Returns:
        dict: ``{"response": [<bucket name>, ...]}``

    Example Response:
        {
            "response": [
                "rag-bucket"
            ]
        }
    """
    logging.info("Listing all available buckets")
    response = minio_client.list_buckets()
    buckets = [bucket["Name"] for bucket in response["Buckets"]]

    if settings.BUCKET in buckets:
        logging.info("Bucket '%s' already exists", settings.BUCKET)
    else:
        logging.info("Bucket '%s' not found, creating it", settings.BUCKET)
        minio_client.create_bucket(Bucket=settings.BUCKET)
        # The listing above was taken before the bucket was created; add the
        # new bucket so the response is not stale.
        buckets.append(settings.BUCKET)

    return {"response": buckets}
@router.get("/list-all-pdfs", tags=["pdf"])
def list_pdf_files():
    """List the keys of all objects stored in the configured bucket.

    The listing is paginated, so buckets holding more objects than a single
    ``list_objects_v2`` response can carry are returned completely.

    Returns:
        dict: ``{"message": "success", "count": <int>, "objects": [<key>, ...]}``;
        ``count`` is 0 and ``objects`` is empty for an empty bucket.

    Raises:
        HTTPException: 404 when the configured bucket does not exist.
    """
    logging.info("Listing all available files in object storage")

    paginator = minio_client.get_paginator("list_objects_v2")
    object_names = []
    try:
        # Pages are fetched lazily while iterating, so the iteration itself
        # has to sit inside the try block.
        for page in paginator.paginate(Bucket=settings.BUCKET):
            # A page of an empty bucket carries no "Contents" entry.
            object_names.extend(entry["Key"] for entry in page.get("Contents", []))
    except minio_client.exceptions.NoSuchBucket as error:
        logging.warning("No such MinIO bucket: '%s'", settings.BUCKET)
        raise HTTPException(status_code=404, detail="Bucket not found") from error

    if object_names:
        logging.info("Number of objects in MinIO: %s", len(object_names))
    else:
        logging.info("Bucket is empty")

    return {"message": "success", "count": len(object_names), "objects": object_names}
@router.get("/get-pdf", tags=["pdf"])
def get_pdf(file_path: str):
    """Retrieve a PDF from the object storage and return it.

    Args:
        file_path (str): Object key of the PDF inside the configured bucket.

    Returns:
        StreamingResponse: Response containing the PDF bytes with media type
        ``application/pdf``.

    Raises:
        HTTPException: 404 when the object storage reports the object as
            missing, 500 for any other failure while downloading.
    """
    buffer = BytesIO()
    try:
        minio_client.download_fileobj(settings.BUCKET, file_path, buffer)
    except minio_client.exceptions.ClientError as error:
        # A missing object is reported as error code "404" (HEAD request) or
        # "NoSuchKey" (GET request); map both to a proper 404 response.
        error_code = error.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey"):
            logging.warning("PDF not found in object storage: '%s'", file_path)
            raise HTTPException(status_code=404, detail="PDF not found") from error
        logging.exception("Retrieving PDF failed with error: %s", error)
        raise HTTPException(status_code=500, detail="Retrieving PDF failed") from error
    except Exception as error:
        # logging.exception records the message together with the stack trace.
        logging.exception("Retrieving PDF failed with error: %s", error)
        # HTTPException takes the status code first; the message goes in `detail`.
        raise HTTPException(status_code=500, detail="Retrieving PDF failed") from error

    # Rewind so the response streams the downloaded bytes from the beginning.
    buffer.seek(0)
    logging.info("PDF read successful")
    return StreamingResponse(buffer, media_type="application/pdf")
@router.post("/upload-pdf-list", tags=["pdf"], status_code=201)
def upload_pdf_list(tag: str = Form(...), pdf_files: List[UploadFile] = File(...)):
    """Upload one or more PDF documents to the object storage.

    Each file is stored in ``settings.BUCKET`` under the key
    ``<settings.BUCKET_FILE_PATH>/<uploaded filename>``.

    Args:
        tag (str): Industry tag given by the user. It is a required form
            field but is not used by this endpoint yet.
        pdf_files (List[UploadFile]): List of uploaded pdf files.

    Returns:
        dict: ``{"message": "success"}`` when every file was stored.

    Raises:
        HTTPException: 500 when storing a file raises an error, 400 when the
            object storage answered with an unexpected response for at least
            one file.
    """
    upload_failed = False
    logging.info("Number of files to be processed: %s", len(pdf_files))

    # Read every uploaded file and store it in the bucket.
    for pdf_file in pdf_files:
        logging.info("Processing file: %s", pdf_file.filename)
        object_name = f"{settings.BUCKET_FILE_PATH}/{pdf_file.filename}"
        pdf_contents = pdf_file.file.read()

        try:
            put_response = minio_client.put_object(
                Bucket=settings.BUCKET, Key=object_name, Body=pdf_contents
            )
        except Exception as error:
            # logging.exception records the message together with the stack trace.
            logging.exception("PDF upload failed with error: %s ", error)
            # HTTPException takes the status code first; the message goes in `detail`.
            raise HTTPException(status_code=500, detail="PDF upload failed") from error

        logging.info("Upload to object store successful")
        logging.debug(put_response)
        if isinstance(put_response, dict):
            logging.info("File upload successful")
        else:
            # Remember the failure but keep going with the remaining files.
            upload_failed = True

    if upload_failed:
        logging.error(
            "Error while uploading. At least one document was not processed correctly"
        )
        raise HTTPException(status_code=400, detail="Error while uploading documents")

    return {"message": "success"}