From 7325f650b6622709e03e0422af09b69eabc7b3c9 Mon Sep 17 00:00:00 2001
From: Niklas Mueller
Date: Tue, 23 Jul 2024 18:23:54 +0200
Subject: [PATCH] INIT

---
 .gitignore       |  2 ++
 Dockerfile       | 22 ++++++++++++++++++
 README.md        | 18 +++++++++++++++
 init.sh          |  4 ++++
 requirements.txt |  3 +++
 runner.py        | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100755 init.sh
 create mode 100644 requirements.txt
 create mode 100644 runner.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a0f0e53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode
+.DS_Store
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..67f4109
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.11-slim
+
+# set the working directory
+WORKDIR /app
+RUN mkdir /app/input_files
+RUN mkdir /app/transcripts
+RUN apt-get update
+
+RUN apt-get install -y ffmpeg
+
+# install dependencies
+COPY ./requirements.txt /app
+RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+# copy model to container
+COPY ./large-v3.pt /root/.cache/whisper/large-v3.pt
+
+# copy the scripts to the /app folder
+COPY ./init.sh /app
+COPY ./runner.py /app
+
+CMD ["bash", "init.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9967a72
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# STT-Function
+With the Speech-to-Text (STT) Function you can transcribe a file, i.e. "convert" an audio/video file into text.
+Internally, OpenAI's Whisper (https://github.com/openai/whisper) is used to transcribe the audio file.
+
+
+## Structure
+* The container has two folders attached: the input folder with the files that should be transcribed, and the output folder where the transcripts are saved.
+
+
+## Setup
+Make sure [Podman](https://podman.io/docs/installation) or [Docker](https://docs.docker.com/get-docker/) is installed.
+
+Download the model into the folder where you will build the container image: use the [Download Link](https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt) or run `wget https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt`.
+
+```
+podman build -t stt-function .
+podman run -e LANGUAGE_CODE='de' -e WHISPER_MODEL='tiny' -v '/path/to/audio_video/file/':/app/input_files/ -v /output_path/of/transcript/:/app/transcripts/ --name stt-function_container --rm -t stt-function
+```
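+
+## Output
+For every input file, one plain-text transcript is written to the mounted transcripts folder. As a sketch of the naming scheme used in `runner.py` (the input file name `meeting.mp4` is only a hypothetical example), transcribing it with `LANGUAGE_CODE='de'` would produce:
+
+```
+/output_path/of/transcript/meeting_transcript_de_.txt
+```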
diff --git a/init.sh b/init.sh
new file mode 100755
index 0000000..eafe7bd
--- /dev/null
+++ b/init.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+env >> /etc/environment
+/usr/local/bin/python /app/runner.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..682f4a3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+openai-whisper
+pydub
+ffmpeg
\ No newline at end of file
diff --git a/runner.py b/runner.py
new file mode 100644
index 0000000..d7baddb
--- /dev/null
+++ b/runner.py
@@ -0,0 +1,60 @@
+import whisper
+import os
+import sys
+import logging
+import datetime as dt
+from datetime import datetime
+import traceback
+from pydub import AudioSegment
+
+env_var_language_code = os.environ['LANGUAGE_CODE']
+env_var_whisper_model = os.environ['WHISPER_MODEL']
+
+# Setup Logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    # level=logging.INFO,
+    format="Start: " + str(dt.datetime.now()).replace(" ", "_") + " | %(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.FileHandler("/var/log/" + str(datetime.today().strftime('%Y-%m-%d')) + "_-_cron.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+
+def get_audio_duration(file_path):
+    audio = AudioSegment.from_file(file_path)
+    duration_seconds = len(audio) / 1000
+    return duration_seconds
+
+
+try:
+    for root, dirs, files in os.walk('/app/input_files'):
+        for file in files:
+            try:
+                file_path = os.path.join(root, file)
+                logging.debug("#" * 32)
+                logging.debug(file_path)
+
+                duration = get_audio_duration(file_path)
+                logging.debug("Duration: " + str(duration) + " seconds")
+
+                model = whisper.load_model(env_var_whisper_model)
+                if env_var_language_code == "multi":
+                    result = model.transcribe(file_path)
+                else:
+                    result = model.transcribe(file_path, language=env_var_language_code, initial_prompt="")
+                logging.debug("result: " + str(result))
+                result_text = result["text"]
+                logging.debug("result text: " + result_text)
+
+                transcript_file = '/app/transcripts/' + file.split(".")[0] + '_transcript_' + env_var_language_code + '_.txt'
+                logging.debug("transcript file: " + str(transcript_file))
+                with open(transcript_file, 'w') as f:
+                    f.write(result_text)
+            except Exception as e:
+                logging.error("There was an error: " + str(e))
+                logging.error("Stacktrace: " + str(traceback.format_exc()))
+except Exception as e:
+    logging.error("There was an error: " + str(e))
+    logging.error("Stacktrace: " + str(traceback.format_exc()))