From 7325f650b6622709e03e0422af09b69eabc7b3c9 Mon Sep 17 00:00:00 2001
From: Niklas Mueller
Date: Tue, 23 Jul 2024 18:23:54 +0200
Subject: [PATCH] INIT

---
 .gitignore       |  2 ++
 Dockerfile       | 22 ++++++++++++++++++
 README.md        | 18 +++++++++++++++
 init.sh          |  4 ++++
 requirements.txt |  3 +++
 runner.py        | 60 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 109 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100755 init.sh
 create mode 100644 requirements.txt
 create mode 100644 runner.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a0f0e53
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.vscode
+.DS_Store
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..67f4109
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.11-slim
+
+# set the working directory
+WORKDIR /app
+RUN mkdir /app/input_files
+RUN mkdir /app/transcripts
+RUN apt-get update
+
+RUN apt-get install -y ffmpeg
+
+# install dependencies
+COPY ./requirements.txt /app
+RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --no-cache-dir --upgrade -r requirements.txt
+
+# copy model to container
+COPY ./large-v3.pt /root/.cache/whisper/large-v3.pt
+
+# copy the scripts to the /app folder
+COPY ./init.sh /app
+COPY ./runner.py /app
+
+CMD ["bash", "init.sh"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9967a72
--- /dev/null
+++ b/README.md
@@ -0,0 +1,18 @@
+# STT-Function
+With the Speech-to-Text (STT) Function you can transcribe a file, i.e. "convert" an audio/video file into text.
+Internally, OpenAI's Whisper (https://github.com/openai/whisper) is used to transcribe the audio file.
+
+
+## Structure
+* The container has two folders attached: the input folder with the files that should be transcribed, and the output folder where the transcripts are saved.
+
+
+## Setup
+Make sure [Podman](https://podman.io/docs/installation) or [Docker](https://docs.docker.com/get-docker/) is installed.
+
+Download the model into the folder where you will build the container image: use the [Download Link](https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt) or run `wget https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt`.
+
+```
+podman build -t stt-function .
+podman run -e LANGUAGE_CODE='de' -e WHISPER_MODEL='tiny' -v '/path/to/audio_video/file/':/app/input_files/ -v /output_path/of/transcript/:/app/transcripts/ --name stt-function_container --rm -t stt-function
+```
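+
+## Output
+For every input file, one plain-text transcript is written to the mounted transcripts folder. As a sketch of the naming scheme used in `runner.py` (the input file name `meeting.mp4` is only a hypothetical example), transcribing it with `LANGUAGE_CODE='de'` would produce:
+
+```
+/output_path/of/transcript/meeting_transcript_de_.txt
+```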
diff --git a/init.sh b/init.sh
new file mode 100755
index 0000000..eafe7bd
--- /dev/null
+++ b/init.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+env >> /etc/environment
+/usr/local/bin/python /app/runner.py
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..682f4a3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+openai-whisper
+pydub
+ffmpeg
\ No newline at end of file
diff --git a/runner.py b/runner.py
new file mode 100644
index 0000000..d7baddb
--- /dev/null
+++ b/runner.py
@@ -0,0 +1,60 @@
+import whisper
+import os
+import sys
+import logging
+import datetime as dt
+from datetime import datetime
+import traceback
+from pydub import AudioSegment
+
+env_var_language_code = os.environ['LANGUAGE_CODE']
+env_var_whisper_model = os.environ['WHISPER_MODEL']
+
+# Setup Logging
+logging.basicConfig(
+    level=logging.DEBUG,
+    # level=logging.INFO,
+    format="Start: " + str(dt.datetime.now()).replace(" ", "_") + " | %(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.FileHandler("/var/log/" + str(datetime.today().strftime('%Y-%m-%d')) + "_-_cron.log"),
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+
+def get_audio_duration(file_path):
+    audio = AudioSegment.from_file(file_path)
+    duration_seconds = len(audio) / 1000
+    return duration_seconds
+
+
+try:
+    for root, dirs, files in os.walk('/app/input_files'):
+        for file in files:
+            try:
+                file_path = os.path.join(root, file)
+                logging.debug("#" * 32)
+                logging.debug(file_path)
+
+                duration = get_audio_duration(file_path)
+                logging.debug("Duration: " + str(duration) + " seconds")
+
+                model = whisper.load_model(env_var_whisper_model)
+                if env_var_language_code == "multi":
+                    result = model.transcribe(file_path)
+                else:
+                    result = model.transcribe(file_path, language=env_var_language_code, initial_prompt="")
+                logging.debug("result: " + str(result))
+                result_text = result["text"]
+                logging.debug("result text: " + result_text)
+
+                transcript_file = '/app/transcripts/' + file.split(".")[0] + '_transcript_' + env_var_language_code + '_.txt'
+                logging.debug("transcript file: " + str(transcript_file))
+                with open(transcript_file, 'w') as f:
+                    f.write(result_text)
+            except Exception as e:
+                logging.error("There was an error: " + str(e))
+                logging.error("Stacktrace: " + str(traceback.format_exc()))
+except Exception as e:
+    logging.error("There was an error: " + str(e))
+    logging.error("Stacktrace: " + str(traceback.format_exc()))