Merge pull request #28 from guardian/faster-transcription-code

Use tiny model to transcribe on CODE
guardian · Feb 14, 2024 · ce890a6
2 parents ef75b99 + 939a477
commit ce890a6
Show file tree

Hide file tree

Showing 4 changed files with 18 additions and 7 deletions.
diff --git a/.github/workflows/build-whisper-docker.yml b/.github/workflows/build-whisper-docker.yml
@@ -6,8 +6,8 @@ on:
   workflow_dispatch:
   push:
     paths:
-      - "whisper_container/Dockerfile"
-      - ".github/workflows/build-whisper-docker.yml"
+      - 'whisper_container/Dockerfile'
+      - '.github/workflows/build-whisper-docker.yml'
 
 # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
 env:
@@ -70,5 +70,5 @@ jobs:
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-
-
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/packages/worker/src/index.ts b/packages/worker/src/index.ts
@@ -89,6 +89,7 @@ const main = async () => {
 			ffmpegResult.wavPath,
 			fileToTranscribe,
 			numberOfThreads,
+			config.app.stage === 'PROD' ? 'medium' : 'tiny',
 		);
 
 		const transcriptionOutput: TranscriptionOutput = {

diff --git a/packages/worker/src/transcribe.ts b/packages/worker/src/transcribe.ts
@@ -13,6 +13,8 @@ interface FfmpegResult {
 	duration?: number;
 }
 
+export type WhisperModel = 'medium' | 'tiny';
+
 const CONTAINER_FOLDER = '/input';
 
 const runSpawnCommand = (
@@ -144,8 +146,14 @@ export const getTranscriptionText = async (
 	wavPath: string,
 	file: string,
 	numberOfThreads: number,
+	model: WhisperModel,
 ) => {
-	const resultFile = await transcribe(containerId, wavPath, numberOfThreads);
+	const resultFile = await transcribe(
+		containerId,
+		wavPath,
+		numberOfThreads,
+		model,
+	);
 	const transcriptText = readFile(
 		path.resolve(path.parse(file).dir, resultFile),
 	);
@@ -156,6 +164,7 @@ const transcribe = async (
 	containerId: string,
 	file: string,
 	numberOfThreads: number,
+	model: WhisperModel,
 ) => {
 	const outputFile = path.resolve(CONTAINER_FOLDER, path.parse(file).name);
 	console.log(`transcribe outputFile: ${outputFile}`);
@@ -166,7 +175,7 @@ const transcribe = async (
 			containerId,
 			'whisper.cpp/main',
 			'--model',
-			'whisper.cpp/models/ggml-medium.bin',
+			`whisper.cpp/models/ggml-${model}.bin`,
 			'--threads',
 			numberOfThreads.toString(),
 			'--file',

diff --git a/whisper_container/Dockerfile b/whisper_container/Dockerfile
@@ -5,7 +5,8 @@ LABEL com.theguardian.transcription-service.whisper-container="Whisper.cpp conta
 RUN apt-get update
 RUN apt-get install -y ffmpeg wget git build-essential
 RUN git clone https://github.com/ggerganov/whisper.cpp
-RUN cd whisper.cpp && make
+RUN cd whisper.cpp && git reset --hard v1.5.4 && make
+RUN bash /opt/whisper.cpp/models/download-ggml-model.sh tiny
 RUN bash /opt/whisper.cpp/models/download-ggml-model.sh medium
 
 # Large model not currently in use - but we might want to add it as an option at some point