Commit
fw + vad segments.
boocmp committed Aug 16, 2024
1 parent a28d1d6 commit 40befd2
Showing 5 changed files with 107 additions and 104 deletions.

Dockerfile (2 changes: 1 addition & 1 deletion)

@@ -25,7 +25,7 @@ RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloa
 RUN set -eux && \
     apt-get update -y && \
     apt-get install -q -y --no-install-recommends --allow-remove-essential \
-    ca-certificates gnupg2 bash build-essential git
+    ca-certificates gnupg2 bash build-essential libsndfile1 ffmpeg
 
 RUN \
     set -eux && \

env/python/requirements.txt (1 change: 0 additions & 1 deletion)

@@ -7,4 +7,3 @@ pydantic
 pydantic-settings
 six
 msgspec
-whisperx
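
Note: with whisperx dropped from the requirements, the runner below leans entirely on faster-whisper. A minimal import check of the surface this commit uses (assuming a faster-whisper release current as of this commit, e.g. 1.0.x, which ships these VAD helpers):

    # Everything the new runner imports; all names come from faster-whisper itself.
    from faster_whisper import WhisperModel, decode_audio
    from faster_whisper.vad import get_speech_timestamps, collect_chunks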

src/configuration.yaml (2 changes: 1 addition & 1 deletion)

@@ -2,4 +2,4 @@ runners:
   resources:
     cpu: 4
     nvidia.com/gpu: 1
-  workers_per_resource: 6
+  workers_per_resource: 12

src/runners/audio_transcriber.py (181 changes: 84 additions & 97 deletions)

@@ -42,6 +42,8 @@ def transcribe_audio(self, audio, lang):
 from pydantic import BaseModel
 import numpy as np
 import io
+from datetime import datetime
+from faster_whisper.vad import get_speech_timestamps, collect_chunks
 
 
 class BatchInput(BaseModel):
@@ -50,129 +52,114 @@ class BatchInput(BaseModel):
     lang: str = "en"


"""
class BatchableAudioTranscriber(bentoml.Runnable):
SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
SUPPORTS_CPU_MULTI_THREADING = True
class BatchOutput(BaseModel):
text: str
batched_count: int
merge_audio_time: float
transcribe_time: float
restore_time: float

def __init__(self):
device = "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
compute_type = (
"int8_float16" if ctranslate2.get_cuda_device_count() > 0 else "int8"
)

print(device, " ", compute_type)
class BatchItem(BaseModel):
start_time: float
end_time: float
chunks_count: int
transcription: str = ""

model = "base.en"
self.model = whisper.load_model(model_identifier=model, backend='CTranslate2')
def transcribe(self, audio):
segments, info = self.model.transcribe(
audio,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500),
condition_on_previous_text=False,
language="en",
)
return segments
@bentoml.Runnable.method(batchable=True)
def transcribe_audio(self, inputs: list[BatchInput]) -> list[str]:
if len(inputs) == 1:
segments = self.transcribe(io.BytesIO(inputs[0].audio))
text = ""
for segment in segments:
text += segment.text
return [text]
MAX_SILENCE = 16000 * 30
# merging audio
audio_batch = np.ndarray(1, dtype=np.float32)
for input in inputs:
wav = decode_audio(io.BytesIO(input.audio))
wav = np.append(wav, np.zeros(MAX_SILENCE - len(wav), dtype=np.float32))
audio_batch = np.append(audio_batch, wav)
segments, info = self.model.g(
audio_batch,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=250),
condition_on_previous_text=False,
word_timestamps=True,
language="en",
)
result = []
print("inputs ", len(inputs))
for segment in segments:
print(segment.start, " -> ", segment.end, " : ", segment.text)
for word in segment.words:
print(" ", word.start, " -> ", word.end, " : ", word)
if segment.start >= 30 * len(result):
result.append("")
if segment.end < 30 * (len(result)):
result[-1] += segment.text
return result
"""

import whisperx as whisper
def add(self, word):
if (
self.chunks_count > 0
and word.start >= self.start_time
and word.end <= self.end_time
):
self.transcription += word.word


 class BatchableAudioTranscriber(bentoml.Runnable):
     SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
     SUPPORTS_CPU_MULTI_THREADING = True
 
     def __init__(self):
-        self.device = "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
+        device = "cuda" if ctranslate2.get_cuda_device_count() > 0 else "cpu"
         compute_type = "float16" if ctranslate2.get_cuda_device_count() > 0 else "int8"
 
-        print(self.device, " ", compute_type)
+        print(device, " ", compute_type)
 
         model = "base.en"
-        self.model = whisper.load_model(
-            whisper_arch=model,
-            device=self.device,
-            compute_type=compute_type,
-            download_root="/home/bentoml/models",
-        )
+        self.model = WhisperModel(model, device=device, compute_type=compute_type)
 
     def transcribe(self, audios):
-        result = self.model.transcribe(audios, batch_size=8, language="en")
-        return result["segments"]
+        segments, info = self.model.transcribe(
+            audios,
+            vad_filter=False,
+            vad_parameters=dict(min_silence_duration_ms=250),
+            language="en",
+            condition_on_previous_text=False,
+            word_timestamps=True,
+            no_speech_threshold=10,
+        )
+        return segments
 
     @bentoml.Runnable.method(batchable=True)
     def transcribe_audio(self, inputs: list[BatchInput]) -> list[str]:
-        if len(inputs) == 1:
-            segments = self.transcribe(decode_audio(io.BytesIO(inputs[0].audio)))
-
-            text = ""
-            for segment in segments:
-                text += segment["text"]
-            return [text]
+        result = []
 
-        # merging audio
-        MAX_SILENCE = 16000 * 30
+        ts = datetime.now()
+
+        # merging audio
+        batch_list = []
         audio_batch = np.ndarray(1, dtype=np.float32)
         for input in inputs:
             wav = decode_audio(io.BytesIO(input.audio))
-            wav = np.append(wav, np.zeros(MAX_SILENCE - len(wav), dtype=np.float32))
-            audio_batch = np.append(audio_batch, wav)
+
+            chunks = get_speech_timestamps(wav)
+            if len(chunks) == 0:
+                batch_list.append(
+                    BatchItem(
+                        start_time=len(audio_batch) / 16000.0,
+                        end_time=len(audio_batch) / 16000.0,
+                        chunks_count=0,
+                    )
+                )
+            else:
+                wav = collect_chunks(wav, chunks=chunks)
+                wav = np.append(wav, np.zeros(16000, dtype=np.float32))
+                batch_list.append(
+                    BatchItem(
+                        start_time=len(audio_batch) / 16000.0,
+                        end_time=(len(audio_batch) + len(wav)) / 16000.0,
+                        chunks_count=len(chunks),
+                    )
+                )
+                audio_batch = np.append(audio_batch, wav)
 
+        for item in batch_list:
+            print(item)
+
+        merge_time = (datetime.now() - ts).total_seconds()
+
+        ts = datetime.now()
         segments = self.transcribe(audio_batch)
+        transcribe_time = (datetime.now() - ts).total_seconds()
 
-        result = []
-        for segment in segments:
-            if segment["start"] + 0.1 >= 30 * len(result):
-                result.append("")
-            if segment["end"] < 30 * (len(result)):
-                result[-1] += segment["text"]
+        ts = datetime.now()
+        output = [segment for segment in segments]
 
+        for segment in output:
+            for word in segment.words:
+                for item in batch_list:
+                    item.add(word)
+
+        restore_time = (datetime.now() - ts).total_seconds()
 
+        for item in batch_list:
+            result.append(
+                BatchOutput(
+                    text=item.transcription,
+                    batched_count=len(inputs),
+                    merge_audio_time=merge_time,
+                    transcribe_time=transcribe_time,
+                    restore_time=restore_time,
+                )
+            )
 
         return result
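
The rewritten transcribe_audio, condensed: for each clip, VAD finds the speech regions, the speech-only audio plus a one-second silence separator is appended to a single merged buffer, and the clip's [start, end) window within that buffer is recorded in a BatchItem; one word-timestamped transcription of the buffer then routes every word back to the clip whose window contains it. A hedged, self-contained sketch of the same scheme (transcribe_merged, windows, and SAMPLE_RATE are illustrative names, not from the repo; the faster-whisper calls mirror the diff and assume a 1.0.x release):

    import io

    import numpy as np
    from faster_whisper import WhisperModel, decode_audio
    from faster_whisper.vad import collect_chunks, get_speech_timestamps

    SAMPLE_RATE = 16000  # decode_audio resamples to 16 kHz by default


    def transcribe_merged(model: WhisperModel, clips: list[bytes]) -> list[str]:
        windows = []  # (start_s, end_s) of each clip inside the merged buffer
        merged = np.zeros(1, dtype=np.float32)

        for raw in clips:
            wav = decode_audio(io.BytesIO(raw))
            chunks = get_speech_timestamps(wav)  # VAD: sample ranges containing speech
            if not chunks:
                t = len(merged) / SAMPLE_RATE
                windows.append((t, t))  # empty window: no word can fall inside it
                continue
            speech = collect_chunks(wav, chunks=chunks)  # concatenate speech-only audio
            speech = np.append(speech, np.zeros(SAMPLE_RATE, dtype=np.float32))  # 1 s gap
            start = len(merged) / SAMPLE_RATE
            windows.append((start, (len(merged) + len(speech)) / SAMPLE_RATE))
            merged = np.append(merged, speech)

        segments, _info = model.transcribe(
            merged,
            language="en",
            condition_on_previous_text=False,
            word_timestamps=True,  # required to map words back to their source clip
        )

        texts = ["" for _ in clips]
        for segment in segments:
            for word in segment.words:
                for i, (start, end) in enumerate(windows):
                    if word.start >= start and word.end <= end:
                        texts[i] += word.word
        return texts

Compared with the old path, which padded every clip to a full 30 seconds and sliced segments at 30-second boundaries, packing only the VAD-detected speech keeps the merged buffer short and the per-clip windows exact; the trade-off is that a word whose timestamps straddle a window boundary is silently dropped, which the one-second separators exist to make unlikely.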

src/stt_api.py (25 changes: 21 additions & 4 deletions)

@@ -21,7 +21,7 @@
 runner_audio_transcriber = bentoml.Runner(
     BatchableAudioTranscriber,
     name="audio_transcriber",
-    max_batch_size=32,
+    max_batch_size=16,
 )


@@ -72,11 +72,28 @@ async def handleUpstream(
         )
         process_time = datetime.now() - process_time
 
-        text = transciption[0]
-        if text:
+        out = transciption[0]
+        print(
+            pair,
+            " : ",
+            out.batched_count,
+            "",
+            out.merge_audio_time,
+            " ",
+            out.transcribe_time,
+            " ",
+            out.restore_time,
+        )
+
+        if out.text:
             await pipe.push(
                 ipc.messages.Text(
-                    text, False, len(mic_data), process_time.total_seconds()
+                    out.text,
+                    False,
+                    len(mic_data),
+                    out.merge_audio_time
+                    + out.transcribe_time
+                    + out.restore_time,
                 )
             )
     finally:
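
The handler still measures process_time locally but now reports the runner's own per-stage timings in the pushed message; a tiny helper mirroring that arithmetic (BatchOutput is the pydantic model added in audio_transcriber.py above):

    def runner_seconds(out: "BatchOutput") -> float:
        # Wall-clock spent inside the runner: VAD merge + transcription + word routing.
        return out.merge_audio_time + out.transcribe_time + out.restore_time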
