Spaces:

tan-z-tan
/

speech_language_detection

Sleeping

App Files Files Community

tan-z-tan committed on Jun 21, 2024

Commit

3d444ab

1 Parent(s): 1ecc4f1

Fix

Browse files

Files changed (2) hide show

app.py +82 -35
whisper.py +5 -2

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from whisper import transcribe
 # アプリケーションの状態を保持する変数
 data = []
 current_chunk = []
 SAMPLING_RATE = 16000
@@ -30,8 +31,83 @@ def resample_audio(audio, orig_sr, target_sr=16000):
     return audio
 def process_audio(audio, chunk_duration, language_set):
-    global data, current_chunk, SAMPLING_RATE
     print("Process_audio")
     print(audio)
     if audio is None:
@@ -60,39 +136,10 @@ def process_audio(audio, chunk_duration, language_set):
         audio_sec += chunk_duration
         print(f"Processing audio chunk of length {len(chunk)}")
-        volume_norm = np.linalg.norm(chunk)
-        length = len(chunk) / SAMPLING_RATE  # 音声データの長さ（秒）
-        s = datetime.now()
-        selected_scores, all_scores = identify_languages(chunk, language_set)
-        lang_id_time = (datetime.now() - s).total_seconds()
-        # 日本語と英語の確率値を取得
-        ja_prob = selected_scores['Japanese']
-        en_prob = selected_scores['English']
-        ja_en = 'ja' if ja_prob > en_prob else 'en'
-        # Top 3言語を取得
-        top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
-        # テキストの認識
-        s = datetime.now()
-        transcription = transcribe(chunk)
-        transcribe_time = (datetime.now() - s).total_seconds()
-        data.append({
-            "Time": audio_sec,
-            "Length (s)": length,
-            "Volume": volume_norm,
-            "Japanese_English": f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})",
-            "Language": top3_languages,
-            "Lang ID Time": lang_id_time,
-            "Transcribe Time": transcribe_time,
-            "Text": transcription,
-        })
-        df = pd.DataFrame(data)
-        yield (SAMPLING_RATE, chunk), df
     # 未処理の残りのデータを保持
     current_chunk = [total_chunk]
@@ -119,7 +166,7 @@ with gr.Blocks() as demo:
     with gr.TabItem("Microphone"):
         gr.Interface(
-            fn=process_audio,
             inputs=inputs_stream,
             outputs=outputs,
             live=True,

 # アプリケーションの状態を保持する変数
 data = []
+data_df = pd.DataFrame()
 current_chunk = []
 SAMPLING_RATE = 16000
     return audio
+def process_chunk(chunk, language_set) -> pd.DataFrame:  # Analyse one audio chunk (language ID + transcription) and return the metrics as a one-row DataFrame.
+    print(f"Processing audio chunk of length {len(chunk)}")
+    volume_norm = np.linalg.norm(chunk)  # np.linalg.norm with default arguments; reported in the "Volume" column
+    length = len(chunk) / SAMPLING_RATE  # length of the audio data (seconds)
+    s = datetime.now()  # start of the language-identification timing
+    selected_scores, all_scores = identify_languages(chunk, language_set)
+    lang_id_time = (datetime.now() - s).total_seconds()
+    # Get the probability values for Japanese and English
+    ja_prob = selected_scores['Japanese']  # NOTE(review): presumably a KeyError if identify_languages omits 'Japanese' - confirm it is always returned
+    en_prob = selected_scores['English']  # NOTE(review): same assumption for 'English' - confirm
+    ja_en = 'ja' if ja_prob > en_prob else 'en'  # a tie resolves to 'en'
+    # Get the top 3 languages by score, formatted as "name (score)"
+    top3_languages = ", ".join([f"{lang} ({all_scores[lang]:.2f})" for lang in sorted(all_scores, key=all_scores.get, reverse=True)[:3]])
+    # Text recognition
+    s = datetime.now()  # start of the transcription timing
+    transcription = transcribe(chunk, language=ja_en)  # transcribe using whichever of ja/en scored higher
+    transcribe_time = (datetime.now() - s).total_seconds()
+    return pd.DataFrame({  # every value is a one-element list, so the frame has exactly one row
+        "Length (s)": [length],
+        "Volume": [volume_norm],
+        "Japanese_English": [f"{ja_en} ({ja_prob:.2f}, {en_prob:.2f})"],
+        "Language": [top3_languages],
+        "Lang ID Time": [lang_id_time],
+        "Transcribe Time": [transcribe_time],
+        "Text": [transcription],
+    })
+def process_audio_stream(audio, chunk_duration, language_set):  # Streaming handler: buffer incoming audio and analyse at most one chunk per call once enough samples have accumulated.
+    global data_df, current_chunk, SAMPLING_RATE
+    print("Process_audio_stream")
+    if audio is None:
+        return None, data_df  # no new audio: return the accumulated table unchanged
+    sr, audio_data = audio  # audio is unpacked as (sample rate, samples)
+    # Split the comma-separated language_set string into a list of trimmed names
+    language_set = [lang.strip() for lang in language_set.split(",")]
+    print(audio_data.shape, audio_data.dtype)
+    # Align the sampling rate first, before anything else
+    audio_data = resample_audio(audio_data, sr, target_sr=SAMPLING_RATE)
+    audio_sec = 0  # NOTE(review): only incremented below and never read in this function
+    # Volume normalization
+    audio_data = normalize_audio(audio_data)
+    current_chunk.append(audio_data)
+    total_chunk = np.concatenate(current_chunk)  # all samples buffered so far, including this call's audio
+    # Process once the buffer has reached chunk_duration seconds of samples
+    if len(total_chunk) >= SAMPLING_RATE * chunk_duration:
+        chunk = total_chunk[:SAMPLING_RATE * chunk_duration]  # assumes chunk_duration is an int so the slice bound is an integer - TODO confirm against the UI input
+        total_chunk = total_chunk[SAMPLING_RATE * chunk_duration:]  # leftover samples beyond the processed chunk
+        audio_sec += chunk_duration
+        df = process_chunk(chunk, language_set)
+        data_df = pd.concat([data_df, df], ignore_index=True)  # append the new row to the module-level results table
+        current_chunk = [total_chunk]  # keep only the leftover samples for the next call
+        return (SAMPLING_RATE, chunk), data_df
+    else:
+        return (SAMPLING_RATE, total_chunk), data_df  # not enough audio yet: return everything buffered so far with the unchanged table
 def process_audio(audio, chunk_duration, language_set):
+    global data, data_df, current_chunk, SAMPLING_RATE
+    # reset state
+    data = []
+    current_chunk = []
     print("Process_audio")
     print(audio)
     if audio is None:
         audio_sec += chunk_duration
         print(f"Processing audio chunk of length {len(chunk)}")
+        df = process_chunk(chunk, language_set)
+        data_df = pd.concat([data_df, df], ignore_index=True)
+        yield (SAMPLING_RATE, chunk), data_df
     # 未処理の残りのデータを保持
     current_chunk = [total_chunk]
     with gr.TabItem("Microphone"):
         gr.Interface(
+            fn=process_audio_stream,
             inputs=inputs_stream,
             outputs=outputs,
             live=True,

whisper.py CHANGED Viewed

@@ -13,9 +13,12 @@ model.to(device)
 SAMPLING_RATE = 16000
-def transcribe(chunk: np.ndarray) -> str:
     input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
-    predicted_ids = model.generate(input_features)
     transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
     print(transcriptions)
     return "\n".join(transcriptions)

 SAMPLING_RATE = 16000
+def transcribe(chunk: np.ndarray, language: str = "en") -> str:  # Transcribe one audio chunk, passing decoder prompt ids for the given language to generate().
+    # Build the decoder prompt ids for the requested language and the "transcribe" task
+    forced_decoder_ids = processor.tokenizer.get_decoder_prompt_ids(language=language, task="transcribe")
     input_features = processor(chunk, sampling_rate=SAMPLING_RATE, return_tensors="pt").input_features.to(device)
+    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
     transcriptions = processor.batch_decode(predicted_ids, skip_special_tokens=True)
     print(transcriptions)
     return "\n".join(transcriptions)  # decoded strings joined with newlines into a single str