canary-qwen-2.5b

Running

App Files Files Community

piotrzelasko committed on Jul 2

Commit

eaf7016

1 Parent(s): f5a656a

Works

Browse files

Signed-off-by: Piotr Żelasko <[email protected]>

Files changed (2) hide show

app.py +81 -213
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -12,240 +12,108 @@ from nemo.collections.speechlm2 import SALM
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
-device = torch.device("cuda")
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
 with device:
     torch.set_default_dtype(torch.bfloat16)  # speed up start-up time
-    model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().cuda()
     torch.set_default_dtype(torch.float32)
-feature_stride = model.cfg.preprocessor['window_stride']
-model_stride_in_secs = feature_stride * 8 # 8 = model stride, which is 8 for FastConformer
-#frame_asr = FrameBatchMultiTaskAED(
-#	asr_model=model,
-#	frame_len=40.0,
-#	total_buffer=40.0,
-#	batch_size=16,
-#)
 def as_batches(audio_filepath, utt_id):
     rec = Recording.from_file(audio_filepath, recording_id=utt_id)
-	if rec.duration / 60.0 > MAX_AUDIO_MINUTES:
-		raise gr.Error(
-			f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
-			"If you wish, you may trim the audio using the Audio viewer in Step 1 "
-			"(click on the scissors icon to start trimming audio)."
-		)
-    return DynamicCutSampler(
-        cuts=rec.resample(SAMPLE_RATE).to_cut().cut_into_windows(40.0),
-        max_cuts=4,
-    )
-def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
-	if audio_filepath is None:
-		raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-	utt_id = uuid.uuid4()
     pred_text = []
     for batch in as_batches(audio_filepath, str(utt_id)):
         audio, audio_lens = batch.load_audio(collate=True)
-        output_ids = model.generate(
-            prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]],
-            audio=audio,
-            audio_lens=audio_lens,
-            max_new_tokens=256,
-        )
-        pred_text.extend(model.tokenizer.ids_to_text(oids) for ids in output_ids.cpu())
-	return ' '.join(pred_text)
-# add logic to make sure dropdown menus only suggest valid combos
-def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
-	"""Callback function for when src_lang or tgt_lang dropdown menus are changed.
-	Args:
-		src_lang_value(string), tgt_lang_value (string), pnc_value(bool) - the current
-			chosen "values" of each Gradio component
-	Returns:
-		src_lang, tgt_lang, pnc - these are the new Gradio components that will be displayed
-	Note: I found the required logic is easier to understand if you think about the possible src & tgt langs as
-	a matrix, e.g. with English, Spanish, French, German as the langs, and only transcription in the same language,
-	and X -> English and English -> X translation being allowed, the matrix looks like the diagram below ("Y" means it is
-	allowed to go into that state).
-	It is easier to understand the code if you think about which state you are in, given the current src_lang_value and
-	tgt_lang_value, and then which states you can go to from there.
-			tgt lang
-			- |EN |ES |FR |DE
-			------------------
-			EN| Y | Y | Y | Y
-			------------------
-		src 	ES| Y | Y |   |
-		lang	------------------
-			FR| Y |   | Y |
-			------------------
-			DE| Y |   |   | Y
-	"""
-	if src_lang_value == "English" and tgt_lang_value == "English":
-		# src_lang and tgt_lang can go anywhere
-		src_lang = gr.Dropdown(
-			choices=["English", "Spanish", "French", "German"],
-			value=src_lang_value,
-			label="Input audio is spoken in:"
-		)
-		tgt_lang = gr.Dropdown(
-			choices=["English", "Spanish", "French", "German"],
-			value=tgt_lang_value,
-			label="Transcribe in language:"
-		)
-	elif src_lang_value == "English":
-		# src is English & tgt is non-English
-		# => src can only be English or current tgt_lang_values
-		# & tgt can be anything
-		src_lang = gr.Dropdown(
-			choices=["English", tgt_lang_value],
-			value=src_lang_value,
-			label="Input audio is spoken in:"
-		)
-		tgt_lang = gr.Dropdown(
-			choices=["English", "Spanish", "French", "German"],
-			value=tgt_lang_value,
-			label="Transcribe in language:"
-		)
-	elif tgt_lang_value == "English":
-		# src is non-English & tgt is English
-		# => src can be anything
-		# & tgt can only be English or current src_lang_value
-		src_lang = gr.Dropdown(
-			choices=["English", "Spanish", "French", "German"],
-			value=src_lang_value,
-			label="Input audio is spoken in:"
-		)
-		tgt_lang = gr.Dropdown(
-			choices=["English", src_lang_value],
-			value=tgt_lang_value,
-			label="Transcribe in language:"
-		)
-	else:
-		# both src and tgt are non-English
-		# => both src and tgt can only be switch to English or themselves
-		src_lang = gr.Dropdown(
-			choices=["English", src_lang_value],
-			value=src_lang_value,
-			label="Input audio is spoken in:"
-		)
-		tgt_lang = gr.Dropdown(
-			choices=["English", tgt_lang_value],
-			value=tgt_lang_value,
-			label="Transcribe in language:"
-		)
-	# let pnc be anything if src_lang_value == tgt_lang_value, else fix to True
-	if src_lang_value == tgt_lang_value:
-		pnc = gr.Checkbox(
-			value=pnc_value,
-			label="Punctuation & Capitalization in transcript?",
-			interactive=True
-		)
-	else:
-		pnc = gr.Checkbox(
-			value=True,
-			label="Punctuation & Capitalization in transcript?",
-			interactive=False
-		)
-	return src_lang, tgt_lang, pnc
 with gr.Blocks(
-	title="NeMo Canary-Qwen-2.5B Model",
-	css="""
-		textarea { font-size: 18px;}
-		#model_output_text_box span {
-			font-size: 18px;
-			font-weight: bold;
-		}
-	""",
-	theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
-	gr.HTML("<h1 style='text-align: center'>NeMo Canary-Qwen-2.5B model: Transcribe audio</h1>")
-	with gr.Row():
-		with gr.Column():
-			gr.HTML(
-				"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
-				"<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
-				"You can transcribe longer files locally with this NeMo "
-				"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
-			)
-			audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
-			gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")
-			src_lang = gr.Dropdown(
-				choices=["English", "Spanish", "French", "German"],
-				value="English",
-				label="Input audio is spoken in:"
-			)
-			with gr.Column():
-				tgt_lang = gr.Dropdown(
-					choices=["English", "Spanish", "French", "German"],
-					value="English",
-					label="Transcribe in language:"
-				)
-				pnc = gr.Checkbox(
-					value=True,
-					label="Punctuation & Capitalization in transcript?",
-				)
-		with gr.Column():
-			gr.HTML("<p><b>Step 3:</b> Run the model.</p>")
-			go_button = gr.Button(
-				value="Run model",
-				variant="primary", # make "primary" so it stands out (default is "secondary")
-			)
-			model_output_text_box = gr.Textbox(
-				label="Model Output",
-				elem_id="model_output_text_box",
-			)
-	with gr.Row():
-		gr.HTML(
-			"<p style='text-align: center'>"
-				"🐤 <a href='https://huggingface.co/nvidia/canary-qwen-2.5b' target='_blank'>Canary model</a> | "
-				"🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
-			"</p>"
-		)
-	go_button.click(
-		fn=transcribe,
-		inputs = [audio_file, src_lang, tgt_lang, pnc],
-		outputs = [model_output_text_box]
-	)
-	# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
-	src_lang.change(
-		fn=on_src_or_tgt_lang_change,
-		inputs=[src_lang, tgt_lang, pnc],
-		outputs=[src_lang, tgt_lang, pnc],
-	)
-	tgt_lang.change(
-		fn=on_src_or_tgt_lang_change,
-		inputs=[src_lang, tgt_lang, pnc],
-		outputs=[src_lang, tgt_lang, pnc],
-	)
 demo.queue()

 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+if torch.cuda.is_available():  # run on the GPU when PyTorch reports one
+    device = torch.device("cuda")
+else:  # otherwise fall back to CPU
+    device = torch.device("cpu")
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_MINUTES = 10 # won't try to transcribe if longer than this
+CHUNK_SECONDS = 40.0  # max audio length seen by the model
+BATCH_SIZE = 4  # for parallel transcription of audio longer than CHUNK_SECONDS
 with device:  # load the model with `device` active as a context manager
     torch.set_default_dtype(torch.bfloat16)  # speed up start-up time
+    model = SALM.from_pretrained("nvidia/canary-qwen-2.5b").bfloat16().eval().to(device)
     torch.set_default_dtype(torch.float32)  # switch the default dtype back to float32 once the model is loaded
 def as_batches(audio_filepath, utt_id):
+    """Split one recording into batches of fixed-length audio windows.
+
+    The file at ``audio_filepath`` is loaded as a recording with id
+    ``utt_id``, resampled to SAMPLE_RATE, downmixed to mono when it has more
+    than one channel, and cut into windows of CHUNK_SECONDS.
+
+    Returns:
+        A DynamicCutSampler over those windows, built with
+        ``max_cuts=BATCH_SIZE``.
+
+    Raises:
+        gr.Error: if the recording is longer than MAX_AUDIO_MINUTES.
+    """
     rec = Recording.from_file(audio_filepath, recording_id=utt_id)
+    # rec.duration is divided by 60 to compare against a limit given in minutes
+    if rec.duration / 60.0 > MAX_AUDIO_MINUTES:
+        raise gr.Error(
+            f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+            "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+            "(click on the scissors icon to start trimming audio)."
+        )
+    cut = rec.resample(SAMPLE_RATE).to_cut()
+    # multi-channel input is downmixed so every window is single-channel
+    if cut.num_channels > 1:
+        cut = cut.to_mono(mono_downmix=True)
+    return DynamicCutSampler(cut.cut_into_windows(CHUNK_SECONDS), max_cuts=BATCH_SIZE)
+def transcribe(audio_filepath):
+    """Transcribe an audio file and return the text as one string.
+
+    The audio is processed batch by batch (see ``as_batches``). Every window
+    in a batch gets the same "Transcribe the following" prompt, and the
+    decoded outputs of all windows are joined with single spaces.
+
+    Raises:
+        gr.Error: if ``audio_filepath`` is None, or (via ``as_batches``) if
+            the recording is longer than MAX_AUDIO_MINUTES.
+    """
+    if audio_filepath is None:
+        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+    utt_id = uuid.uuid4()  # fresh random id used as the recording id for this request
     pred_text = []
     for batch in as_batches(audio_filepath, str(utt_id)):
         audio, audio_lens = batch.load_audio(collate=True)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                # the single-turn prompt is repeated once per item in the batch
+                prompts=[[{"role": "user", "content": f"Transcribe the following: {model.audio_locator_tag}"}]] * len(batch),
+                audios=torch.as_tensor(audio).to(device, non_blocking=True),
+                audio_lens=torch.as_tensor(audio_lens).to(device, non_blocking=True),
+                max_new_tokens=256,
+            )
+        # move the generated ids to CPU, then decode each sequence to text
+        pred_text.extend(model.tokenizer.ids_to_text(oids) for oids in output_ids.cpu())
+    return ' '.join(pred_text)
 with gr.Blocks(
+    title="NeMo Canary-Qwen-2.5B Model",
+    css="""
+        textarea { font-size: 18px;}
+        #model_output_text_box span {
+            font-size: 18px;
+            font-weight: bold;
+        }
+    """,
+    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) # make text slightly bigger (default is text_md )
 ) as demo:
+    gr.HTML("<h1 style='text-align: center'>NeMo Canary-Qwen-2.5B model: Transcribe audio</h1>")
+    with gr.Row():
+        with gr.Column():
+            gr.HTML(
+                "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+                "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+                "You can transcribe longer files locally with NeMo. "
+                #"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
+            )
+            audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
+        with gr.Column():
+            gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
+            go_button = gr.Button(
+                value="Run model",
+                variant="primary", # make "primary" so it stands out (default is "secondary")
+            )
+            model_output_text_box = gr.Textbox(
+                label="Model Output",
+                elem_id="model_output_text_box",
+            )
+    with gr.Row():
+        gr.HTML(
+            "<p style='text-align: center'>"
+                "🐤 <a href='https://huggingface.co/nvidia/canary-qwen-2.5b' target='_blank'>Canary model</a> | "
+                "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
+            "</p>"
+        )
+    go_button.click(
+        fn=transcribe,
+        inputs=[audio_file],
+        outputs=[model_output_text_box]
+    )
 demo.queue()

requirements.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo.git

 nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo.git
+sacrebleu
+seaborn