Spaces:
Sleeping
Sleeping
ftshijt
committed on
Commit
·
6c509e2
1
Parent(s):
37d87af
fix docker setup for sdk
Browse files- README.md +4 -4
- universal_metrics.yaml +46 -1
README.md
CHANGED
|
@@ -3,10 +3,10 @@ title: VERSA Speech & Audio Evaluation Demo
|
|
| 3 |
emoji: 🎙️
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version: 5.25.0
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
license: apache-2.0
|
| 11 |
---
|
| 12 |
|
|
@@ -51,4 +51,4 @@ If you use VERSA in your research, please cite:
|
|
| 51 |
primaryClass={cs.SD},
|
| 52 |
url={https://arxiv.org/abs/2412.17667},
|
| 53 |
}
|
| 54 |
-
```
|
|
|
|
| 3 |
emoji: 🎙️
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: indigo
|
| 6 |
+
sdk: docker
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
hf_oauth: false
|
| 10 |
license: apache-2.0
|
| 11 |
---
|
| 12 |
|
|
|
|
| 51 |
primaryClass={cs.SD},
|
| 52 |
url={https://arxiv.org/abs/2412.17667},
|
| 53 |
}
|
| 54 |
+
```
|
universal_metrics.yaml
CHANGED
|
@@ -155,4 +155,49 @@
|
|
| 155 |
fmin: 50
|
| 156 |
fmax: 8000 #14000
|
| 157 |
n_fft: 1024 # 1028
|
| 158 |
-
hop_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
fmin: 50
|
| 156 |
fmax: 8000 #14000
|
| 157 |
n_fft: 1024 # 1028
|
| 158 |
+
hop_size: 320
|
| 159 |
+
mel_bins: 64
|
| 160 |
+
window_size: 1024
|
| 161 |
+
# PROJECTION SPACE CONFIG
|
| 162 |
+
d_proj: 1024
|
| 163 |
+
temperature: 0.003
|
| 164 |
+
# TRAINING AND EVALUATION CONFIG
|
| 165 |
+
num_classes: 527
|
| 166 |
+
batch_size: 1024
|
| 167 |
+
demo: False
|
| 168 |
+
|
| 169 |
+
# Speaking rate calculation
|
| 170 |
+
# --speaking_rate: correct matching words/character counts
|
| 171 |
+
- name: speaking_rate
|
| 172 |
+
model_tag: default
|
| 173 |
+
beam_size: 1
|
| 174 |
+
text_cleaner: whisper_basic
|
| 175 |
+
|
| 176 |
+
# Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
|
| 177 |
+
- name: audiobox_aesthetics
|
| 178 |
+
batch_size: 1
|
| 179 |
+
cache_dir: versa_cache/audiobox
|
| 180 |
+
|
| 181 |
+
# ASR-match calculation
|
| 182 |
+
# --asr_match_error_rate: correct matching words/character counts
|
| 183 |
+
- name: asr_match
|
| 184 |
+
model_tag: default
|
| 185 |
+
beam_size: 1
|
| 186 |
+
text_cleaner: whisper_basic
|
| 187 |
+
|
| 188 |
+
# speaker related metrics
|
| 189 |
+
# -- spk_similarity: speaker cosine similarity
|
| 190 |
+
- name: speaker
|
| 191 |
+
model_tag: default
|
| 192 |
+
|
| 193 |
+
# asvspoof related metrics
|
| 194 |
+
# -- asvspoof_score: evaluate how the generated speech is likely to be classified by a deepfake classifier
|
| 195 |
+
- name: asvspoof_score
|
| 196 |
+
|
| 197 |
+
# signal related metrics
|
| 198 |
+
# -- sir: signal to interference ratio
|
| 199 |
+
# -- sar: signal to artifact ratio
|
| 200 |
+
# -- sdr: signal to distortion ratio
|
| 201 |
+
# -- ci-sdr: scale-invariant signal to distortion ratio
|
| 202 |
+
# -- si-snri: scale-invariant signal to noise ratio improvement
|
| 203 |
+
- name: signal_metric
|