initial commit

- README.md +5 -7
- app.py +75 -0
- face_emotion_detection.py +124 -0
- facial_analysis.py +334 -0
- packages.txt +1 -0
- requirements.txt +6 -0
- vid_to_wav.py +17 -0
README.md CHANGED
```diff
@@ -1,13 +1,11 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Speech Evaluation
+emoji: 💬
+colorFrom: gray
+colorTo: blue
 sdk: gradio
-sdk_version: 3.
+sdk_version: 3.23.0
 app_file: app.py
 pinned: false
-license: cc-by-nc-sa-4.0
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
app.py ADDED
```python
import gradio as gr
import torch.cuda
import whisper
from whisper.tokenizer import LANGUAGES
from vid_to_wav import extract_audio
from face_emotion_detection import process_video

gpu = torch.cuda.is_available()
model = None


def analyze_transcription(text, duration):
    word_count = len(text.split())
    analysis_text = "The video is {} sec. long and the speaker speaks {} words. ".format(
        duration, word_count)
    duration_in_min = duration / 60
    words_per_min = round(word_count / duration_in_min)
    analysis_text = analysis_text + "The speech speed is {} words-per-minute. ".format(words_per_min)
    if words_per_min < 130:
        analysis_text = analysis_text + "The speaker has spoken more slowly than average speakers."
    elif words_per_min > 150:
        analysis_text = analysis_text + "The speaker has spoken faster than average speakers."
    else:
        analysis_text = analysis_text + "The speaker maintains a normal speed, making the speech comprehensible to most audiences!"
    return analysis_text


def transcribe(filepath, language, task):
    print(filepath)
    video = process_video(filepath)
    audio, audio_file, duration = extract_audio(filepath)
    language = None if language == "Detect" else language
    text = model.transcribe(
        audio_file, task=task.lower(), language=language, fp16=gpu,
    )["text"].strip()
    return video, text, analyze_transcription(text, duration)


def get_interface(model_name="medium"):
    global model
    model = whisper.load_model(model_name)

    return gr.Interface(
        fn=transcribe,
        inputs=[
            # gr.Audio(label="Record", source="microphone", type="filepath"),
            gr.Video(label="Upload", source="upload", type="filepath"),
            gr.Dropdown(
                label="Language",
                choices=["Detect"] + sorted([i.title()
                                             for i in LANGUAGES.values()]),
                value="Detect",
            ),
            gr.Dropdown(
                label="Task",
                choices=["Transcribe", "Translate"],
                value="Transcribe",
                info="Whether to perform X->X speech recognition or X->English translation",
            ),
        ],
        outputs=[gr.Video(label="Emotion Analysis"),
                 gr.Textbox(label="Transcription", lines=26),
                 gr.Textbox(label="Speech Analysis", lines=4)],
        # theme=gr.themes.Default(),
        theme=gr.themes.Glass(
            primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.purple),
        title="Whisper is listening to you",
        # description=DESCRIPTION,
        allow_flagging="never",
    )


demo = get_interface()
demo.queue().launch(debug=True)
```
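As a quick sanity check of the pacing thresholds in `analyze_transcription`, the sketch below feeds it a made-up 140-word transcript over a made-up 60-second duration; it assumes the function has been pasted into a REPL, since importing app.py directly would also launch the Gradio demo.

```python
# Hypothetical input: 140 words over 60 s -> 140 words per minute,
# which lands in the 130-150 range handled by the "normal speed" branch.
sample_text = " ".join(["word"] * 140)
print(analyze_transcription(sample_text, 60))
# -> "... The speech speed is 140 words-per-minute. The speaker maintains a normal speed ..."
```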
face_emotion_detection.py ADDED
```python
import argparse
import datetime
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Model, Sequential, load_model, model_from_json
from tensorflow.compat.v1.keras.backend import set_session
from facial_analysis import FacialImageProcessing


class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)


def initialize():
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    set_session(sess)


def mobilenet_preprocess_input(x, **kwargs):
    x[..., 0] -= 103.939
    x[..., 1] -= 116.779
    x[..., 2] -= 123.68
    return x


def detect_emotion(frame_bgr):
    imgProcessing = FacialImageProcessing(False)
    model = load_model('./models/affectnet_emotions/mobilenet_7.h5')
    # print(model.summary())
    preprocessing_function = mobilenet_preprocess_input
    INPUT_SIZE = (224, 224)
    idx_to_class = {0: 'Anger', 1: 'Disgust', 2: 'Fear',
                    3: 'Happiness', 4: 'Neutral', 5: 'Sadness', 6: 'Surprise'}

    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    bounding_boxes, points = imgProcessing.detect_faces(frame)
    points = points.T
    detections = {"id": str(datetime.datetime.now())}

    for bbox, p in zip(bounding_boxes, points):
        face_pred = {}
        box = bbox.astype(np.int32)
        x1, y1, x2, y2 = box[0:4]
        face_img = frame[y1:y2, x1:x2, :]
        try:
            face_img = cv2.resize(face_img, INPUT_SIZE)
        except:
            break
        inp = face_img.astype(np.float32)
        inp[..., 0] -= 103.939
        inp[..., 1] -= 116.779
        inp[..., 2] -= 123.68
        inp = np.expand_dims(inp, axis=0)
        scores = model.predict(inp)[0]
        frame = cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 9, 12), 4)
        cv2.putText(frame, idx_to_class[np.argmax(scores)] + ' ' + str(scores[np.argmax(
            scores)]), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)
        face_pred["face_bbox"] = [x1, y1, x2, y2]
        face_pred["emotion_predicted"] = idx_to_class[np.argmax(scores)]
        all_scores = {}
        for i in range(len(scores)):
            all_scores[str(idx_to_class[i])] = scores[i]
        face_pred["scores"] = all_scores

        detections["face"] = face_pred
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    print(detections)
    return frame, detections


def process_video(video):
    basename = os.path.basename(video)
    name_only = os.path.splitext(basename)[0]
    video_outputpath = os.path.join('./output', basename)
    json_outputpath = os.path.join('./output', name_only + '.json')

    # Write per-frame detections to the JSON output file
    with open(json_outputpath, "w") as jsonfile:
        videocap = cv2.VideoCapture(video)
        ret, frame = videocap.read()
        fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        fps = 24.0
        size = (frame.shape[1], frame.shape[0])
        out = cv2.VideoWriter(video_outputpath, fourcc, fps, size)
        # for i in range(len(image_array)):
        #     out.write(image_array[i])
        max_frame = 500
        cnt = 0
        while ret and cnt < 50:
            processed_frame, detections = detect_emotion(frame)
            json_object = json.dumps(detections, indent=4, cls=NpEncoder)
            jsonfile.write(json_object)
            cv2.imshow('img', np.array(processed_frame, dtype=np.uint8))
            out.write(processed_frame)
            ret, frame = videocap.read()
            cv2.waitKey(1)
            cnt += 1
        videocap.release()
        cv2.destroyAllWindows()
    return out


def main():
    parser = argparse.ArgumentParser(description='Analysis of Video')
    parser.add_argument(
        '-v', '--video', help='Video to be analysed', required=True)
    args = parser.parse_args()
    process_video(args.video)


if __name__ == '__main__':
    main()
```
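The `main()` entry point also allows running the annotator outside the Space (`python face_emotion_detection.py -v sample.mp4`). A hedged sketch of the equivalent programmatic call, where sample.mp4 is a placeholder video and the `./output` directory and `mobilenet_7.h5` weights are assumed to exist:

```python
# Hypothetical standalone use; writes ./output/sample.mp4 (annotated frames)
# and ./output/sample.json (per-frame detections) as coded above.
from face_emotion_detection import process_video

writer = process_video("sample.mp4")
```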
facial_analysis.py ADDED
```python
# Reduced version of file https://github.com/HSE-asavchenko/HSE_FaceRec_tf/blob/master/age_gender_identity/facial_analysis.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
import argparse
import tensorflow as tf
import numpy as np
import cv2
import time

import subprocess, re


def is_specialfile(path, exts):
    _, file_extension = os.path.splitext(path)
    return file_extension.lower() in exts


img_extensions = ['.jpg', '.jpeg', '.png']
def is_image(path):
    return is_specialfile(path, img_extensions)


video_extensions = ['.mov', '.avi']
def is_video(path):
    return is_specialfile(path, video_extensions)


class FacialImageProcessing:
    # minsize: minimum of faces' size
    def __init__(self, print_stat=False, minsize=32):
        self.print_stat = print_stat
        self.minsize = minsize

        models_path, _ = os.path.split(os.path.realpath(__file__))
        models_path = os.path.join(models_path, 'models', 'face_detection')
        model_files = {os.path.join(models_path, 'mtcnn.pb'): ''}

        with tf.Graph().as_default() as full_graph:
            for model_file in model_files:
                tf.import_graph_def(FacialImageProcessing.load_graph_def(model_file), name=model_files[model_file])
        self.sess = tf.compat.v1.Session(graph=full_graph)  # ,config=tf.ConfigProto(device_count={'CPU':1,'GPU':0}))
        self.pnet, self.rnet, self.onet = FacialImageProcessing.load_mtcnn(self.sess, full_graph)

    def close(self):
        self.sess.close()

    @staticmethod
    def load_graph_def(frozen_graph_filename):
        graph_def = None
        with tf.io.gfile.GFile(frozen_graph_filename, 'rb') as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        return graph_def

    @staticmethod
    def load_graph(frozen_graph_filename, prefix=''):
        graph_def = FacialImageProcessing.load_graph_def(frozen_graph_filename)
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, name=prefix)
        return graph

    @staticmethod
    def load_mtcnn(sess, graph):
        pnet_out_1 = graph.get_tensor_by_name('pnet/conv4-2/BiasAdd:0')
        pnet_out_2 = graph.get_tensor_by_name('pnet/prob1:0')
        pnet_in = graph.get_tensor_by_name('pnet/input:0')

        rnet_out_1 = graph.get_tensor_by_name('rnet/conv5-2/conv5-2:0')
        rnet_out_2 = graph.get_tensor_by_name('rnet/prob1:0')
        rnet_in = graph.get_tensor_by_name('rnet/input:0')

        onet_out_1 = graph.get_tensor_by_name('onet/conv6-2/conv6-2:0')
        onet_out_2 = graph.get_tensor_by_name('onet/conv6-3/conv6-3:0')
        onet_out_3 = graph.get_tensor_by_name('onet/prob1:0')
        onet_in = graph.get_tensor_by_name('onet/input:0')

        pnet_fun = lambda img: sess.run((pnet_out_1, pnet_out_2), feed_dict={pnet_in: img})
        rnet_fun = lambda img: sess.run((rnet_out_1, rnet_out_2), feed_dict={rnet_in: img})
        onet_fun = lambda img: sess.run((onet_out_1, onet_out_2, onet_out_3), feed_dict={onet_in: img})
        return pnet_fun, rnet_fun, onet_fun

    @staticmethod
    def bbreg(boundingbox, reg):
        # calibrate bounding boxes
        if reg.shape[1] == 1:
            reg = np.reshape(reg, (reg.shape[2], reg.shape[3]))

        w = boundingbox[:, 2] - boundingbox[:, 0] + 1
        h = boundingbox[:, 3] - boundingbox[:, 1] + 1
        b1 = boundingbox[:, 0] + reg[:, 0] * w
        b2 = boundingbox[:, 1] + reg[:, 1] * h
        b3 = boundingbox[:, 2] + reg[:, 2] * w
        b4 = boundingbox[:, 3] + reg[:, 3] * h
        boundingbox[:, 0:4] = np.transpose(np.vstack([b1, b2, b3, b4]))
        return boundingbox

    @staticmethod
    def generateBoundingBox(imap, reg, scale, t):
        # use heatmap to generate bounding boxes
        stride = 2
        cellsize = 12

        imap = np.transpose(imap)
        dx1 = np.transpose(reg[:, :, 0])
        dy1 = np.transpose(reg[:, :, 1])
        dx2 = np.transpose(reg[:, :, 2])
        dy2 = np.transpose(reg[:, :, 3])
        y, x = np.where(imap >= t)
        if y.shape[0] == 1:
            dx1 = np.flipud(dx1)
            dy1 = np.flipud(dy1)
            dx2 = np.flipud(dx2)
            dy2 = np.flipud(dy2)
        score = imap[(y, x)]
        reg = np.transpose(np.vstack([dx1[(y, x)], dy1[(y, x)], dx2[(y, x)], dy2[(y, x)]]))
        if reg.size == 0:
            reg = np.empty((0, 3))
        bb = np.transpose(np.vstack([y, x]))
        q1 = np.fix((stride * bb + 1) / scale)
        q2 = np.fix((stride * bb + cellsize - 1 + 1) / scale)
        boundingbox = np.hstack([q1, q2, np.expand_dims(score, 1), reg])
        return boundingbox, reg

    # function pick = nms(boxes,threshold,type)
    @staticmethod
    def nms(boxes, threshold, method):
        if boxes.size == 0:
            return np.empty((0, 3))
        x1 = boxes[:, 0]
        y1 = boxes[:, 1]
        x2 = boxes[:, 2]
        y2 = boxes[:, 3]
        s = boxes[:, 4]
        area = (x2 - x1 + 1) * (y2 - y1 + 1)
        I = np.argsort(s)
        pick = np.zeros_like(s, dtype=np.int16)
        counter = 0
        while I.size > 0:
            i = I[-1]
            pick[counter] = i
            counter += 1
            idx = I[0:-1]
            xx1 = np.maximum(x1[i], x1[idx])
            yy1 = np.maximum(y1[i], y1[idx])
            xx2 = np.minimum(x2[i], x2[idx])
            yy2 = np.minimum(y2[i], y2[idx])
            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            if method == 'Min':
                o = inter / np.minimum(area[i], area[idx])
            else:
                o = inter / (area[i] + area[idx] - inter)
            I = I[np.where(o <= threshold)]
        pick = pick[0:counter]
        return pick

    # function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h)
    @staticmethod
    def pad(total_boxes, w, h):
        # compute the padding coordinates (pad the bounding boxes to square)
        tmpw = (total_boxes[:, 2] - total_boxes[:, 0] + 1).astype(np.int32)
        tmph = (total_boxes[:, 3] - total_boxes[:, 1] + 1).astype(np.int32)
        numbox = total_boxes.shape[0]

        dx = np.ones((numbox), dtype=np.int32)
        dy = np.ones((numbox), dtype=np.int32)
        edx = tmpw.copy().astype(np.int32)
        edy = tmph.copy().astype(np.int32)

        x = total_boxes[:, 0].copy().astype(np.int32)
        y = total_boxes[:, 1].copy().astype(np.int32)
        ex = total_boxes[:, 2].copy().astype(np.int32)
        ey = total_boxes[:, 3].copy().astype(np.int32)

        tmp = np.where(ex > w)
        edx.flat[tmp] = np.expand_dims(-ex[tmp] + w + tmpw[tmp], 1)
        ex[tmp] = w

        tmp = np.where(ey > h)
        edy.flat[tmp] = np.expand_dims(-ey[tmp] + h + tmph[tmp], 1)
        ey[tmp] = h

        tmp = np.where(x < 1)
        dx.flat[tmp] = np.expand_dims(2 - x[tmp], 1)
        x[tmp] = 1

        tmp = np.where(y < 1)
        dy.flat[tmp] = np.expand_dims(2 - y[tmp], 1)
        y[tmp] = 1

        return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph

    # function [bboxA] = rerec(bboxA)
    @staticmethod
    def rerec(bboxA):
        # convert bboxA to square
        h = bboxA[:, 3] - bboxA[:, 1]
        w = bboxA[:, 2] - bboxA[:, 0]
        l = np.maximum(w, h)
        bboxA[:, 0] = bboxA[:, 0] + w * 0.5 - l * 0.5
        bboxA[:, 1] = bboxA[:, 1] + h * 0.5 - l * 0.5
        bboxA[:, 2:4] = bboxA[:, 0:2] + np.transpose(np.tile(l, (2, 1)))
        return bboxA

    def detect_faces(self, img):
        # im: input image
        # threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold
        threshold = [0.6, 0.7, 0.9]  # three steps's threshold
        # fastresize: resize img from last scale (using in high-resolution images) if fastresize==true
        factor = 0.709  # scale factor
        factor_count = 0
        total_boxes = np.empty((0, 9))
        points = np.array([])
        h = img.shape[0]
        w = img.shape[1]
        minl = np.amin([h, w])
        m = 12.0 / self.minsize
        minl = minl * m
        # create scale pyramid
        scales = []
        while minl >= 12:
            scales += [m * np.power(factor, factor_count)]
            minl = minl * factor
            factor_count += 1

        # first stage
        # t=time.time()
        for j in range(len(scales)):
            scale = scales[j]
            hs = int(np.ceil(h * scale))
            ws = int(np.ceil(w * scale))
            im_data = cv2.resize(img, (ws, hs), interpolation=cv2.INTER_AREA)
            im_data = (im_data - 127.5) * 0.0078125
            img_x = np.expand_dims(im_data, 0)
            img_y = np.transpose(img_x, (0, 2, 1, 3))
            out = self.pnet(img_y)
            out0 = np.transpose(out[0], (0, 2, 1, 3))
            out1 = np.transpose(out[1], (0, 2, 1, 3))

            boxes, _ = FacialImageProcessing.generateBoundingBox(out1[0, :, :, 1].copy(), out0[0, :, :, :].copy(), scale, threshold[0])

            # inter-scale nms
            pick = FacialImageProcessing.nms(boxes.copy(), 0.5, 'Union')
            if boxes.size > 0 and pick.size > 0:
                boxes = boxes[pick, :]
                total_boxes = np.append(total_boxes, boxes, axis=0)
        numbox = total_boxes.shape[0]
        # elapsed = time.time() - t
        # print('1 phase nb=%d elapsed=%f'%(numbox,elapsed))
        if numbox > 0:
            pick = FacialImageProcessing.nms(total_boxes.copy(), 0.7, 'Union')
            total_boxes = total_boxes[pick, :]
            regw = total_boxes[:, 2] - total_boxes[:, 0]
            regh = total_boxes[:, 3] - total_boxes[:, 1]
            qq1 = total_boxes[:, 0] + total_boxes[:, 5] * regw
            qq2 = total_boxes[:, 1] + total_boxes[:, 6] * regh
            qq3 = total_boxes[:, 2] + total_boxes[:, 7] * regw
            qq4 = total_boxes[:, 3] + total_boxes[:, 8] * regh
            total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, total_boxes[:, 4]]))
            total_boxes = FacialImageProcessing.rerec(total_boxes.copy())
            total_boxes[:, 0:4] = np.fix(total_boxes[:, 0:4]).astype(np.int32)
            dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = FacialImageProcessing.pad(total_boxes.copy(), w, h)

        numbox = total_boxes.shape[0]
        # elapsed = time.time() - t
        # print('2 phase nb=%d elapsed=%f'%(numbox,elapsed))
        if numbox > 0:
            # second stage
            tempimg = np.zeros((24, 24, 3, numbox))
            for k in range(0, numbox):
                tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
                tmp[dy[k]-1:edy[k], dx[k]-1:edx[k], :] = img[y[k]-1:ey[k], x[k]-1:ex[k], :]
                if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
                    tempimg[:, :, :, k] = cv2.resize(tmp, (24, 24), interpolation=cv2.INTER_AREA)
                else:
                    return np.empty()
            tempimg = (tempimg - 127.5) * 0.0078125
            tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
            out = self.rnet(tempimg1)
            out0 = np.transpose(out[0])
            out1 = np.transpose(out[1])
            score = out1[1, :]
            ipass = np.where(score > threshold[1])
            total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)])
            mv = out0[:, ipass[0]]
            if total_boxes.shape[0] > 0:
                pick = FacialImageProcessing.nms(total_boxes, 0.7, 'Union')
                total_boxes = total_boxes[pick, :]
                total_boxes = FacialImageProcessing.bbreg(total_boxes.copy(), np.transpose(mv[:, pick]))
                total_boxes = FacialImageProcessing.rerec(total_boxes.copy())

        numbox = total_boxes.shape[0]
        # elapsed = time.time() - t
        # print('3 phase nb=%d elapsed=%f'%(numbox,elapsed))
        if numbox > 0:
            # third stage
            total_boxes = np.fix(total_boxes).astype(np.int32)
            dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = FacialImageProcessing.pad(total_boxes.copy(), w, h)
            tempimg = np.zeros((48, 48, 3, numbox))
            for k in range(0, numbox):
                tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3))
                tmp[dy[k]-1:edy[k], dx[k]-1:edx[k], :] = img[y[k]-1:ey[k], x[k]-1:ex[k], :]
                if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0:
                    tempimg[:, :, :, k] = cv2.resize(tmp, (48, 48), interpolation=cv2.INTER_AREA)
                else:
                    return np.empty()
            tempimg = (tempimg - 127.5) * 0.0078125
            tempimg1 = np.transpose(tempimg, (3, 1, 0, 2))
            out = self.onet(tempimg1)
            out0 = np.transpose(out[0])
            out1 = np.transpose(out[1])
            out2 = np.transpose(out[2])
            score = out2[1, :]
            points = out1
            ipass = np.where(score > threshold[2])
            points = points[:, ipass[0]]
            total_boxes = np.hstack([total_boxes[ipass[0], 0:4].copy(), np.expand_dims(score[ipass].copy(), 1)])
            mv = out0[:, ipass[0]]

            w = total_boxes[:, 2] - total_boxes[:, 0] + 1
            h = total_boxes[:, 3] - total_boxes[:, 1] + 1
            points[0:5, :] = np.tile(w, (5, 1)) * points[0:5, :] + np.tile(total_boxes[:, 0], (5, 1)) - 1
            points[5:10, :] = np.tile(h, (5, 1)) * points[5:10, :] + np.tile(total_boxes[:, 1], (5, 1)) - 1
            if total_boxes.shape[0] > 0:
                total_boxes = FacialImageProcessing.bbreg(total_boxes.copy(), np.transpose(mv))
                pick = FacialImageProcessing.nms(total_boxes.copy(), 0.7, 'Min')
                total_boxes = total_boxes[pick, :]
                points = points[:, pick]
        # elapsed = time.time() - t
        # print('4 phase elapsed=%f'%(elapsed))
        return total_boxes, points
```
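For orientation, a minimal sketch of calling the MTCNN detector directly, mirroring how detect_emotion uses it; test.jpg is a placeholder image and the bundled models/face_detection/mtcnn.pb graph is assumed to be present next to this module.

```python
import cv2
from facial_analysis import FacialImageProcessing

proc = FacialImageProcessing(print_stat=False)                  # loads models/face_detection/mtcnn.pb
img = cv2.cvtColor(cv2.imread("test.jpg"), cv2.COLOR_BGR2RGB)   # detector is fed RGB, as in detect_emotion
bounding_boxes, points = proc.detect_faces(img)
for bbox in bounding_boxes:
    x1, y1, x2, y2 = bbox[:4].astype(int)                       # fifth column is the detection score
    print(x1, y1, x2, y2, bbox[4])
proc.close()
```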
packages.txt ADDED
```
ffmpeg
```
requirements.txt ADDED
```
torch
torchvision
torchaudio
openai-whisper
gradio
moviepy
```
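Note that face_emotion_detection.py and facial_analysis.py also import tensorflow, cv2, matplotlib, and PIL, which are not pinned above; a local environment that runs every file in this commit would presumably need something closer to the following (package names are assumptions, not part of the commit):

```
torch
torchvision
torchaudio
openai-whisper
gradio
moviepy
tensorflow
opencv-python-headless
matplotlib
Pillow
```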
vid_to_wav.py ADDED
```python
import moviepy
import os
import glob
import moviepy.editor


def extract_audio(vid_filename):
    video = moviepy.editor.VideoFileClip(vid_filename)
    duration = video.duration

    audio = video.audio
    wav_file_name = ""
    if audio is not None:
        wav_file_name = vid_filename.replace('.mp4', '.wav')  # Replace .mp4 with .wav
        audio.write_audiofile(wav_file_name)

    return audio, wav_file_name, duration
```
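A minimal sketch of the helper in isolation, matching how transcribe() in app.py calls it; clip.mp4 is a placeholder for any local video with an audio track.

```python
from vid_to_wav import extract_audio

# Returns the moviepy audio clip, the path of the written .wav file, and the duration in seconds.
audio, wav_path, duration = extract_audio("clip.mp4")
print(wav_path, round(duration, 1), "sec")
```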