From f09a577ab5795451835526383be75f20f88f4c8f Mon Sep 17 00:00:00 2001
From: Sinitsina Maria
Date: Sun, 5 Dec 2021 17:58:44 +0300
Subject: [PATCH] add OpenCV audio reading

---
 samples/dnn/speech_recognition.py | 113 +++++++++++++++++++++++-------
 1 file changed, 87 insertions(+), 26 deletions(-)

diff --git a/samples/dnn/speech_recognition.py b/samples/dnn/speech_recognition.py
index 025607edab..7bc424b37c 100644
--- a/samples/dnn/speech_recognition.py
+++ b/samples/dnn/speech_recognition.py
@@ -2,7 +2,6 @@ import numpy as np
 import cv2 as cv
 import argparse
 import os
-import soundfile as sf # Temporary import to load audio files
 
 '''
 You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
@@ -399,11 +398,6 @@ def predict(features, net, decoder):
         decoder : Decoder object
         return : Predicted text
     '''
-    # This is a workaround https://github.com/opencv/opencv/issues/19091
-    # expanding 1 dimentions allows us to pass it to the network
-    # from python. This should be resolved in the future.
-    features = np.expand_dims(features,axis=3)
-
     # make prediction
     net.setInput(features)
     output = net.forward()
@@ -412,6 +406,63 @@ def predict(features, net, decoder):
     prediction = decoder.decode(output.squeeze(0))
     return prediction[0]
 
+def readAudioFile(file, audioStream):
+    cap = cv.VideoCapture(file)
+    samplingRate = 16000
+    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, audioStream,
+                         cv.CAP_PROP_VIDEO_STREAM, -1,
+                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
+                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
+                         ])
+    cap.open(file, cv.CAP_ANY, params)
+    if cap.isOpened() is False:
+        print("Error: Can't read audio file:", file, "with audioStream =", audioStream)
+        return
+    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+    inputAudio = []
+    while(1):
+        if (cap.grab()):
+            frame = np.asarray([])
+            frame = cap.retrieve(frame, audioBaseIndex)
+            for i in range(len(frame[1][0])):
+                inputAudio.append(frame[1][0][i])
+        else:
+            break
+    inputAudio = np.asarray(inputAudio, dtype=np.float64)
+    return inputAudio, samplingRate
+
+def readAudioMicrophone(microTime):
+    cap = cv.VideoCapture()
+    samplingRate = 16000
+    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
+                         cv.CAP_PROP_VIDEO_STREAM, -1,
+                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
+                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
+                         ])
+    cap.open(0, cv.CAP_ANY, params)
+    if cap.isOpened() is False:
+        print("Error: Can't open microphone")
+        print("Error: problems with audio reading, check input arguments")
+        return
+    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+    cvTickFreq = cv.getTickFrequency()
+    sysTimeCurr = cv.getTickCount()
+    sysTimePrev = sysTimeCurr
+    inputAudio = []
+    while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime):
+        if (cap.grab()):
+            frame = np.asarray([])
+            frame = cap.retrieve(frame, audioBaseIndex)
+            for i in range(len(frame[1][0])):
+                inputAudio.append(frame[1][0][i])
+            sysTimeCurr = cv.getTickCount()
+        else:
+            print("Error: Grab error")
+            break
+    inputAudio = np.asarray(inputAudio, dtype=np.float64)
+    print("Number of samples:", len(inputAudio))
+    return inputAudio, samplingRate
+
 if __name__ == '__main__':
 
     # Computation backends supported by layers
@@ -421,7 +472,10 @@ if __name__ == '__main__':
 
     parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model',
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--input_audio', type=str, required=True, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
+    parser.add_argument('--input_type', type=str, required=True, help='file or microphone')
+    parser.add_argument('--micro_time', type=int, default=15, help='Duration of microphone work in seconds. Must be more than 6 sec')
+    parser.add_argument('--input_audio', type=str, help='Path to input audio file. OR Path to a txt file with relative paths to multiple audio files on different lines')
+    parser.add_argument('--audio_stream', type=int, default=0, help='CAP_PROP_AUDIO_STREAM value')
     parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
     parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
     parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.')
@@ -442,28 +496,35 @@ if __name__ == '__main__':
         raise OSError("Input audio file does not exist")
     if not os.path.isfile(args.model):
         raise OSError("Jasper model file does not exist")
-    if args.input_audio.endswith('.txt'):
-        with open(args.input_audio) as f:
-            content = f.readlines()
-        content = [x.strip() for x in content]
-        audio_file_paths = content
-        for audio_file_path in audio_file_paths:
-            if not os.path.isfile(audio_file_path):
-                raise OSError("Audio file({audio_file_path}) does not exist")
-    else:
-        audio_file_paths = [args.input_audio]
-    audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
 
-    # Read audio Files
     features = []
-    try:
+    if args.input_type == "file":
+        if args.input_audio.endswith('.txt'):
+            with open(args.input_audio) as f:
+                content = f.readlines()
+            content = [x.strip() for x in content]
+            audio_file_paths = content
+            for audio_file_path in audio_file_paths:
+                if not os.path.isfile(audio_file_path):
+                    raise OSError(f"Audio file ({audio_file_path}) does not exist")
+        else:
+            audio_file_paths = [args.input_audio]
+        audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
+
+        # Read audio files
         for audio_file_path in audio_file_paths:
-            audio = sf.read(audio_file_path)
-            # If audio is stereo, just take one channel.
-            X = audio[0] if audio[0].ndim==1 else audio[0][:,0]
-            features.append(X)
-    except:
-        raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format")
+            audio = readAudioFile(audio_file_path, args.audio_stream)
+            if audio is None:
+                raise Exception(f"Can't read {args.input_audio}. Try a different format")
+            features.append(audio[0])
+    elif args.input_type == "microphone":
+        # Read audio from microphone
+        audio = readAudioMicrophone(args.micro_time)
+        if audio is None:
+            raise Exception("Can't open microphone")
+        features.append(audio[0])
+    else:
+        raise Exception(f"input_type {args.input_type} doesn't exist. Please enter 'file' or 'microphone'")
 
     # Get Filterbank Features
     feature_extractor = FilterbankFeatures()
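
Note for reviewers: below is a minimal standalone sketch of the cv.VideoCapture audio-reading
pattern this patch relies on, assuming OpenCV >= 4.5.5 built with an audio-capable backend
(e.g. MSMF on Windows or GStreamer). "sample.wav" is an illustrative path, not part of the patch.

    import numpy as np
    import cv2 as cv

    # Request the first audio stream, no video stream, float32 samples resampled to 16 kHz.
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, 16000])
    cap = cv.VideoCapture("sample.wav", cv.CAP_ANY, params)
    if not cap.isOpened():
        raise RuntimeError("Backend could not open the audio file")

    # Audio data is exposed as extra "frames" starting at CAP_PROP_AUDIO_BASE_INDEX;
    # each retrieved frame is a (channels x samples) array.
    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    samples = []
    while cap.grab():
        ok, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
        if ok:
            samples.extend(frame[0])
    print("read", len(samples), "samples")

With the patch applied, the sample can be run as, for example:

    python speech_recognition.py --input_type file --input_audio sample.wav --model jasper.onnx
    python speech_recognition.py --input_type microphone --micro_time 15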