Merge pull request #21197 from SinM9:speech_recognition_python

pull/21343/head
Alexander Alekhin 3 years ago
commit 547bb3b053
  1. 113
      samples/dnn/speech_recognition.py

@ -2,7 +2,6 @@ import numpy as np
import cv2 as cv import cv2 as cv
import argparse import argparse
import os import os
import soundfile as sf # Temporary import to load audio files
''' '''
You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
@ -399,11 +398,6 @@ def predict(features, net, decoder):
decoder : Decoder object decoder : Decoder object
return : Predicted text return : Predicted text
''' '''
# This is a workaround https://github.com/opencv/opencv/issues/19091
# expanding 1 dimension allows us to pass it to the network
# from python. This should be resolved in the future.
features = np.expand_dims(features,axis=3)
# make prediction # make prediction
net.setInput(features) net.setInput(features)
output = net.forward() output = net.forward()
@ -412,6 +406,63 @@ def predict(features, net, decoder):
prediction = decoder.decode(output.squeeze(0)) prediction = decoder.decode(output.squeeze(0))
return prediction[0] return prediction[0]
def readAudioFile(file, audioStream):
    '''
    Decode one audio stream of a media file into a 1-D array of samples using cv.VideoCapture.
    file : Path to the audio (or video) file to read
    audioStream : Index of the audio stream to decode (passed as CAP_PROP_AUDIO_STREAM)
    return : Tuple (samples as a np.float64 array, sampling rate in Hz),
             or None when the file cannot be opened
    '''
    samplingRate = 16000
    # Video decoding is switched off (-1); audio is requested as 32-bit float
    # samples resampled to samplingRate.
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, audioStream,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
                         ])
    # Start from an empty capture (as readAudioMicrophone does): open() is the
    # call that applies the audio parameters, so opening the file a first time
    # in the constructor is redundant.
    cap = cv.VideoCapture()
    cap.open(file, cv.CAP_ANY, params)
    if not cap.isOpened():
        print("Error : Can't read audio file:", file, "with audioStream = ", audioStream)
        return

    chunks = []
    try:
        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
        while cap.grab():
            retrieved, data = cap.retrieve(np.asarray([]), audioBaseIndex)
            # Skip frames that carried no audio instead of failing on data[0]
            if not retrieved or data is None or len(data) == 0:
                continue
            # Row 0 holds the samples of this frame; copy it so the chunk does
            # not depend on any buffer owned by the capture.
            chunks.append(np.array(data[0], dtype=np.float64))
    finally:
        # Always give the file handle / decoder back, even if reading fails
        cap.release()

    if chunks:
        inputAudio = np.concatenate(chunks)
    else:
        inputAudio = np.asarray([], dtype=np.float64)
    return inputAudio, samplingRate
def readAudioMicrophone(microTime):
    '''
    Record audio from the default microphone (device 0) using cv.VideoCapture.
    microTime : Recording duration in seconds
    return : Tuple (samples as a np.float64 array, sampling rate in Hz),
             or None when the microphone cannot be opened
    '''
    samplingRate = 16000
    # Video capture is switched off (-1); audio is requested as 32-bit float
    # samples at samplingRate.
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
                         cv.CAP_PROP_VIDEO_STREAM, -1,
                         cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
                         cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
                         ])
    cap = cv.VideoCapture()
    cap.open(0, cv.CAP_ANY, params)
    if not cap.isOpened():
        print("Error: Can't open microphone")
        print("Error: problems with audio reading, check input arguments")
        return

    chunks = []
    try:
        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
        cvTickFreq = cv.getTickFrequency()
        sysTimePrev = cv.getTickCount()
        sysTimeCurr = sysTimePrev
        # Keep grabbing until microTime seconds of wall-clock time have elapsed
        while (sysTimeCurr - sysTimePrev) / cvTickFreq < microTime:
            if not cap.grab():
                print("Error: Grab error")
                break
            retrieved, data = cap.retrieve(np.asarray([]), audioBaseIndex)
            # Ignore frames that carried no audio instead of failing on data[0]
            if retrieved and data is not None and len(data) > 0:
                # Row 0 holds the samples of this frame; copy it so the chunk
                # does not depend on any buffer owned by the capture.
                chunks.append(np.array(data[0], dtype=np.float64))
            sysTimeCurr = cv.getTickCount()
    finally:
        # Always free the audio device, even if reading fails
        cap.release()

    if chunks:
        inputAudio = np.concatenate(chunks)
    else:
        inputAudio = np.asarray([], dtype=np.float64)
    print("Number of samples: ", len(inputAudio))
    return inputAudio, samplingRate
if __name__ == '__main__': if __name__ == '__main__':
# Computation backends supported by layers # Computation backends supported by layers
@ -421,7 +472,10 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model', parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model',
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--input_audio', type=str, required=True, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines') parser.add_argument('--input_type', type=str, required=True, help='file or microphone')
parser.add_argument('--micro_time', type=int, default=15, help='Duration of microphone work in seconds. Must be more than 6 sec')
parser.add_argument('--input_audio', type=str, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
parser.add_argument('--audio_stream', type=int, default=0, help='CAP_PROP_AUDIO_STREAM value')
parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.') parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"') parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.') parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.')
@ -442,28 +496,35 @@ if __name__ == '__main__':
raise OSError("Input audio file does not exist") raise OSError("Input audio file does not exist")
if not os.path.isfile(args.model): if not os.path.isfile(args.model):
raise OSError("Jasper model file does not exist") raise OSError("Jasper model file does not exist")
if args.input_audio.endswith('.txt'):
with open(args.input_audio) as f:
content = f.readlines()
content = [x.strip() for x in content]
audio_file_paths = content
for audio_file_path in audio_file_paths:
if not os.path.isfile(audio_file_path):
raise OSError("Audio file({audio_file_path}) does not exist")
else:
audio_file_paths = [args.input_audio]
audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
# Read audio Files
features = [] features = []
try: if args.input_type == "file":
if args.input_audio.endswith('.txt'):
with open(args.input_audio) as f:
content = f.readlines()
content = [x.strip() for x in content]
audio_file_paths = content
for audio_file_path in audio_file_paths:
if not os.path.isfile(audio_file_path):
raise OSError("Audio file({audio_file_path}) does not exist")
else:
audio_file_paths = [args.input_audio]
audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
# Read audio Files
for audio_file_path in audio_file_paths: for audio_file_path in audio_file_paths:
audio = sf.read(audio_file_path) audio = readAudioFile(audio_file_path, args.audio_stream)
# If audio is stereo, just take one channel. if audio is None:
X = audio[0] if audio[0].ndim==1 else audio[0][:,0] raise Exception(f"Can't read {args.input_audio}. Try a different format")
features.append(X) features.append(audio[0])
except: elif args.input_type == "microphone":
raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format") # Read audio from microphone
audio = readAudioMicrophone(args.micro_time)
if audio is None:
raise Exception(f"Can't open microphone. Try a different format")
features.append(audio[0])
else:
raise Exception(f"input_type {args.input_type} doesn't exist. Please enter 'file' or 'microphone'")
# Get Filterbank Features # Get Filterbank Features
feature_extractor = FilterbankFeatures() feature_extractor = FilterbankFeatures()

Loading…
Cancel
Save