Merge pull request #21197 from SinM9:speech_recognition_python

3 years ago · 547bb3b053
parent 7c2087cf05 f09a577ab5
commit 547bb3b053
1 changed files with 87 additions and 26 deletions
--- a/samples/dnn/speech_recognition.py
+++ b/samples/dnn/speech_recognition.py
@ -2,7 +2,6 @@ import numpy as np
 import cv2 as cv
 import argparse
 import os
 import soundfile as sf # Temporary import to load audio files
 '''
 You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
@ -399,11 +398,6 @@ def predict(features, net, decoder):
            decoder : Decoder object
        return : Predicted text
    '''
    # This is a workaround https://github.com/opencv/opencv/issues/19091
    # expanding 1 dimentions allows us to pass it to the network
    # from python. This should be resolved in the future.
    features = np.expand_dims(features,axis=3)
    # make prediction
    net.setInput(features)
    output = net.forward()
@ -412,6 +406,63 @@ def predict(features, net, decoder):
    prediction = decoder.decode(output.squeeze(0))
    return prediction[0]
 def readAudioFile(file, audioStream):
    cap = cv.VideoCapture(file)
    samplingRate = 16000
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, audioStream,
              cv.CAP_PROP_VIDEO_STREAM, -1,
              cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
              cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
              ])
    cap.open(file, cv.CAP_ANY, params)
    if cap.isOpened() is False:
        print("Error : Can't read audio file:", file, "with audioStream = ", audioStream)
        return
    audioBaseIndex = int (cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    inputAudio = []
    while(1):
        if (cap.grab()):
            frame = np.asarray([])
            frame = cap.retrieve(frame, audioBaseIndex)
            for i in range(len(frame[1][0])):
                inputAudio.append(frame[1][0][i])
        else:
            break
    inputAudio = np.asarray(inputAudio, dtype=np.float64)
    return inputAudio, samplingRate
 def readAudioMicrophone(microTime):
    cap = cv.VideoCapture()
    samplingRate = 16000
    params = np.asarray([cv.CAP_PROP_AUDIO_STREAM, 0,
              cv.CAP_PROP_VIDEO_STREAM, -1,
              cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_32F,
              cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND, samplingRate
              ])
    cap.open(0, cv.CAP_ANY, params)
    if cap.isOpened() is False:
        print("Error: Can't open microphone")
        print("Error: problems with audio reading, check input arguments")
        return
    audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
    cvTickFreq = cv.getTickFrequency()
    sysTimeCurr = cv.getTickCount()
    sysTimePrev = sysTimeCurr
    inputAudio = []
    while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime):
        if (cap.grab()):
            frame = np.asarray([])
            frame = cap.retrieve(frame, audioBaseIndex)
            for i in range(len(frame[1][0])):
                inputAudio.append(frame[1][0][i])
            sysTimeCurr = cv.getTickCount()
        else:
            print("Error: Grab error")
            break
    inputAudio = np.asarray(inputAudio, dtype=np.float64)
    print("Number of samples: ", len(inputAudio))
    return inputAudio, samplingRate
 if __name__ == '__main__':
    # Computation backends supported by layers
@ -421,7 +472,10 @@ if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='This script runs Jasper Speech recognition model',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--input_audio', type=str, required=True, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
+    parser.add_argument('--input_type', type=str, required=True, help='file or microphone')
    parser.add_argument('--micro_time', type=int, default=15, help='Duration of microphone work in seconds. Must be more than 6 sec')
    parser.add_argument('--input_audio', type=str, help='Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines')
    parser.add_argument('--audio_stream', type=int, default=0, help='CAP_PROP_AUDIO_STREAM value')
    parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
    parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
    parser.add_argument('--output', type=str, help='Path to file where recognized audio transcript must be saved. Leave this to print on console.')
@ -442,28 +496,35 @@ if __name__ == '__main__':
        raise OSError("Input audio file does not exist")
    if not os.path.isfile(args.model):
        raise OSError("Jasper model file does not exist")
    if args.input_audio.endswith('.txt'):
        with open(args.input_audio) as f:
            content = f.readlines()
            content = [x.strip() for x in content]
            audio_file_paths = content
        for audio_file_path in audio_file_paths:
            if not os.path.isfile(audio_file_path):
                raise OSError("Audio file({audio_file_path}) does not exist")
    else:
        audio_file_paths = [args.input_audio]
    audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
    # Read audio Files
    features = []
-    try:
+    if args.input_type == "file":
        if args.input_audio.endswith('.txt'):
            with open(args.input_audio) as f:
                content = f.readlines()
                content = [x.strip() for x in content]
                audio_file_paths = content
            for audio_file_path in audio_file_paths:
                if not os.path.isfile(audio_file_path):
                    raise OSError("Audio file({audio_file_path}) does not exist")
        else:
            audio_file_paths = [args.input_audio]
        audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
        # Read audio Files
        for audio_file_path in audio_file_paths:
-            audio = sf.read(audio_file_path)
+            audio = readAudioFile(audio_file_path, args.audio_stream)
-            # If audio is stereo, just take one channel.
+            if audio is None:
-            X = audio[0] if audio[0].ndim==1 else audio[0][:,0]
+                raise Exception(f"Can't read {args.input_audio}. Try a different format")
-            features.append(X)
+            features.append(audio[0])
-    except:
+    elif args.input_type == "microphone":
-        raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format")
+        # Read audio from microphone
        audio = readAudioMicrophone(args.micro_time)
        if audio is None:
            raise Exception(f"Can't open microphone. Try a different format")
        features.append(audio[0])
    else:
        raise Exception(f"input_type {args.input_type} doesn't exist. Please enter 'file' or 'microphone'")
    # Get Filterbank Features
    feature_extractor = FilterbankFeatures()