@ -2,7 +2,6 @@ import numpy as np
import cv2 as cv
import cv2 as cv
import argparse
import argparse
import os
import os
import soundfile as sf # Temporary import to load audio files
'''
'''
You can download the converted onnx model from https : / / drive . google . com / drive / folders / 1 wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6 ? usp = sharing
You can download the converted onnx model from https : / / drive . google . com / drive / folders / 1 wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6 ? usp = sharing
@ -399,11 +398,6 @@ def predict(features, net, decoder):
decoder : Decoder object
decoder : Decoder object
return : Predicted text
return : Predicted text
'''
'''
# This is a workaround https://github.com/opencv/opencv/issues/19091
# expanding 1 dimentions allows us to pass it to the network
# from python. This should be resolved in the future.
features = np . expand_dims ( features , axis = 3 )
# make prediction
# make prediction
net . setInput ( features )
net . setInput ( features )
output = net . forward ( )
output = net . forward ( )
@ -412,6 +406,63 @@ def predict(features, net, decoder):
prediction = decoder . decode ( output . squeeze ( 0 ) )
prediction = decoder . decode ( output . squeeze ( 0 ) )
return prediction [ 0 ]
return prediction [ 0 ]
def readAudioFile ( file , audioStream ) :
cap = cv . VideoCapture ( file )
samplingRate = 16000
params = np . asarray ( [ cv . CAP_PROP_AUDIO_STREAM , audioStream ,
cv . CAP_PROP_VIDEO_STREAM , - 1 ,
cv . CAP_PROP_AUDIO_DATA_DEPTH , cv . CV_32F ,
cv . CAP_PROP_AUDIO_SAMPLES_PER_SECOND , samplingRate
] )
cap . open ( file , cv . CAP_ANY , params )
if cap . isOpened ( ) is False :
print ( " Error : Can ' t read audio file: " , file , " with audioStream = " , audioStream )
return
audioBaseIndex = int ( cap . get ( cv . CAP_PROP_AUDIO_BASE_INDEX ) )
inputAudio = [ ]
while ( 1 ) :
if ( cap . grab ( ) ) :
frame = np . asarray ( [ ] )
frame = cap . retrieve ( frame , audioBaseIndex )
for i in range ( len ( frame [ 1 ] [ 0 ] ) ) :
inputAudio . append ( frame [ 1 ] [ 0 ] [ i ] )
else :
break
inputAudio = np . asarray ( inputAudio , dtype = np . float64 )
return inputAudio , samplingRate
def readAudioMicrophone ( microTime ) :
cap = cv . VideoCapture ( )
samplingRate = 16000
params = np . asarray ( [ cv . CAP_PROP_AUDIO_STREAM , 0 ,
cv . CAP_PROP_VIDEO_STREAM , - 1 ,
cv . CAP_PROP_AUDIO_DATA_DEPTH , cv . CV_32F ,
cv . CAP_PROP_AUDIO_SAMPLES_PER_SECOND , samplingRate
] )
cap . open ( 0 , cv . CAP_ANY , params )
if cap . isOpened ( ) is False :
print ( " Error: Can ' t open microphone " )
print ( " Error: problems with audio reading, check input arguments " )
return
audioBaseIndex = int ( cap . get ( cv . CAP_PROP_AUDIO_BASE_INDEX ) )
cvTickFreq = cv . getTickFrequency ( )
sysTimeCurr = cv . getTickCount ( )
sysTimePrev = sysTimeCurr
inputAudio = [ ]
while ( ( sysTimeCurr - sysTimePrev ) / cvTickFreq < microTime ) :
if ( cap . grab ( ) ) :
frame = np . asarray ( [ ] )
frame = cap . retrieve ( frame , audioBaseIndex )
for i in range ( len ( frame [ 1 ] [ 0 ] ) ) :
inputAudio . append ( frame [ 1 ] [ 0 ] [ i ] )
sysTimeCurr = cv . getTickCount ( )
else :
print ( " Error: Grab error " )
break
inputAudio = np . asarray ( inputAudio , dtype = np . float64 )
print ( " Number of samples: " , len ( inputAudio ) )
return inputAudio , samplingRate
if __name__ == ' __main__ ' :
if __name__ == ' __main__ ' :
# Computation backends supported by layers
# Computation backends supported by layers
@ -421,7 +472,10 @@ if __name__ == '__main__':
parser = argparse . ArgumentParser ( description = ' This script runs Jasper Speech recognition model ' ,
parser = argparse . ArgumentParser ( description = ' This script runs Jasper Speech recognition model ' ,
formatter_class = argparse . ArgumentDefaultsHelpFormatter )
formatter_class = argparse . ArgumentDefaultsHelpFormatter )
parser . add_argument ( ' --input_audio ' , type = str , required = True , help = ' Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines ' )
parser . add_argument ( ' --input_type ' , type = str , required = True , help = ' file or microphone ' )
parser . add_argument ( ' --micro_time ' , type = int , default = 15 , help = ' Duration of microphone work in seconds. Must be more than 6 sec ' )
parser . add_argument ( ' --input_audio ' , type = str , help = ' Path to input audio file. OR Path to a txt file with relative path to multiple audio files in different lines ' )
parser . add_argument ( ' --audio_stream ' , type = int , default = 0 , help = ' CAP_PROP_AUDIO_STREAM value ' )
parser . add_argument ( ' --show_spectrogram ' , action = ' store_true ' , help = ' Whether to show a spectrogram of the input audio. ' )
parser . add_argument ( ' --show_spectrogram ' , action = ' store_true ' , help = ' Whether to show a spectrogram of the input audio. ' )
parser . add_argument ( ' --model ' , type = str , default = ' jasper.onnx ' , help = ' Path to the onnx file of Jasper. default= " jasper.onnx " ' )
parser . add_argument ( ' --model ' , type = str , default = ' jasper.onnx ' , help = ' Path to the onnx file of Jasper. default= " jasper.onnx " ' )
parser . add_argument ( ' --output ' , type = str , help = ' Path to file where recognized audio transcript must be saved. Leave this to print on console. ' )
parser . add_argument ( ' --output ' , type = str , help = ' Path to file where recognized audio transcript must be saved. Leave this to print on console. ' )
@ -442,6 +496,9 @@ if __name__ == '__main__':
raise OSError ( " Input audio file does not exist " )
raise OSError ( " Input audio file does not exist " )
if not os . path . isfile ( args . model ) :
if not os . path . isfile ( args . model ) :
raise OSError ( " Jasper model file does not exist " )
raise OSError ( " Jasper model file does not exist " )
features = [ ]
if args . input_type == " file " :
if args . input_audio . endswith ( ' .txt ' ) :
if args . input_audio . endswith ( ' .txt ' ) :
with open ( args . input_audio ) as f :
with open ( args . input_audio ) as f :
content = f . readlines ( )
content = f . readlines ( )
@ -455,15 +512,19 @@ if __name__ == '__main__':
audio_file_paths = [ os . path . abspath ( x ) for x in audio_file_paths ]
audio_file_paths = [ os . path . abspath ( x ) for x in audio_file_paths ]
# Read audio Files
# Read audio Files
features = [ ]
try :
for audio_file_path in audio_file_paths :
for audio_file_path in audio_file_paths :
audio = sf . read ( audio_file_path )
audio = readAudioFile ( audio_file_path , args . audio_stream )
# If audio is stereo, just take one channel.
if audio is None :
X = audio [ 0 ] if audio [ 0 ] . ndim == 1 else audio [ 0 ] [ : , 0 ]
raise Exception ( f " Can ' t read { args . input_audio } . Try a different format " )
features . append ( X )
features . append ( audio [ 0 ] )
except :
elif args . input_type == " microphone " :
raise Exception ( f " Soundfile cannot read { args . input_audio } . Try a different format " )
# Read audio from microphone
audio = readAudioMicrophone ( args . micro_time )
if audio is None :
raise Exception ( f " Can ' t open microphone. Try a different format " )
features . append ( audio [ 0 ] )
else :
raise Exception ( f " input_type { args . input_type } doesn ' t exist. Please enter ' file ' or ' microphone ' " )
# Get Filterbank Features
# Get Filterbank Features
feature_extractor = FilterbankFeatures ( )
feature_extractor = FilterbankFeatures ( )