Merge pull request #20934 from SinM9:spectrogram_samples

AudioIO: add spectrogram samples for C++/python
3 years ago · d8b1fc45aa
parent 295da7e5f3
commit d8b1fc45aa
2 changed files with 1875 additions and 0 deletions
--- a/samples/cpp/audio_spectrogram.cpp
+++ b/samples/cpp/audio_spectrogram.cpp
--- a/samples/python/audio_spectrogram.py
+++ b/samples/python/audio_spectrogram.py
@@ -0,0 +1,804 @@
+import numpy as np
+import cv2 as cv
+import math
+import argparse
+
+class AudioDrawing:
+    '''
+        Used for drawing audio graphics
+    '''
+    def __init__(self, args):
+        """Store the parsed command-line options on the instance.
+
+        ``args`` is the namespace built by the argument parser at the bottom of
+        this file.  The options are checked with initAndCheckArgs() before
+        anything else is done; when the check fails the process exits with
+        status 1.
+        """
+        # validate first, so a bad option is reported before any file lookup
+        if not self.initAndCheckArgs(args):
+            raise SystemExit(1)
+
+        self.inputType = args.inputType
+        self.draw = args.draw
+        self.graph = args.graph
+        # cv.samples.findFile() raises when the file cannot be found, so the
+        # path is only resolved when a file is really going to be read;
+        # microphone input must not depend on the default sample file
+        if args.inputType == "file":
+            self.audio = cv.samples.findFile(args.audio)
+        else:
+            self.audio = args.audio
+        self.audioStream = args.audioStream
+
+        # STFT parameters
+        self.windowType = args.windowType
+        self.windLen = args.windLen
+        self.overlap = args.overlap
+
+        self.enableGrid = args.enableGrid
+
+        # size of the output image
+        self.rows = args.rows
+        self.cols = args.cols
+
+        # number of divisions on each axis
+        self.xmarkup = args.xmarkup
+        self.ymarkup = args.ymarkup
+        self.zmarkup = args.zmarkup
+
+        # timing parameters
+        self.microTime = args.microTime
+        self.frameSizeTime = args.frameSizeTime
+        self.updateTime = args.updateTime
+        self.waitTime = args.waitTime
+
+
+    def Draw(self):
+        """Render the graph(s) selected by the options.
+
+        "dynamic" drawing is delegated to dynamicFile() / dynamicMicrophone().
+        "static" drawing reads the whole input, pads it with zeros up to a whole
+        number of seconds and shows the picture until a key is pressed.
+        """
+        if self.draw == "dynamic":
+            if self.inputType == "file":
+                self.dynamicFile(self.audio)
+            elif self.inputType == "microphone":
+                self.dynamicMicrophone()
+            return
+
+        if self.draw != "static":
+            return
+
+        if self.inputType == "file":
+            samplingRate, inputAudio = self.readAudioFile(self.audio)
+        elif self.inputType == "microphone":
+            samplingRate, inputAudio = self.readAudioMicrophone()
+
+        duration, leftover = divmod(len(inputAudio), samplingRate)
+
+        # the time grid is counted in whole seconds, so an incomplete
+        # last second is completed with zero samples
+        if leftover != 0:
+            padding = samplingRate - leftover
+            inputAudio = np.concatenate((inputAudio, np.zeros(padding)), axis=0)
+            duration += 1
+            print("Update duration of audio to full second with ",
+                padding, " zero samples")
+            print("New number of samples ", len(inputAudio))
+
+        if duration <= self.xmarkup:
+            self.xmarkup = duration + 1
+
+        # collect the requested pictures, amplitude first
+        layers = []
+        if self.graph in ("ampl", "ampl_and_spec"):
+            amplitudeImg = self.drawAmplitude(inputAudio)
+            layers.append(self.drawAmplitudeScale(amplitudeImg, inputAudio, samplingRate))
+        if self.graph in ("spec", "ampl_and_spec"):
+            stft = self.STFT(inputAudio)
+            spectrogramImg = self.drawSpectrogram(stft)
+            layers.append(self.drawSpectrogramColorbar(spectrogramImg, inputAudio, samplingRate, stft))
+
+        if not layers:
+            return
+
+        if len(layers) == 1:
+            picture = layers[0]
+        else:
+            picture = self.concatenateImages(layers[0], layers[1])
+
+        cv.imshow("Display window", picture)
+        cv.waitKey(0)
+
+
+    def readAudioFile(self, file):
+        """Read the whole audio stream of ``file``.
+
+        Returns ``(samplingRate, inputAudio)``: the value of
+        CAP_PROP_AUDIO_SAMPLES_PER_SECOND converted to int, and a 1-D array made
+        of the first row of every retrieved audio frame.
+        The process exits when the file cannot be opened.
+        """
+        # the capture is opened once, with the audio parameters
+        cap = cv.VideoCapture()
+
+        params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream,
+                cv.CAP_PROP_VIDEO_STREAM, -1,
+                cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S]
+        params = np.asarray(params)
+
+        cap.open(file, cv.CAP_ANY, params)
+        if not cap.isOpened():
+            print("Error : Can't read audio file: '", self.audio, "' with audioStream = ", self.audioStream)
+            print("Error: problems with audio reading, check input arguments")
+            exit()
+        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
+
+        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
+        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
+        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
+
+        # keep whole chunks and join them once at the end instead of
+        # appending the samples one by one
+        chunks = []
+        while cap.grab():
+            retrieved, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
+            if retrieved:
+                # only the first row of the audio frame is used
+                chunks.append(frame[0].copy())
+
+        inputAudio = np.concatenate(chunks) if chunks else np.asarray([])
+        print("Number of samples: ", len(inputAudio))
+        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        cap.release()
+        return samplingRate, inputAudio
+
+
+    def readAudioMicrophone(self):
+        """Record audio from capture index 0 for ``self.microTime`` seconds.
+
+        Returns ``(samplingRate, inputAudio)``: the value of
+        CAP_PROP_AUDIO_SAMPLES_PER_SECOND converted to int, and a 1-D array made
+        of the first row of every retrieved audio frame.
+        The process exits when the microphone cannot be opened; recording stops
+        early when a grab fails.
+        """
+        cap = cv.VideoCapture()
+
+        params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1]
+        params = np.asarray(params)
+
+        cap.open(0, cv.CAP_ANY, params)
+        if not cap.isOpened():
+            print("Error: Can't open microphone")
+            print("Error: problems with audio reading, check input arguments")
+            exit()
+        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
+
+        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
+        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
+        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
+
+        cvTickFreq = cv.getTickFrequency()
+        sysTimeCurr = cv.getTickCount()
+        sysTimePrev = sysTimeCurr
+
+        # keep whole chunks and join them once at the end instead of
+        # appending the samples one by one
+        chunks = []
+        while (sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime:
+            if not cap.grab():
+                print("Error: Grab error")
+                break
+            retrieved, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
+            if retrieved:
+                # only the first row of the audio frame is used
+                chunks.append(frame[0].copy())
+            sysTimeCurr = cv.getTickCount()
+
+        inputAudio = np.concatenate(chunks) if chunks else np.asarray([])
+        print("Number of samples: ", len(inputAudio))
+        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        cap.release()
+
+        return samplingRate, inputAudio
+
+
+    def drawAmplitude(self, inputAudio):
+        """Draw the waveform of ``inputAudio`` on a white background.
+
+        The samples are reduced to at most 40000 points, scaled by their largest
+        absolute value so that they fit a 500-row canvas, joined with line
+        segments, and the canvas is finally resized with dsize (900, 400).
+        Returns the resulting 3-channel uint8 image.
+        """
+        color = (247, 111, 87)
+        thickness = 5
+        frameVectorRows = 500
+        middle = frameVectorRows // 2
+
+        # usually the input data is too big, so it is necessary
+        # to reduce size using interpolation of data
+        frameVectorCols = min(40000, len(inputAudio))
+
+        # white background
+        img = np.full((frameVectorRows, frameVectorCols, 3), 255, np.uint8)
+
+        audio = cv.resize(inputAudio, (1, frameVectorCols), interpolation=cv.INTER_LINEAR)
+        # continue in float64: a 16-bit sample multiplied by ``middle``
+        # does not fit into int16
+        samples = np.reshape(audio, (-1)).astype(np.float64)
+
+        # normalization data by maximum element
+        maxElem = int(max(abs(samples.min()), abs(samples.max())))
+
+        # if all data values are zero (silence)
+        if maxElem == 0:
+            maxElem = 1
+
+        # row of every point: the middle row is zero amplitude, positive values go up
+        rowOfSample = middle - np.floor_divide(samples * middle, maxElem)
+        ys = rowOfSample.astype(int).tolist()
+
+        for x in range(1, frameVectorCols):
+            cv.line(img, (x - 1, ys[x - 1]), (x, ys[x]), color, thickness)
+
+        img = cv.resize(img, (900, 400), interpolation=cv.INTER_AREA)
+        return img
+
+
+    def drawAmplitudeScale(self, inputImg, inputAudio, samplingRate, xmin=None, xmax=None):
+        """Frame the amplitude graph with a time axis (x) and an amplitude axis (y).
+
+        inputImg     - picture to be framed (as produced by drawAmplitude())
+        inputAudio   - samples; their minimum and maximum label the y axis
+        samplingRate - samples per second, used for the default ``xmax``
+        xmin, xmax   - time range written on the x axis; they default to 0 and
+                       len(inputAudio) / samplingRate
+
+        Returns the framed picture resized with dsize (self.cols, self.rows).
+        """
+        # parameters for the new image size
+        preCol = 100
+        aftCol = 100
+        preLine = 40
+        aftLine = 50
+
+        frameVectorRows = inputImg.shape[0]
+        frameVectorCols = inputImg.shape[1]
+
+        totalRows = preLine + frameVectorRows + aftLine
+        totalCols = preCol + frameVectorCols + aftCol
+
+        # white background with the graph pasted inside the margins
+        imgTotal = np.full((totalRows, totalCols, 3), 255, np.uint8)
+        imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg
+
+        # calculating values on x axis
+        if xmin is None:
+            xmin = 0
+        if xmax is None:
+            xmax = len(inputAudio) / samplingRate
+
+        if xmax > self.xmarkup:
+            xList = np.linspace(xmin, xmax, self.xmarkup).astype(int)
+        else:
+            # this case is used to display a dynamic update
+            tmp = np.arange(xmin, xmax, 1).astype(int) + 1
+            xList = np.concatenate((np.zeros(self.xmarkup - len(tmp)), tmp[:]), axis=None)
+
+        # calculating values on y axis
+        ymin = np.min(inputAudio)
+        ymax = np.max(inputAudio)
+        yList = np.linspace(ymin, ymax, self.ymarkup)
+
+        # parameters for layout drawing
+        textThickness = 1
+        gridThickness = 1
+        gridColor = (0, 0, 0)
+        textColor = (0, 0, 0)
+        font = cv.FONT_HERSHEY_SIMPLEX
+        fontScale = 0.5
+
+        # horizontal axis under the graph
+        cv.line(imgTotal, (preCol, totalRows - aftLine),
+                (preCol + frameVectorCols, totalRows - aftLine),
+                gridColor, gridThickness)
+        # vertical axis for amplitude
+        cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows),
+                gridColor, gridThickness)
+
+        # parameters for layout calculation
+        serifSize = 10
+        indentDownX = serifSize * 2
+        indentDownY = serifSize // 2
+        indentLeftX = serifSize
+        indentLeftY = 2 * preCol // 3
+
+        # drawing layout for x axis
+        # xmarkup can be lowered to 1 by the dynamic modes (one-second window),
+        # so the number of intervals is kept at least 1 to avoid dividing by zero
+        numX = frameVectorCols // max(self.xmarkup - 1, 1)
+        for i, xValue in enumerate(xList):
+            a1 = preCol + i * numX
+            a2 = frameVectorRows + preLine
+            b1 = a1
+            b2 = a2 + serifSize
+            if self.enableGrid:
+                cv.line(imgTotal, (a1, a2), (a1, preLine), gridColor, gridThickness)
+            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
+            cv.putText(imgTotal, str(int(xValue)), (b1 - indentLeftX, b2 + indentDownX),
+                    font, fontScale, textColor, textThickness)
+
+        # drawing layout for y axis
+        numY = frameVectorRows // max(self.ymarkup - 1, 1)
+        for i, yValue in enumerate(yList):
+            a1 = preCol
+            a2 = totalRows - aftLine - i * numY
+            b1 = preCol - serifSize
+            b2 = a2
+            if self.enableGrid:
+                cv.line(imgTotal, (a1, a2), (preCol + frameVectorCols, a2), gridColor, gridThickness)
+            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
+            cv.putText(imgTotal, str(int(yValue)), (b1 - indentLeftY, b2 + indentDownY),
+                    font, fontScale, textColor, textThickness)
+        imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA)
+        return imgTotal
+
+
+    def STFT(self, inputAudio):
+        """
+        The Short-time Fourier transform (STFT), is a Fourier-related transform used to determine
+        the sinusoidal frequency and phase content of local sections of a signal as it changes over
+        time.
+        In practice, the procedure for computing STFTs is to divide a longer time signal into
+        shorter segments of equal length and then compute the Fourier transform separately on each
+        shorter segment. This reveals the Fourier spectrum on each shorter segment. One then usually
+        plots the changing spectra as a function of time, known as a spectrogram or waterfall plot.
+
+        https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+
+        Segments are ``self.windLen`` samples long and start every
+        ``self.windLen - self.overlap`` samples; the last ones are zero-padded.
+        Returns a 2-D array (frequency bin x segment) holding 10 * log10 of the
+        magnitudes of the first half of each spectrum; zero magnitudes stay 0.
+        """
+
+        time_step = self.windLen - self.overlap
+
+        # window coefficients, written for n = 1-N, 3-N, ..., N-1
+        # (symmetric around the centre of the segment)
+        # https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
+        window = None
+        if self.windowType in ("Hann", "Hamming"):
+            n = np.arange(1 - self.windLen, self.windLen, 2, dtype=np.float64)
+            if self.windLen > 1:
+                phase = np.pi * n / (self.windLen - 1)
+            else:
+                # a one-sample window has no shape: avoid dividing by zero
+                phase = np.zeros_like(n)
+            if self.windowType == "Hann":
+                window = 0.5 + 0.5 * np.cos(phase)
+            else:
+                window = 0.53836 + 0.46164 * np.cos(phase)
+
+        stft = []
+        for index in np.arange(0, len(inputAudio), time_step).astype(int):
+
+            section = inputAudio[index:index + self.windLen]
+            # complete the last segments with zeros; the result is a new float64 array
+            zeroArray = np.zeros(self.windLen - len(section))
+            section = np.concatenate((section, zeroArray), axis=None)
+
+            if window is not None:
+                section = section * window
+
+            dst = cv.dft(section, flags=cv.DFT_COMPLEX_OUTPUT)
+            # one (real, imaginary) pair per frequency bin
+            spectrum = np.reshape(dst, (-1, 2))
+            # we need only the first part of the spectrum, the second part is symmetrical
+            half = spectrum[:len(spectrum) // 2]
+            stft.append(np.hypot(half[:, 0], half[:, 1]))
+
+        stft = np.array(stft).transpose()
+        # convert elements to the decibel scale
+        np.log10(stft, out=stft, where=(stft != 0.))
+        return 10 * stft
+
+
+    def drawSpectrogram(self, stft):
+        """Convert the array returned by STFT() into a colour picture.
+
+        The values are divided by the largest absolute value (cv.NORM_INF),
+        scaled to 0..255, flipped so that row 0 of ``stft`` ends up at the
+        bottom, coloured with COLORMAP_INFERNO and resized with dsize (900, 400).
+        Values below zero are clipped to 0 because they cannot be stored in an
+        8-bit pixel.
+        """
+
+        # Normalization of image values from 0 to 255 to get more contrast image
+        # and this normalization will be taken into account in the scale drawing
+        colormapImageRows = 255
+
+        stftMat = cv.normalize(stft, None, 1.0, 0.0, cv.NORM_INF)
+
+        # truncate to whole levels, keeping every value inside the uint8 range
+        levels = np.clip(stftMat * colormapImageRows, 0, colormapImageRows).astype(np.uint8)
+
+        # first row of stft goes to the bottom of the picture;
+        # the same level is written to the three channels
+        imgSpec = np.repeat(levels[::-1][:, :, np.newaxis], 3, axis=2)
+
+        imgSpec = cv.applyColorMap(imgSpec, cv.COLORMAP_INFERNO)
+        imgSpec = cv.resize(imgSpec, (900, 400), interpolation=cv.INTER_LINEAR)
+        return imgSpec
+
+
+    def drawSpectrogramColorbar(self, inputImg, inputAudio, samplingRate, stft, xmin=None, xmax=None):
+        """Frame the spectrogram with its axes and a colour bar.
+
+        x axis for time, y axis for frequencies (0 .. samplingRate / 2),
+        z axis for magnitudes of frequencies shown by color scale
+        (np.min(stft) .. np.max(stft)).
+
+        inputImg     - picture to be framed (as produced by drawSpectrogram())
+        inputAudio   - samples, used for the default ``xmax``
+        samplingRate - samples per second
+        stft         - array the picture was drawn from
+        xmin, xmax   - time range written on the x axis; they default to 0 and
+                       len(inputAudio) / samplingRate
+
+        Returns the framed picture resized with dsize (self.cols, self.rows).
+        """
+
+        # parameters for the new image size
+        preCol = 100
+        aftCol = 100
+        preLine = 40
+        aftLine = 50
+        colColor = 20
+        ind_col = 20
+
+        frameVectorRows = inputImg.shape[0]
+        frameVectorCols = inputImg.shape[1]
+
+        totalRows = preLine + frameVectorRows + aftLine
+        totalCols = preCol + frameVectorCols + aftCol + colColor
+
+        # white background with the spectrogram pasted inside the margins
+        imgTotal = np.full((totalRows, totalCols, 3), 255, np.uint8)
+        imgTotal[preLine: preLine + frameVectorRows, preCol: preCol + frameVectorCols] = inputImg
+
+        # colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0,
+        # so here colorbar has values from 255 to 0
+        colorArrSize = 256
+        ramp = np.arange(colorArrSize)[::-1].astype(np.uint8)
+        imgColorBar = np.repeat(ramp[:, np.newaxis], colColor, axis=1)
+
+        imgColorBar = cv.applyColorMap(imgColorBar, cv.COLORMAP_INFERNO)
+        imgColorBar = cv.resize(imgColorBar, (colColor, frameVectorRows), interpolation=cv.INTER_AREA)
+
+        barLeft = preCol + frameVectorCols + ind_col
+        imgTotal[preLine: preLine + frameVectorRows, barLeft: barLeft + colColor] = imgColorBar
+
+        # calculating values on x axis
+        if xmin is None:
+            xmin = 0
+        if xmax is None:
+            xmax = len(inputAudio) / samplingRate
+        if xmax > self.xmarkup:
+            xList = np.linspace(xmin, xmax, self.xmarkup).astype(int)
+        else:
+            # this case is used to display a dynamic update
+            tmpXList = np.arange(xmin, xmax, 1).astype(int) + 1
+            xList = np.concatenate((np.zeros(self.xmarkup - len(tmpXList)), tmpXList[:]), axis=None)
+
+        # calculating values on y axis
+        # according to the Nyquist sampling theorem,
+        # signal should posses frequencies equal to half of sampling rate
+        ymin = 0
+        ymax = int(samplingRate / 2.)
+        yList = np.linspace(ymin, ymax, self.ymarkup).astype(int)
+
+        # calculating values on z axis
+        zList = np.linspace(np.min(stft), np.max(stft), self.zmarkup)
+
+        # parameters for layout drawing
+        textThickness = 1
+        textColor = (0, 0, 0)
+        gridThickness = 1
+        gridColor = (0, 0, 0)
+        font = cv.FONT_HERSHEY_SIMPLEX
+        fontScale = 0.5
+
+        serifSize = 10
+        indentDownX = serifSize * 2
+        indentDownY = serifSize // 2
+        indentLeftX = serifSize
+        indentLeftY = 2 * preCol // 3
+
+        # horizontal axis
+        cv.line(imgTotal, (preCol, totalRows - aftLine), (preCol + frameVectorCols, totalRows - aftLine),
+                gridColor, gridThickness)
+        # vertical axis
+        cv.line(imgTotal, (preCol, preLine), (preCol, preLine + frameVectorRows),
+                gridColor, gridThickness)
+
+        # drawing layout for x axis
+        # xmarkup can be lowered to 1 by the dynamic modes (one-second window),
+        # so the number of intervals is kept at least 1 to avoid dividing by zero
+        numX = frameVectorCols // max(self.xmarkup - 1, 1)
+        for i, xValue in enumerate(xList):
+            a1 = preCol + i * numX
+            a2 = frameVectorRows + preLine
+            b1 = a1
+            b2 = a2 + serifSize
+            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
+            cv.putText(imgTotal, str(int(xValue)), (b1 - indentLeftX, b2 + indentDownX),
+                    font, fontScale, textColor, textThickness)
+
+        # drawing layout for y axis
+        numY = frameVectorRows // max(self.ymarkup - 1, 1)
+        for i, yValue in enumerate(yList):
+            a1 = preCol
+            a2 = totalRows - aftLine - i * numY
+            b1 = preCol - serifSize
+            b2 = a2
+            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
+            cv.putText(imgTotal, str(int(yValue)), (b1 - indentLeftY, b2 + indentDownY),
+                    font, fontScale, textColor, textThickness)
+
+        # drawing layout for z axis
+        numZ = frameVectorRows // max(self.zmarkup - 1, 1)
+        for i, zValue in enumerate(zList):
+            a1 = barLeft + colColor
+            a2 = totalRows - aftLine - i * numZ
+            b1 = a1 + serifSize
+            b2 = a2
+            cv.line(imgTotal, (a1, a2), (b1, b2), gridColor, gridThickness)
+            cv.putText(imgTotal, str(int(zValue)), (b1 + 10, b2 + indentDownY),
+                    font, fontScale, textColor, textThickness)
+        imgTotal = cv.resize(imgTotal, (self.cols, self.rows), interpolation=cv.INTER_AREA)
+        return imgTotal
+
+
+    def concatenateImages(self, img1, img2):
+        """Stack two 3-channel images vertically on a white canvas.
+
+        ``img1`` is placed at the top and ``img2`` directly below it, both
+        aligned to the left edge.  The canvas is as wide as the wider image.
+        Returns the new uint8 image.
+        """
+        topRows = img1.shape[0]
+        totalRows = topRows + img2.shape[0]
+        totalCols = max(img1.shape[1], img2.shape[1])
+
+        # if images columns do not match, the difference is filled in white
+        imgTotal = np.full((totalRows, totalCols, 3), 255, np.uint8)
+
+        imgTotal[:topRows, :img1.shape[1]] = img1
+        # the second image starts right after the last row of the first one
+        imgTotal[topRows:, :img2.shape[1]] = img2
+
+        return imgTotal
+
+
+    def dynamicFile(self, file):
+        """Show the graph(s) of ``file`` in a window that slides over the audio.
+
+        Samples are collected until ``self.updateTime`` seconds are available;
+        the displayed section (``self.frameSizeTime`` seconds, initially zeros)
+        is then shifted by that amount and the selected graph(s) are redrawn.
+        ``self.xmarkup`` is lowered to ``self.frameSizeTime`` when the section is
+        not longer than the number of x divisions.
+        Returns without drawing when the file cannot be opened.
+        """
+        # the capture is opened once, with the audio parameters
+        cap = cv.VideoCapture()
+        params = [cv.CAP_PROP_AUDIO_STREAM, self.audioStream,
+                cv.CAP_PROP_VIDEO_STREAM, -1,
+                cv.CAP_PROP_AUDIO_DATA_DEPTH, cv.CV_16S]
+        params = np.asarray(params)
+
+        cap.open(file, cv.CAP_ANY, params)
+        if not cap.isOpened():
+            print("ERROR! Can't to open file")
+            return
+
+        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
+        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+
+        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
+        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
+        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
+
+        step = int(self.updateTime * samplingRate)
+        frameSize = int(self.frameSizeTime * samplingRate)
+        # since the dimensional grid is counted in integer seconds,
+        # if duration of audio frame is less than xmarkup, to avoid an incorrect display,
+        # xmarkup will be taken equal to duration
+        if self.frameSizeTime <= self.xmarkup:
+            self.xmarkup = self.frameSizeTime
+
+        withAmplitude = self.graph in ("ampl", "ampl_and_spec")
+        withSpectrogram = self.graph in ("spec", "ampl_and_spec")
+        windowNames = {"ampl": "Display amplitude graph",
+                       "spec": "Display spectrogram",
+                       "ampl_and_spec": "Display amplitude graph and spectrogram"}
+
+        # samples received but not shown yet, and the section currently shown;
+        # both stay numpy arrays, so sliding the window needs no list round trip
+        pending = np.zeros(0, dtype=np.int16)
+        section = np.zeros(frameSize, dtype=np.int16)
+        currentSamples = 0
+
+        while cap.grab():
+            retrieved, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
+            if not retrieved:
+                continue
+
+            # only the first row of the audio frame is used
+            pending = np.concatenate((pending, frame[0]))
+            if len(pending) < step:
+                continue
+
+            # drop the oldest ``step`` samples and append the newest ones
+            currentSamples += step
+            section = np.concatenate((section[step:], pending[:step]))
+            pending = pending[step:]
+
+            if currentSamples < frameSize:
+                xmin = 0
+            else:
+                xmin = (currentSamples - frameSize) / samplingRate + 1
+            xmax = currentSamples / samplingRate
+
+            layers = []
+            if withAmplitude:
+                imgAmplitude = self.drawAmplitude(section)
+                layers.append(self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax))
+            if withSpectrogram:
+                stft = self.STFT(section)
+                imgSpec = self.drawSpectrogram(stft)
+                layers.append(self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax))
+            if not layers:
+                continue
+
+            if len(layers) == 1:
+                picture = layers[0]
+            else:
+                picture = self.concatenateImages(layers[0], layers[1])
+            cv.imshow(windowNames[self.graph], picture)
+            cv.waitKey(self.waitTime)
+
+        cap.release()
+
+
+    def dynamicMicrophone(self):
+        """Show the graph(s) of the microphone input in a sliding window.
+
+        Audio is read from capture index 0 for ``self.microTime`` seconds.
+        Every time ``self.updateTime`` seconds of samples are available the
+        displayed section (``self.frameSizeTime`` seconds, initially zeros) is
+        shifted by that amount and the selected graph(s) are redrawn.
+        ``self.xmarkup`` is set to ``self.frameSizeTime`` and ``self.waitTime``
+        to ``self.updateTime * 1000``.
+        Returns without drawing when the microphone cannot be opened.
+        """
+        cap = cv.VideoCapture()
+        params = [cv.CAP_PROP_AUDIO_STREAM, 0, cv.CAP_PROP_VIDEO_STREAM, -1]
+        params = np.asarray(params)
+
+        cap.open(0, cv.CAP_ANY, params)
+        if not cap.isOpened():
+            # the device opened here is the microphone, not a file
+            print("ERROR! Can't open microphone")
+            return
+        audioBaseIndex = int(cap.get(cv.CAP_PROP_AUDIO_BASE_INDEX))
+        numberOfChannels = int(cap.get(cv.CAP_PROP_AUDIO_TOTAL_CHANNELS))
+
+        print("CAP_PROP_AUDIO_DATA_DEPTH: ", str((int(cap.get(cv.CAP_PROP_AUDIO_DATA_DEPTH)))))
+        print("CAP_PROP_AUDIO_SAMPLES_PER_SECOND: ", cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+        print("CAP_PROP_AUDIO_TOTAL_CHANNELS: ", numberOfChannels)
+        print("CAP_PROP_AUDIO_TOTAL_STREAMS: ", cap.get(cv.CAP_PROP_AUDIO_TOTAL_STREAMS))
+
+        samplingRate = int(cap.get(cv.CAP_PROP_AUDIO_SAMPLES_PER_SECOND))
+
+        step = int(self.updateTime * samplingRate)
+        frameSize = int(self.frameSizeTime * samplingRate)
+        self.xmarkup = self.frameSizeTime
+
+        withAmplitude = self.graph in ("ampl", "ampl_and_spec")
+        withSpectrogram = self.graph in ("spec", "ampl_and_spec")
+        windowNames = {"ampl": "Display amplitude graph",
+                       "spec": "Display spectrogram",
+                       "ampl_and_spec": "Display amplitude graph and spectrogram"}
+
+        currentSamples = 0
+
+        # samples received but not shown yet, and the section currently shown;
+        # both stay numpy arrays, so sliding the window needs no list round trip
+        pending = np.zeros(0, dtype=np.int16)
+        section = np.zeros(frameSize, dtype=np.int16)
+
+        cvTickFreq = cv.getTickFrequency()
+        sysTimeCurr = cv.getTickCount()
+        sysTimePrev = sysTimeCurr
+        self.waitTime = self.updateTime * 1000
+        while (sysTimeCurr - sysTimePrev) / cvTickFreq < self.microTime:
+            if not cap.grab():
+                break
+
+            retrieved, frame = cap.retrieve(np.asarray([]), audioBaseIndex)
+            # refresh the clock on every pass so the recording time limit always applies
+            sysTimeCurr = cv.getTickCount()
+            if not retrieved:
+                continue
+
+            # only the first row of the audio frame is used
+            pending = np.concatenate((pending, frame[0]))
+            if len(pending) < step:
+                continue
+
+            # drop the oldest ``step`` samples and append the newest ones
+            currentSamples += step
+            section = np.concatenate((section[step:], pending[:step]))
+            pending = pending[step:]
+
+            if currentSamples < frameSize:
+                xmin = 0
+            else:
+                xmin = (currentSamples - frameSize) / samplingRate + 1
+            xmax = currentSamples / samplingRate
+
+            layers = []
+            if withAmplitude:
+                imgAmplitude = self.drawAmplitude(section)
+                layers.append(self.drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax))
+            if withSpectrogram:
+                stft = self.STFT(section)
+                imgSpec = self.drawSpectrogram(stft)
+                layers.append(self.drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax))
+            if not layers:
+                continue
+
+            if len(layers) == 1:
+                picture = layers[0]
+            else:
+                picture = self.concatenateImages(layers[0], layers[1])
+            cv.imshow(windowNames[self.graph], picture)
+            cv.waitKey(self.waitTime)
+
+        cap.release()
+
+
+    def initAndCheckArgs(self, args):
+        """Check the command-line options.
+
+        Prints a message for the first invalid option and returns False;
+        returns True when every option is acceptable.
+        """
+        if args.inputType not in ("file", "microphone"):
+            print("Error: ", args.inputType, " input method doesnt exist")
+            return False
+        if args.draw not in ("static", "dynamic"):
+            print("Error: ", args.draw, " draw type doesnt exist")
+            return False
+        if args.graph not in ("ampl", "spec", "ampl_and_spec"):
+            print("Error: ", args.graph, " type of graph doesnt exist")
+            return False
+        if args.windowType not in ("Rect", "Hann", "Hamming"):
+            print("Error: ", args.windowType, " type of window doesnt exist")
+            return False
+        if args.windLen <= 0:
+            print("Error: windLen = ", args.windLen, " - incorrect value. Must be > 0")
+            return False
+        if args.overlap <= 0:
+            print("Error: overlap = ", args.overlap, " - incorrect value. Must be > 0")
+            return False
+        # STFT() advances by windLen - overlap samples, which has to be positive
+        if args.overlap >= args.windLen:
+            print("Error: overlap = ", args.overlap, " - incorrect value. Must be < windLen = ", args.windLen)
+            return False
+        if args.rows <= 0:
+            print("Error: rows = ", args.rows, " - incorrect value. Must be > 0")
+            return False
+        if args.cols <= 0:
+            print("Error: cols = ", args.cols, " - incorrect value. Must be > 0")
+            return False
+        if args.xmarkup < 2:
+            print("Error: xmarkup = ", args.xmarkup, " - incorrect value. Must be >= 2")
+            return False
+        if args.ymarkup < 2:
+            print("Error: ymarkup = ", args.ymarkup, " - incorrect value. Must be >= 2")
+            return False
+        if args.zmarkup < 2:
+            print("Error: zmarkup = ", args.zmarkup, " - incorrect value. Must be >= 2")
+            return False
+        if args.microTime <= 0:
+            print("Error: microTime = ", args.microTime, " - incorrect value. Must be > 0")
+            return False
+        if args.frameSizeTime <= 0:
+            print("Error: frameSizeTime = ", args.frameSizeTime, " - incorrect value. Must be > 0")
+            return False
+        if args.updateTime <= 0:
+            print("Error: updateTime = ", args.updateTime, " - incorrect value. Must be > 0")
+            return False
+        if args.waitTime < 0:
+            print("Error: waitTime = ", args.waitTime, " - incorrect value. Must be >= 0")
+            return False
+        return True
+
+
+if __name__ == "__main__":
+
+    def parseSwitch(value):
+        # argparse's type=bool turns every non-empty string (even "False" or
+        # "off") into True, so the on/off text is interpreted explicitly
+        text = str(value).strip().lower()
+        if text in ("1", "true", "yes", "on"):
+            return True
+        if text in ("", "0", "false", "no", "off"):
+            return False
+        raise argparse.ArgumentTypeError("expected on/off (or true/false), got '%s'" % value)
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
+                                     description='''this sample draws a volume graph and/or spectrogram of audio/video files and microphone\nDefault usage: ./Spectrogram.exe''')
+
+    # input selection and type of drawing
+    parser.add_argument("-i", "--inputType", dest="inputType", type=str, default="file", help="file or microphone")
+    parser.add_argument("-d", "--draw", dest="draw", type=str, default="static",
+                        help="type of drawing: static - for plotting graph(s) across the entire input audio; dynamic - for plotting graph(s) in a time-updating window")
+    parser.add_argument("-g", "--graph", dest="graph", type=str, default="ampl_and_spec",
+                        help="type of graph: amplitude graph or/and spectrogram. Please use tags below : ampl - draw the amplitude graph; spec - draw the spectrogram; ampl_and_spec - draw the amplitude graph and spectrogram on one image under each other")
+
+    parser.add_argument("-a", "--audio", dest="audio", type=str, default='Megamind.avi',
+                        help="name and path to file")
+    parser.add_argument("-s", "--audioStream", dest="audioStream", type=int, default=1,
+                        help=" CAP_PROP_AUDIO_STREAM value")
+
+    # STFT parameters
+    parser.add_argument("-t", '--windowType', dest="windowType", type=str, default="Rect",
+                        help="type of window for STFT. Please use tags below : Rect/Hann/Hamming")
+    parser.add_argument("-l", '--windLen', dest="windLen", type=int, default=256, help="size of window for STFT")
+    parser.add_argument("-o", '--overlap', dest="overlap", type=int, default=128, help="overlap of windows for STFT")
+
+    parser.add_argument("-gd", '--grid', dest="enableGrid", type=parseSwitch, default=False, help="grid on amplitude graph(on/off)")
+
+    # size of the output image
+    parser.add_argument("-r", '--rows', dest="rows", type=int, default=400, help="rows of output image")
+    parser.add_argument("-c", '--cols', dest="cols", type=int, default=900, help="cols of output image")
+
+    # number of divisions on the axes
+    parser.add_argument("-x", '--xmarkup', dest="xmarkup", type=int, default=5,
+                        help="number of x axis divisions (time asix)")
+    parser.add_argument("-y", '--ymarkup', dest="ymarkup", type=int, default=5,
+                        help="number of y axis divisions (frequency or/and amplitude axis)")
+    parser.add_argument("-z", '--zmarkup', dest="zmarkup", type=int, default=5,
+                        help="number of z axis divisions (colorbar)")
+
+    # timing parameters
+    parser.add_argument("-m", '--microTime', dest="microTime", type=int, default=20,
+                        help="time of recording audio with microphone in seconds")
+    parser.add_argument("-f", '--frameSizeTime', dest="frameSizeTime", type=int, default=5,
+                        help="size of sliding window in seconds")
+    parser.add_argument("-u", '--updateTime', dest="updateTime", type=int, default=1,
+                        help="update time of sliding window in seconds")
+    parser.add_argument("-w", '--waitTime', dest="waitTime", type=int, default=10,
+                        help="parameter to cv.waitKey() for dynamic update, takes values in milliseconds")
+
+    args = parser.parse_args()
+
+    AudioDrawing(args).Draw()