mirror of https://github.com/opencv/opencv.git
Open Source Computer Vision Library
https://opencv.org/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1071 lines · 40 KiB
#include <opencv2/core.hpp> |
|
#include <opencv2/videoio.hpp> |
|
#include <opencv2/highgui.hpp> |
|
#include <opencv2/imgproc.hpp> |
|
|
|
#include <iostream> |
|
#include <vector> |
|
#include <string> |
|
#include <cmath> |
|
using namespace cv; |
|
using namespace std; |
|
|
|
|
|
// Draws amplitude graphs and/or spectrograms of audio coming from a
// media file or the microphone, in a static or dynamically updating view.
class AudioDrawing
{

public:

    // Parses command line arguments and immediately runs the drawing
    // pipeline. On invalid arguments the process terminates.
    AudioDrawing(const CommandLineParser& parser) {
        if (!initAndCheckArgs(parser))
        {
            cerr << "Error: Wrong input arguments" << endl;
            // BUGFIX: exit(0) reported success to the shell even though
            // argument parsing failed; use a non-zero exit status.
            exit(1);
        }
        Draw();
    }
|
|
|
    // Top-level dispatcher of the drawing pipeline.
    // "static" mode reads the whole input first and shows a single image;
    // "dynamic" mode plots a sliding window that updates over time.
    // The source is either a media file or the default microphone, and the
    // picture is an amplitude graph, a spectrogram, or both stacked.
    void Draw() {
        if (draw == "static")
        {
            vector<int>inputAudio = {};
            int samplingRate = 0;
            if (inputType == "file")
            {
                samplingRate = readAudioFile(audio, inputAudio);
            }
            else if (inputType == "microphone")
            {
                samplingRate = readAudioMicrophone(inputAudio);
            }
            // readAudio* return -1 on open failure; an empty buffer also
            // means nothing usable was captured
            if ((inputAudio.size() == 0) || samplingRate <= 0)
            {
                cerr << "Error: problems with audio reading, check input arguments" << endl;
                return;
            }

            int duration = static_cast<int>(inputAudio.size()) / samplingRate;

            // since the dimensional grid is counted in integer seconds,
            // if the input audio has an incomplete last second,
            // then it is filled with zeros to complete
            int remainder = static_cast<int>(inputAudio.size()) % samplingRate;
            if (remainder)
            {
                int sizeToFullSec = samplingRate - remainder;
                for (int j = 0; j < sizeToFullSec; ++j)
                {
                    inputAudio.push_back(0);
                }
                duration += 1;
                cout << "Update duration of audio to full last second with " <<
                    sizeToFullSec << " zero samples" << endl;
                cout << "New number of samples " << inputAudio.size() << endl;
            }
            cout << "Duration of audio = " << duration << " seconds" << endl;

            // since the dimensional grid is counted in integer seconds,
            // if duration of file is less than xmarkup, to avoid an incorrect display,
            // xmarkup will be taken equal to duration
            // NOTE(review): the code actually uses duration + 1 divisions,
            // not duration as the comment above states -- confirm intent
            if (duration <= xmarkup)
            {
                xmarkup = duration + 1;
            }

            if (graph == "ampl")
            {
                Mat imgAmplitude = drawAmplitude(inputAudio);
                imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
                imshow("Display amplitude graph", imgAmplitude);
                waitKey(0);
            }
            else if (graph == "spec")
            {
                vector<vector<double>>stft = STFT(inputAudio);
                Mat imgSpec = drawSpectrogram(stft);
                imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
                imshow("Display spectrogram", imgSpec);
                waitKey(0);
            }
            else if (graph == "ampl_and_spec")
            {
                // both graphs combined into one image, amplitude on top
                Mat imgAmplitude = drawAmplitude(inputAudio);
                imgAmplitude = drawAmplitudeScale(imgAmplitude, inputAudio, samplingRate);
                vector<vector<double>>stft = STFT(inputAudio);
                Mat imgSpec = drawSpectrogram(stft);
                imgSpec = drawSpectrogramColorbar(imgSpec, inputAudio, samplingRate, stft);
                Mat imgTotal = concatenateImages(imgAmplitude, imgSpec);
                imshow("Display amplitude graph and spectrogram", imgTotal);
                waitKey(0);
            }
        }
        else if (draw == "dynamic")
        {
            if (inputType == "file")
            {
                dynamicFile(audio);
            }
            else if (inputType == "microphone")
            {
                dynamicMicrophone();
            }
        }
    }
|
|
|
~AudioDrawing() { |
|
} |
|
|
|
int readAudioFile(string file, vector<int>& inputAudio) |
|
{ |
|
VideoCapture cap; |
|
vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream, |
|
CAP_PROP_VIDEO_STREAM, -1, |
|
CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; |
|
|
|
cap.open(file, CAP_ANY, params); |
|
if (!cap.isOpened()) |
|
{ |
|
cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; |
|
return -1; |
|
} |
|
const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); |
|
const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); |
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; |
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); |
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; |
|
|
|
vector<int> frameVec; |
|
Mat frame; |
|
for (;;) |
|
{ |
|
if (cap.grab()) |
|
{ |
|
cap.retrieve(frame, audioBaseIndex); |
|
frameVec = frame; |
|
inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); |
|
} |
|
else |
|
{ |
|
cout << "Number of samples: " << inputAudio.size() << endl; |
|
break; |
|
} |
|
} |
|
return samplingRate; |
|
} |
|
|
|
int readAudioMicrophone(vector<int>& inputAudio) |
|
{ |
|
VideoCapture cap; |
|
vector<int> params { CAP_PROP_AUDIO_STREAM, 0, |
|
CAP_PROP_VIDEO_STREAM, -1 }; |
|
|
|
cap.open(0, CAP_ANY, params); |
|
if (!cap.isOpened()) |
|
{ |
|
cerr << "Error: Can't open microphone" << endl; |
|
return -1; |
|
} |
|
|
|
const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); |
|
const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); |
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString( static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; |
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); |
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << samplingRate << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; |
|
|
|
const double cvTickFreq = getTickFrequency(); |
|
int64 sysTimeCurr = getTickCount(); |
|
int64 sysTimePrev = sysTimeCurr; |
|
|
|
vector<int> frameVec; |
|
Mat frame; |
|
while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) |
|
{ |
|
if (cap.grab()) |
|
{ |
|
cap.retrieve(frame, audioBaseIndex); |
|
frameVec = frame; |
|
inputAudio.insert(inputAudio.end(), frameVec.begin(), frameVec.end()); |
|
sysTimeCurr = getTickCount(); |
|
} |
|
else |
|
{ |
|
cerr << "Error: Grab error" << endl; |
|
break; |
|
} |
|
} |
|
cout << "Number of samples: " << inputAudio.size() << endl; |
|
return samplingRate; |
|
} |
|
|
|
|
|
Mat drawAmplitude(vector<int>& inputAudio) |
|
{ |
|
Scalar color = Scalar(247,111,87); |
|
int thickness = 5; |
|
int frameVectorRows = 500; |
|
int middle = frameVectorRows / 2; |
|
// usually the input data is too big, so it is necessary |
|
// to reduce size using interpolation of data |
|
int frameVectorCols = 40000; |
|
if (static_cast<int>(inputAudio.size()) < frameVectorCols) |
|
{ |
|
frameVectorCols = static_cast<int>(inputAudio.size()); |
|
} |
|
|
|
Mat img(frameVectorRows, frameVectorCols, CV_8UC3 , Scalar(255,255,255)); // white background |
|
|
|
vector<double>reshapeAudio(inputAudio.size()); |
|
for (size_t i = 0; i < inputAudio.size(); ++i) |
|
{ |
|
reshapeAudio[i]=static_cast<double>(inputAudio[i]); |
|
} |
|
|
|
Mat img_frameVector( 1, static_cast<int>(reshapeAudio.size()), CV_64F , reshapeAudio.data()); |
|
Mat img_frameVector_resize; |
|
resize(img_frameVector, img_frameVector_resize, Size(frameVectorCols, 1), INTER_LINEAR); |
|
reshapeAudio = img_frameVector_resize; |
|
|
|
// normalization data by maximum element |
|
normalize(reshapeAudio, reshapeAudio, 1.0, 0.0, NORM_INF); |
|
|
|
for (size_t i = 0; i < reshapeAudio.size(); ++i) |
|
{ |
|
reshapeAudio[i] = middle - reshapeAudio[i] * middle; |
|
} |
|
|
|
for (int i = 1; i < static_cast<int>(reshapeAudio.size()); ++i) |
|
{ |
|
line(img, Point(i-1, static_cast<int>(reshapeAudio[i-1])), Point(i, static_cast<int>(reshapeAudio[i])), color, thickness); |
|
} |
|
Mat resImage; |
|
resize(img, resImage, Size(900, 400), INTER_AREA ); |
|
return resImage; |
|
} |
|
|
|
Mat drawAmplitudeScale(Mat& inputImg, const vector<int>& inputAudio, int samplingRate, |
|
int xmin = 0, int xmax = 0) |
|
{ |
|
// function of layout drawing for graph of volume amplitudes |
|
// x axis for time |
|
// y axis for amplitudes |
|
|
|
// parameters for the new image size |
|
int preCol = 100; |
|
int aftCol = 100; |
|
int preLine = 40; |
|
int aftLine = 50; |
|
|
|
int frameVectorRows = inputImg.rows; |
|
int frameVectorCols = inputImg.cols; |
|
|
|
int totalRows = preLine + frameVectorRows + aftLine; |
|
int totalCols = preCol + frameVectorCols + aftCol; |
|
|
|
Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3, Scalar(255, 255, 255)); |
|
inputImg.copyTo(imgTotal(Rect(preCol, preLine, inputImg.cols, inputImg.rows))); |
|
|
|
|
|
// calculating values on x axis |
|
if (xmax == 0) |
|
{ |
|
xmax = static_cast<int>(inputAudio.size()) / samplingRate; |
|
} |
|
std::vector<double> xList(xmarkup); |
|
if (xmax >= xmarkup) |
|
{ |
|
double deltax = (xmax - xmin) / (xmarkup - 1); |
|
for (int i = 0; i < xmarkup; ++i) |
|
{ |
|
xList[i] = (xmin + deltax * i); |
|
} |
|
} |
|
else |
|
{ |
|
// this case is used to display a dynamic update |
|
vector<double> tmpXList; |
|
for (int i = xmin; i < xmax; ++i) |
|
{ |
|
tmpXList.push_back(i + 1); |
|
} |
|
int k = 0; |
|
for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i) |
|
{ |
|
xList[i] = tmpXList[k]; |
|
k += 1; |
|
} |
|
} |
|
|
|
// calculating values on y axis |
|
double minCv; double maxCv; Point minLoc; Point maxLoc; |
|
minMaxLoc(inputAudio, &minCv, &maxCv, &minLoc, &maxLoc); |
|
int ymin = static_cast<int>(minCv); |
|
int ymax = static_cast<int>(maxCv); |
|
|
|
std::vector<double> yList(ymarkup); |
|
double deltay = (ymax - ymin) / (ymarkup - 1); |
|
for (int i = 0; i < ymarkup; ++i) |
|
{ |
|
yList[i] = ymin + deltay * i; |
|
} |
|
|
|
// parameters for layout drawing |
|
int textThickness = 1; |
|
int gridThickness = 1; |
|
Scalar gridColor(0, 0, 0); |
|
Scalar textColor(0, 0, 0); |
|
float fontScale = 0.5; |
|
|
|
// horizontal axis |
|
line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), |
|
gridColor, gridThickness); |
|
// vertical axis |
|
line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), |
|
gridColor, gridThickness); |
|
|
|
// parameters for layout calculation |
|
int serifSize = 10; |
|
int indentDownX = serifSize * 2; |
|
int indentDownY = serifSize / 2; |
|
int indentLeftX = serifSize; |
|
int indentLeftY = 2 * preCol / 3; |
|
|
|
|
|
// drawing layout for x axis |
|
int numX = frameVectorCols / (xmarkup - 1); |
|
for (size_t i = 0; i < xList.size(); ++i) |
|
{ |
|
int a1 = static_cast<int>(preCol + i * numX); |
|
int a2 = frameVectorRows + preLine; |
|
|
|
int b1 = a1; |
|
int b2 = a2 + serifSize; |
|
|
|
if (enableGrid) |
|
{ |
|
int d1 = a1; |
|
int d2 = preLine; |
|
line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); |
|
} |
|
line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); |
|
putText(imgTotal, to_string(int(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX), |
|
FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); |
|
} |
|
|
|
// drawing layout for y axis |
|
int numY = frameVectorRows / (ymarkup - 1); |
|
for (size_t i = 0; i < yList.size(); ++i) { |
|
int a1 = preCol; |
|
int a2 = static_cast<int>(totalRows - aftLine - i * numY); |
|
int b1 = preCol - serifSize; |
|
int b2 = a2; |
|
if (enableGrid) |
|
{ |
|
int d1 = preCol + frameVectorCols; |
|
int d2 = a2; |
|
line(imgTotal, Point(a1, a2), Point(d1, d2), gridColor, gridThickness); |
|
} |
|
line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); |
|
putText(imgTotal, to_string(int(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), |
|
FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); |
|
} |
|
Mat resImage; |
|
resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); |
|
return resImage; |
|
} |
|
|
|
vector<vector<double>> STFT(const vector<int>& inputAudio) |
|
{ |
|
// The Short-time Fourier transform (STFT), is a Fourier-related transform used to |
|
// determine the sinusoidal frequency and phase content of local sections of a signal |
|
// as it changes over time. |
|
// In practice, the procedure for computing STFTs is to divide a longer time signal |
|
// into shorter segments of equal length and then compute the Fourier transform separately |
|
// on each shorter segment. This reveals the Fourier spectrum on each shorter segment. |
|
// One then usually plots the changing spectra as a function of time, known as a spectrogram |
|
// or waterfall plot. |
|
// https://en.wikipedia.org/wiki/Short-time_Fourier_transform |
|
|
|
int timeStep = windLen - overlap; |
|
Mat dstMat; |
|
vector<double> stftRow; |
|
vector<double> WindType; |
|
if (windowType == "Hann") |
|
{ |
|
// https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows |
|
for (int j = 1 - windLen; j < windLen; j+=2) |
|
{ |
|
WindType.push_back(j * (0.5 * (1 - cos(CV_PI * j / (windLen - 1))))); |
|
} |
|
} |
|
else if (windowType == "Hamming") |
|
{ |
|
// https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows |
|
for (int j = 1 - windLen; j < windLen; j+=2) |
|
{ |
|
WindType.push_back(j * (0.53836 - 0.46164 * (cos(CV_PI * j / (windLen - 1))))); |
|
} |
|
} |
|
for (size_t i = 0; i < inputAudio.size(); i += timeStep) |
|
{ |
|
vector<double>section(windLen, 0); |
|
for (int j = 0; j < windLen; ++j) |
|
{ |
|
section[j] = inputAudio[j + i]; |
|
} |
|
if (windowType == "Hann" || windowType == "Hamming") |
|
{ |
|
for (size_t j = 0; j < section.size(); ++j) |
|
{ |
|
section[j] *= WindType[j]; |
|
} |
|
} |
|
|
|
dft(section, dstMat, DFT_COMPLEX_OUTPUT); |
|
|
|
for (int j = 0; j < dstMat.cols / 4; ++j) |
|
{ |
|
double complModule = sqrt(dstMat.at<double>(2*j) * dstMat.at<double>(2*j) + |
|
dstMat.at<double>(2*j+1) * dstMat.at<double>(2*j+1)); |
|
stftRow.push_back(complModule); |
|
} |
|
} |
|
|
|
size_t xSize = inputAudio.size() / timeStep + 1; |
|
// we need only the first part of the spectrum, the second part is symmetrical |
|
size_t ySize = dstMat.cols / 4; |
|
|
|
vector<vector<double>> stft(ySize, vector<double>(xSize, 0.)); |
|
for (size_t i = 0; i < xSize; ++i) |
|
{ |
|
for (size_t j = 0; j < ySize; ++j) |
|
{ |
|
// write elements with transposition and convert it to the decibel scale |
|
double stftElem = stftRow[ i * ySize + j]; |
|
if (stftElem != 0.) |
|
{ |
|
stft[j][i] = 10 * log10(stftElem); |
|
} |
|
} |
|
} |
|
return stft; |
|
} |
|
|
|
Mat drawSpectrogram(const vector<vector<double>>& stft) |
|
{ |
|
int frameVectorRows = static_cast<int>(stft.size()); |
|
int frameVectorCols = static_cast<int>(stft[0].size()); |
|
|
|
// Normalization of image values from 0 to 255 to get more contrast image |
|
// and this normalization will be taken into account in the scale drawing |
|
int colormapImageRows = 255; |
|
|
|
double minCv; double maxCv; Point minLoc; Point maxLoc; |
|
minMaxLoc(stft[0], &minCv, &maxCv, &minLoc, &maxLoc); |
|
double maxStft = max(abs(maxCv), abs(minCv)); |
|
|
|
for (int i = 1; i < frameVectorRows; ++i) |
|
{ |
|
minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); |
|
maxStft = max(maxStft, max(abs(maxCv), abs(minCv))); |
|
} |
|
// if maxStft is zero (silence) |
|
if (maxStft == 0.) |
|
{ |
|
maxStft = 1; |
|
} |
|
Mat imgSpec(frameVectorRows, frameVectorCols, CV_8UC1, Scalar(255, 255, 255)); |
|
|
|
for (int i = 0; i < frameVectorRows; ++i) |
|
{ |
|
for (int j = 0; j < frameVectorCols; ++j) |
|
{ |
|
imgSpec.at<uchar>(frameVectorRows - i - 1, j) = static_cast<uchar>(stft[i][j] * colormapImageRows / maxStft); |
|
} |
|
} |
|
applyColorMap(imgSpec, imgSpec, COLORMAP_INFERNO); |
|
Mat resImage; |
|
resize(imgSpec, resImage, Size(900, 400), INTER_AREA); |
|
return resImage; |
|
} |
|
|
|
Mat drawSpectrogramColorbar(Mat& inputImg, const vector<int>& inputAudio, |
|
int samplingRate, const vector<vector<double>>& stft, |
|
int xmin = 0, int xmax = 0) |
|
{ |
|
// function of layout drawing for the three-dimensional graph of the spectrogram |
|
// x axis for time |
|
// y axis for frequencies |
|
// z axis for magnitudes of frequencies shown by color scale |
|
|
|
// parameters for the new image size |
|
int preCol = 100; |
|
int aftCol = 100; |
|
int preLine = 40; |
|
int aftLine = 50; |
|
int colColor = 20; |
|
int indCol = 20; |
|
|
|
int frameVectorRows = inputImg.rows; |
|
int frameVectorCols = inputImg.cols; |
|
|
|
int totalRows = preLine + frameVectorRows + aftLine; |
|
int totalCols = preCol + frameVectorCols + aftCol; |
|
|
|
Mat imgTotal = Mat(totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); |
|
inputImg.copyTo(imgTotal(Rect(preCol, preLine, frameVectorCols, frameVectorRows))); |
|
|
|
// colorbar image due to drawSpectrogram(..) picture has been normalised from 255 to 0, |
|
// so here colorbar has values from 255 to 0 |
|
int colorArrSize = 256; |
|
Mat imgColorBar = Mat (colorArrSize, colColor, CV_8UC1 , Scalar(255,255,255)); |
|
for (int i = 0; i < colorArrSize; ++i) |
|
{ |
|
for( int j = 0; j < colColor; ++j) |
|
{ |
|
imgColorBar.at<uchar>(i, j) = static_cast<uchar>(colorArrSize - 1 - i); // from 255 to 0 |
|
} |
|
} |
|
|
|
applyColorMap(imgColorBar, imgColorBar, COLORMAP_INFERNO); |
|
resize(imgColorBar, imgColorBar, Size(colColor, frameVectorRows), INTER_AREA); |
|
imgColorBar.copyTo(imgTotal(Rect(preCol + frameVectorCols + indCol, preLine, colColor, frameVectorRows))); |
|
|
|
|
|
// calculating values on x axis |
|
if (xmax == 0) |
|
{ |
|
xmax = static_cast<int>(inputAudio.size()) / samplingRate + 1; |
|
} |
|
vector<double> xList(xmarkup, 0); |
|
if (xmax >= xmarkup) |
|
{ |
|
double deltax = (xmax - xmin) / (xmarkup - 1); |
|
for(int i = 0; i < xmarkup; ++i) |
|
{ |
|
xList[i] = xmin + deltax * i; |
|
} |
|
} |
|
else |
|
{ |
|
// this case is used to display a dynamic update |
|
vector<double> tmpXList; |
|
for(int i = xmin; i < xmax; ++i) |
|
{ |
|
tmpXList.push_back(i + 1); |
|
} |
|
int k = 0; |
|
for (int i = xmarkup - static_cast<int>(tmpXList.size()); i < xmarkup; ++i) |
|
{ |
|
xList[i] = tmpXList[k]; |
|
k += 1; |
|
} |
|
} |
|
|
|
// calculating values on y axis |
|
// according to the Nyquist sampling theorem, |
|
// signal should posses frequencies equal to half of sampling rate |
|
int ymin = 0; |
|
int ymax = static_cast<int>(samplingRate / 2); |
|
|
|
vector<double> yList; |
|
double deltay = (ymax - ymin) / (ymarkup - 1); |
|
for(int i = 0; i < ymarkup; ++i) |
|
{ |
|
yList.push_back(ymin + deltay * i); |
|
} |
|
|
|
// calculating values on z axis |
|
double minCv; double maxCv; Point minLoc; Point maxLoc; |
|
minMaxLoc( stft[0], &minCv, &maxCv, &minLoc, &maxLoc); |
|
double zmin = minCv, zmax = maxCv; |
|
|
|
std::vector<double> zList; |
|
for (size_t i = 1; i < stft.size(); ++i) |
|
{ |
|
minMaxLoc( stft[i], &minCv, &maxCv, &minLoc, &maxLoc); |
|
zmax = max(zmax, maxCv); |
|
zmin = min(zmin, minCv); |
|
} |
|
double deltaz = (zmax - zmin) / (zmarkup - 1); |
|
for(int i = 0; i < zmarkup; ++i) |
|
{ |
|
zList.push_back(zmin + deltaz * i); |
|
} |
|
|
|
// parameters for layout drawing |
|
int textThickness = 1; |
|
int gridThickness = 1; |
|
Scalar gridColor(0,0,0); |
|
Scalar textColor(0,0,0); |
|
float fontScale = 0.5; |
|
|
|
int serifSize = 10; |
|
int indentDownX = serifSize * 2; |
|
int indentDownY = serifSize / 2; |
|
int indentLeftX = serifSize; |
|
int indentLeftY = 2 * preCol / 3; |
|
|
|
// horizontal axis |
|
line(imgTotal, Point(preCol, totalRows - aftLine), Point(preCol + frameVectorCols, totalRows - aftLine), |
|
gridColor, gridThickness); |
|
// vertical axis |
|
line(imgTotal, Point(preCol, preLine), Point(preCol, preLine + frameVectorRows), |
|
gridColor, gridThickness); |
|
|
|
// drawing layout for x axis |
|
int numX = frameVectorCols / (xmarkup - 1); |
|
for (size_t i = 0; i < xList.size(); ++i) |
|
{ |
|
int a1 = static_cast<int>(preCol + i * numX); |
|
int a2 = frameVectorRows + preLine; |
|
|
|
int b1 = a1; |
|
int b2 = a2 + serifSize; |
|
|
|
line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); |
|
putText(imgTotal, to_string(static_cast<int>(xList[i])), Point(b1 - indentLeftX, b2 + indentDownX), |
|
FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); |
|
} |
|
|
|
// drawing layout for y axis |
|
int numY = frameVectorRows / (ymarkup - 1); |
|
for (size_t i = 0; i < yList.size(); ++i) |
|
{ |
|
int a1 = preCol; |
|
int a2 = static_cast<int>(totalRows - aftLine - i * numY); |
|
|
|
int b1 = preCol - serifSize; |
|
int b2 = a2; |
|
|
|
line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); |
|
putText(imgTotal, to_string(static_cast<int>(yList[i])), Point(b1 - indentLeftY, b2 + indentDownY), |
|
FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); |
|
} |
|
|
|
// drawing layout for z axis |
|
int numZ = frameVectorRows / (zmarkup - 1); |
|
for (size_t i = 0; i < zList.size(); ++i) |
|
{ |
|
int a1 = preCol + frameVectorCols + indCol + colColor; |
|
int a2 = static_cast<int>(totalRows - aftLine - i * numZ); |
|
|
|
int b1 = a1 + serifSize; |
|
int b2 = a2; |
|
|
|
line(imgTotal, Point(a1, a2), Point(b1, b2), gridColor, gridThickness); |
|
putText(imgTotal, to_string(static_cast<int>(zList[i])), Point(b1 + 10, b2 + indentDownY), |
|
FONT_HERSHEY_SIMPLEX, fontScale, textColor, textThickness); |
|
} |
|
Mat resImage; |
|
resize(imgTotal, resImage, Size(cols, rows), INTER_AREA ); |
|
return resImage; |
|
} |
|
|
|
Mat concatenateImages(Mat& img1, Mat& img2) |
|
{ |
|
// first image will be under the second image |
|
int totalRows = img1.rows + img2.rows; |
|
int totalCols = max(img1.cols , img2.cols); |
|
// if images columns do not match, the difference is filled in white |
|
Mat imgTotal = Mat (totalRows, totalCols, CV_8UC3 , Scalar(255, 255, 255)); |
|
|
|
img1.copyTo(imgTotal(Rect(0, 0, img1.cols, img1.rows))); |
|
img2.copyTo(imgTotal(Rect(0, img1.rows, img2.cols, img2.rows))); |
|
return imgTotal; |
|
} |
|
|
|
void dynamicFile(const string file) |
|
{ |
|
VideoCapture cap; |
|
vector<int> params { CAP_PROP_AUDIO_STREAM, audioStream, |
|
CAP_PROP_VIDEO_STREAM, -1, |
|
CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; |
|
|
|
cap.open(file, CAP_ANY, params); |
|
if (!cap.isOpened()) |
|
{ |
|
cerr << "Error : Can't read audio file: '" << audio << "' with audioStream = " << audioStream << endl; |
|
return; |
|
} |
|
|
|
const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); |
|
const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); |
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); |
|
|
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; |
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; |
|
|
|
int step = static_cast<int>(updateTime * samplingRate); |
|
int frameSize = static_cast<int>(frameSizeTime * samplingRate); |
|
|
|
// since the dimensional grid is counted in integer seconds, |
|
// if duration of audio frame is less than xmarkup, to avoid an incorrect display, |
|
// xmarkup will be taken equal to duration |
|
if (frameSizeTime <= xmarkup) |
|
{ |
|
xmarkup = frameSizeTime; |
|
} |
|
|
|
vector<int> buffer; |
|
vector<int> frameVector; |
|
vector<int> section(frameSize, 0); |
|
vector<vector<double>>stft; |
|
Mat frame, imgAmplitude, imgSpec, imgTotal; |
|
int currentSamples = 0; |
|
int xmin = 0; |
|
int xmax = 0; |
|
|
|
for (;;) |
|
{ |
|
if (cap.grab()) |
|
{ |
|
cap.retrieve(frame, audioBaseIndex); |
|
frameVector = frame; |
|
buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); |
|
int bufferSize = static_cast<int>(buffer.size()); |
|
if (bufferSize >= step) |
|
{ |
|
currentSamples += bufferSize; |
|
section.erase(section.begin(), section.begin() + step); |
|
section.insert(section.end(), buffer.begin(), buffer.end()); |
|
buffer.erase(buffer.begin(), buffer.begin() + step); |
|
if (currentSamples < frameSize) |
|
{ |
|
xmin = 0; |
|
xmax = (currentSamples) / samplingRate; |
|
} |
|
else |
|
{ |
|
xmin = (currentSamples - frameSize) / samplingRate + 1; |
|
xmax = (currentSamples) / samplingRate; |
|
} |
|
|
|
if (graph == "ampl") |
|
{ |
|
imgAmplitude = drawAmplitude(section); |
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); |
|
imshow("Display amplitude graph", imgAmplitude); |
|
waitKey(waitTime); |
|
} |
|
else if (graph == "spec") |
|
{ |
|
stft = STFT(section); |
|
imgSpec = drawSpectrogram(stft); |
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); |
|
imshow("Display spectrogram", imgSpec); |
|
waitKey(waitTime); |
|
} |
|
else if (graph == "ampl_and_spec") |
|
{ |
|
imgAmplitude = drawAmplitude(section); |
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); |
|
stft = STFT(section); |
|
imgSpec = drawSpectrogram(stft); |
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); |
|
imgTotal = concatenateImages(imgAmplitude, imgSpec); |
|
imshow("Display amplitude graph and spectrogram", imgTotal); |
|
waitKey(waitTime); |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
break; |
|
} |
|
} |
|
|
|
} |
|
|
|
void dynamicMicrophone() |
|
{ |
|
VideoCapture cap; |
|
vector<int> params { CAP_PROP_AUDIO_STREAM, 0, |
|
CAP_PROP_VIDEO_STREAM, -1 }; |
|
|
|
cap.open(0, CAP_MSMF, params); |
|
if (!cap.isOpened()) |
|
{ |
|
cerr << "Error: Can't open microphone" << endl; |
|
return; |
|
} |
|
|
|
const int audioBaseIndex = static_cast<int>(cap.get(CAP_PROP_AUDIO_BASE_INDEX)); |
|
const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS)); |
|
int samplingRate = static_cast<int>(cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND)); |
|
cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString(static_cast<int>(cap.get(CAP_PROP_AUDIO_DATA_DEPTH))) << endl; |
|
cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; |
|
cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; |
|
|
|
const double cvTickFreq = getTickFrequency(); |
|
int64 sysTimeCurr = getTickCount(); |
|
int64 sysTimePrev = sysTimeCurr; |
|
|
|
int step = (updateTime * samplingRate); |
|
int frameSize = (frameSizeTime * samplingRate); |
|
// since the dimensional grid is counted in integer seconds, |
|
// if duration of audio frame is less than xmarkup, to avoid an incorrect display, |
|
// xmarkup will be taken equal to duration |
|
if (frameSizeTime <= xmarkup) |
|
{ |
|
xmarkup = frameSizeTime; |
|
} |
|
|
|
vector<int> frameVector; |
|
vector<int> buffer; |
|
vector<int> section(frameSize, 0); |
|
Mat frame, imgAmplitude, imgSpec, imgTotal; |
|
|
|
int currentSamples = 0; |
|
vector<vector<double>> stft; |
|
int xmin = 0; |
|
int xmax = 0; |
|
waitTime = updateTime * 1000; |
|
while ((sysTimeCurr - sysTimePrev) / cvTickFreq < microTime) |
|
{ |
|
if (cap.grab()) |
|
{ |
|
cap.retrieve(frame, audioBaseIndex); |
|
frameVector = frame; |
|
buffer.insert(buffer.end(), frameVector.begin(), frameVector.end()); |
|
sysTimeCurr = getTickCount(); |
|
|
|
int bufferSize = static_cast<int>(buffer.size()); |
|
if (bufferSize >= step) |
|
{ |
|
currentSamples += step; |
|
section.erase(section.begin(), section.begin() + step); |
|
section.insert(section.end(), buffer.begin(), buffer.end()); |
|
buffer.erase(buffer.begin(), buffer.begin() + step); |
|
|
|
if (currentSamples < frameSize) |
|
{ |
|
xmin = 0; |
|
xmax = (currentSamples) / samplingRate; |
|
} |
|
else |
|
{ |
|
xmin = (currentSamples - frameSize) / samplingRate + 1; |
|
xmax = (currentSamples) / samplingRate; |
|
} |
|
|
|
if (graph == "ampl") |
|
{ |
|
imgAmplitude = drawAmplitude(section); |
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); |
|
imshow("Display amplitude graph", imgAmplitude); |
|
waitKey(waitTime); |
|
} |
|
else if (graph == "spec") |
|
{ |
|
stft = STFT(section); |
|
imgSpec = drawSpectrogram(stft); |
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); |
|
imshow("Display spectrogram", imgSpec); |
|
waitKey(waitTime); |
|
} |
|
else if (graph == "ampl_and_spec") |
|
{ |
|
imgAmplitude = drawAmplitude(section); |
|
imgAmplitude = drawAmplitudeScale(imgAmplitude, section, samplingRate, xmin, xmax); |
|
stft = STFT(section); |
|
imgSpec = drawSpectrogram(stft); |
|
imgSpec = drawSpectrogramColorbar(imgSpec, section, samplingRate, stft, xmin, xmax); |
|
imgTotal = concatenateImages(imgAmplitude, imgSpec); |
|
imshow("Display amplitude graph and spectrogram", imgTotal); |
|
waitKey(waitTime); |
|
} |
|
} |
|
} |
|
else |
|
{ |
|
cerr << "Error: Grab error" << endl; |
|
break; |
|
} |
|
} |
|
|
|
} |
|
|
|
bool initAndCheckArgs(const CommandLineParser& parser) |
|
{ |
|
inputType = parser.get<string>("inputType"); |
|
if ((inputType != "file") && (inputType != "microphone")) |
|
{ |
|
cout << "Error: " << inputType << " input method doesnt exist" << endl; |
|
return false; |
|
} |
|
|
|
draw = parser.get<string>("draw"); |
|
if ((draw != "static") && (draw != "dynamic")) |
|
{ |
|
cout << "Error: " << draw << " draw type doesnt exist" << endl; |
|
return false; |
|
} |
|
|
|
graph = parser.get<string>("graph"); |
|
if ((graph != "ampl") && (graph != "spec") && (graph != "ampl_and_spec")) |
|
{ |
|
cout << "Error: " << graph << " type of graph doesnt exist" << endl; |
|
return false; |
|
} |
|
|
|
audio = samples::findFile(parser.get<std::string>("audio")); |
|
|
|
audioStream = parser.get<int>("audioStream"); |
|
if (audioStream < 0) |
|
{ |
|
cout << "Error: audioStream = " << audioStream << " - incorrect value. Must be >= 0" << endl; |
|
return false; |
|
} |
|
windowType = parser.get<string>("windowType"); |
|
if ((windowType != "Rect") && (windowType != "Hann") && (windowType != "Hamming")) |
|
{ |
|
cout << "Error: " << windowType << " type of window doesnt exist" << endl; |
|
return false; |
|
} |
|
|
|
windLen = parser.get<int>("windLen"); |
|
if (windLen <= 0) |
|
{ |
|
cout << "Error: windLen = " << windLen << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
|
|
overlap = parser.get<int>("overlap"); |
|
if (overlap <= 0) |
|
{ |
|
cout << "Error: overlap = " << overlap << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
|
|
enableGrid = parser.get<bool>("enableGrid"); |
|
|
|
rows = parser.get<int>("rows"); |
|
if (rows <= 0) |
|
{ |
|
cout << "Error: rows = " << rows << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
cols = parser.get<int>("cols"); |
|
|
|
if (cols <= 0) |
|
{ |
|
cout << "Error: cols = " << cols << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
xmarkup = parser.get<int>("xmarkup"); |
|
if (xmarkup < 2) |
|
{ |
|
cout << "Error: xmarkup = " << xmarkup << " - incorrect value. Must be >= 2" << endl; |
|
return false; |
|
} |
|
ymarkup = parser.get<int>("ymarkup"); |
|
if (ymarkup < 2) |
|
{ |
|
cout << "Error: ymarkup = " << ymarkup << " - incorrect value. Must be >= 2" << endl; |
|
return false; |
|
} |
|
zmarkup = parser.get<int>("zmarkup"); |
|
if (zmarkup < 2) |
|
{ |
|
cout << "Error: zmarkup = " << zmarkup << " - incorrect value. Must be >= 2" << endl; |
|
return false; |
|
} |
|
microTime = parser.get<int>("microTime"); |
|
if (microTime <= 0) |
|
{ |
|
cout << "Error: microTime = " << microTime << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
frameSizeTime = parser.get<int>("frameSizeTime"); |
|
if (frameSizeTime <= 0) |
|
{ |
|
cout << "Error: frameSizeTime = " << frameSizeTime << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
updateTime = parser.get<int>("updateTime"); |
|
if (updateTime <= 0) |
|
{ |
|
cout << "Error: updateTime = " << updateTime << " - incorrect value. Must be > 0" << endl; |
|
return false; |
|
} |
|
waitTime = parser.get<int>("waitTime"); |
|
if (waitTime < 0) |
|
{ |
|
cout << "Error: waitTime = " << waitTime << " - incorrect value. Must be >= 0" << endl; |
|
return false; |
|
} |
|
return true; |
|
} |
|
|
|
private :
    string inputType;       // "file" or "microphone"
    string draw;            // "static" or "dynamic"
    string graph;           // "ampl", "spec" or "ampl_and_spec"
    string audio;           // path to the input media file
    int audioStream;        // index of the audio stream within the file

    string windowType;      // STFT window: "Rect", "Hann" or "Hamming"
    int windLen;            // STFT window length in samples
    int overlap;            // overlap of consecutive STFT windows in samples

    bool enableGrid;        // draw a grid on the amplitude graph

    int rows;               // height of the final output image in pixels
    int cols;               // width of the final output image in pixels

    int xmarkup;            // number of divisions on the x (time) axis
    int ymarkup;            // number of divisions on the y axis
    int zmarkup;            // number of divisions on the z axis (colorbar)

    int microTime;          // microphone recording time, seconds
    int frameSizeTime;      // sliding window size, seconds
    int updateTime;         // sliding window update period, seconds
    int waitTime;           // cv::waitKey() delay for dynamic mode, milliseconds

};
|
|
|
int main(int argc, char** argv) |
|
{ |
|
const String keys = |
|
"{help h usage ? | | this sample draws a volume graph and/or spectrogram of audio/video files and microphone \n\t\tDefault usage: ./Spectrogram.exe}" |
|
"{inputType i | file | file or microphone }" |
|
"{draw d | static | type of drawing: \n\t\t\tstatic - for plotting graph(s) across the entire input audio \n\t\t\tdynamic - for plotting graph(s) in a time-updating window}" |
|
"{graph g | ampl_and_spec | type of graph: amplitude graph or/and spectrogram. Please use tags below : \n\t\t\tampl - draw the amplitude graph \n\t\t\tspec - draw the spectrogram\n\t\t\tampl_and_spec - draw the amplitude graph and spectrogram on one image under each other}" |
|
"{audio a | Megamind.avi | name and path to file }" |
|
"{audioStream s | 1 | CAP_PROP_AUDIO_STREAM value. Select audio stream number }" |
|
"{windowType t | Rect | type of window for STFT. Please use tags below : \n\t\t\tRect/Hann/Hamming }" |
|
"{windLen l | 256 | size of window for STFT }" |
|
"{overlap o | 128 | overlap of windows for STFT }" |
|
|
|
"{enableGrid | false | grid on the amplitude graph }" |
|
|
|
"{rows r | 400 | rows of output image }" |
|
"{cols c | 900 | cols of output image }" |
|
|
|
"{xmarkup x | 5 | number of x axis divisions (time asix) }" |
|
"{ymarkup y | 5 | number of y axis divisions (frequency or/and amplitude axis) }" |
|
"{zmarkup z | 5 | number of z axis divisions (colorbar) }" |
|
|
|
"{microTime m | 20 | time of recording audio with microphone in seconds }" |
|
"{frameSizeTime f| 5 | size of sliding window in seconds }" |
|
"{updateTime u | 1 | update time of sliding window in seconds }" |
|
"{waitTime w | 10 | parameter to cv.waitKey() for dynamic update of file input, takes values in milliseconds }" |
|
; |
|
|
|
CommandLineParser parser(argc, argv, keys); |
|
if (parser.has("help")) |
|
{ |
|
parser.printMessage(); |
|
return 0; |
|
} |
|
|
|
AudioDrawing draw(parser); |
|
return 0; |
|
} |