Merge pull request #19721 from MaximMilashchenko:Audio

add audio support in cap_msmf * audio msmf * fixed warnings * minor fix * fixed SampleTime MSMF * minor fix, fixed audio test, retrieveAudioFrame * fixed warnings * implemented sync audio and video stream with start offset * fixed error * fixed docs * fixed audio sample * CAP_PROP_AUDIO_POS, minor fixed * fixed warnings * videoio(MSMF): update audio test checks, add debug logging * fixed * fixed desynchronization of time positions, warnings * fixed warnings * videoio(audio): tune tests checks * videoio(audio): update properties description * build warnings Co-authored-by: Alexander Alekhin <alexander.a.alekhin@gmail.com>
3 years ago · f36c268b9e
parent 824392a1c2
commit f36c268b9e
7 changed files with 1360 additions and 136 deletions
--- a/modules/videoio/include/opencv2/videoio.hpp
+++ b/modules/videoio/include/opencv2/videoio.hpp
@ -189,6 +189,17 @@ enum VideoCaptureProperties {
       CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg back-end only)
       CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg back-end only)
       CAP_PROP_STREAM_OPEN_TIME_USEC =55, //<! (read-only) time in microseconds since Jan 1 1970 when stream was opened. Applicable for FFmpeg backend only. Useful for RTSP and other live streams
       CAP_PROP_VIDEO_TOTAL_CHANNELS = 56, //!< (read-only) Number of video channels
       CAP_PROP_VIDEO_STREAM = 57, //!< (**open-only**) Specify video stream, 0-based index. Use -1 to disable video stream from file or IP cameras. Default value is 0.
       CAP_PROP_AUDIO_STREAM = 58, //!< (**open-only**) Specify stream in multi-language media files, -1 - disable audio processing or microphone. Default value is -1.
       CAP_PROP_AUDIO_POS = 59, //!< (read-only) Audio position is measured in samples. Accurate audio sample timestamp of previous grabbed fragment. See CAP_PROP_AUDIO_SAMPLES_PER_SECOND and CAP_PROP_AUDIO_SHIFT_NSEC.
       CAP_PROP_AUDIO_SHIFT_NSEC = 60, //!< (read only) Contains the time difference between the start of the audio stream and the video stream in nanoseconds. Positive value means that audio is started after the first video frame. Negative value means that audio is started before the first video frame.
       CAP_PROP_AUDIO_DATA_DEPTH = 61, //!< (open, read) Alternative definition to bits-per-sample, but with clear handling of 32F / 32S
       CAP_PROP_AUDIO_SAMPLES_PER_SECOND = 62, //!< (read-only) determined from file/codec input. If not specified, then selected audio sample rate is 44100
       CAP_PROP_AUDIO_BASE_INDEX = 63, //!< (read-only) Index of the first audio channel for .retrieve() calls. That audio channel number continues enumeration after video channels.
       CAP_PROP_AUDIO_TOTAL_CHANNELS = 64, //!< (read-only) Number of audio channels in the selected audio stream (mono, stereo, etc)
       CAP_PROP_AUDIO_TOTAL_STREAMS = 65, //!< (read-only) Number of audio streams.
       CAP_PROP_AUDIO_SYNCHRONIZE = 66, //!< (open, read) Enables audio synchronization.
 #ifndef CV_DOXYGEN
       CV__CAP_PROP_LATEST
 #endif
--- a/modules/videoio/src/cap_msmf.cpp
+++ b/modules/videoio/src/cap_msmf.cpp
--- a/modules/videoio/test/test_audio.cpp
+++ b/modules/videoio/test/test_audio.cpp
@ -0,0 +1,273 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
 namespace opencv_test { namespace {
 // Parameters of the combined audio+video test, in tuple order:
 //   <0> file name, <1> number of audio channels, <2> epsilon, <3> video type,
 //   <4> height, <5> width, <6> number of frames, <7> number of audio samples,
 //   <8> fps, <9> PSNR threshold, <10> backend
 using paramCombination = std::tuple<std::string, int, double, int, int, int, int, int, int, double, VideoCaptureAPIs>;
 // Parameters of the audio-only test, in tuple order:
 //   <0> file name, <1> number of audio channels, <2> number of audio samples,
 //   <3> epsilon, <4> backend
 using param = std::tuple<std::string, int, int, double, VideoCaptureAPIs>;
 class AudioBaseTest
 {
 protected:
    AudioBaseTest(){};
    void getValidAudioData()
    {
        const double step = 3.14/22050;
        double value = 0;
        validAudioData.resize(expectedNumAudioCh);
        for (int nCh = 0; nCh < expectedNumAudioCh; nCh++)
        {
            value = 0;
            for(unsigned int i = 0; i < numberOfSamples; i++)
            {
                if (i != 0 && i % 44100 == 0)
                    value = 0;
                validAudioData[nCh].push_back(sin(value));
                value += step;
            }
        }
    }
    void checkAudio()
    {
        getValidAudioData();
        ASSERT_EQ(expectedNumAudioCh, (int)audioData.size());
        for (unsigned int nCh = 0; nCh < audioData.size(); nCh++)
        {
            ASSERT_EQ(numberOfSamples, audioData[nCh].size()) << "nCh=" << nCh;
            for (unsigned int i = 0; i < numberOfSamples; i++)
            {
                EXPECT_NEAR(validAudioData[nCh][i], audioData[nCh][i], epsilon) << "sample index=" << i << " nCh=" << nCh;
            }
        }
    }
 protected:
    int expectedNumAudioCh;
    unsigned int numberOfSamples;
    double epsilon;
    VideoCaptureAPIs backend;
    std::string root;
    std::string fileName;
    std::vector<std::vector<double>> validAudioData;
    std::vector<std::vector<double>> audioData;
    std::vector<int> params;
    Mat audioFrame;
    VideoCapture cap;
 };
 // Audio-only fixture: opens the file with the video stream disabled, decodes
 // every audio channel as 16-bit samples and compares the normalized result
 // with the reference signal.
 class AudioTestFixture : public AudioBaseTest, public testing::TestWithParam <param>
 {
 public:
    // Unpacks the test parameters: <0> file name, <1> number of audio channels,
    // <2> number of audio samples, <3> epsilon, <4> backend.
    AudioTestFixture()
    {
        fileName = get<0>(GetParam());
        expectedNumAudioCh = get<1>(GetParam());
        numberOfSamples = get<2>(GetParam());
        epsilon = get<3>(GetParam());
        backend = get<4>(GetParam());
        root = "audio/";
        params = {  CAP_PROP_AUDIO_STREAM, 0,
                    CAP_PROP_VIDEO_STREAM, -1,
                    CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
    }
    // Reads the whole stream, collecting the samples of each channel scaled by
    // 1/32768, then runs the reference comparison.
    void doTest()
    {
        ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params));
        const int audioBaseIndex = static_cast<int>(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX));
        const int numberOfChannels = static_cast<int>(cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS));
        ASSERT_EQ(expectedNumAudioCh, numberOfChannels);
        audioData.resize(numberOfChannels);
        while (cap.grab())
        {
            for (int nCh = 0; nCh < numberOfChannels; nCh++)
            {
                // Each channel is retrieved through its own index, counted from
                // the base index; using the base index alone would fill every
                // channel with the data of the first one.
                ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex + nCh));
                ASSERT_EQ(CV_16SC1, audioFrame.type()) << audioData[nCh].size();
                for (int i = 0; i < audioFrame.cols; i++)
                {
                    const double f = audioFrame.at<signed short>(0, i) / 32768.0;
                    audioData[nCh].push_back(f);
                }
            }
        }
        ASSERT_FALSE(audioData.empty());
        checkAudio();
    }
 };
 // Audio-only test cases. Fields per entry:
 // file name, number of audio channels, number of audio samples, epsilon, backend.
 const param audioParams[] =
 {
    param{"test_audio.wav", 1, 132300, 0.0001, cv::CAP_MSMF},
    param{"test_mono_audio.mp3", 1, 133104, 0.12, cv::CAP_MSMF},
    param{"test_stereo_audio.mp3", 2, 133104, 0.12, cv::CAP_MSMF},
    param{"test_audio.mp4", 1, 133104, 0.15, cv::CAP_MSMF}
 };
 // Value-parameterized suite running the audio-only fixture over audioParams.
 class Audio : public AudioTestFixture{};
 TEST_P(Audio, audio)
 {
    // A missing backend is reported as a skipped test instead of a failure.
    const bool backendAvailable = videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend));
    if (!backendAvailable)
    {
        throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " backend was not found");
    }
    doTest();
 }
 INSTANTIATE_TEST_CASE_P(/**/, Audio, testing::ValuesIn(audioParams));
 // Combined audio+video fixture: opens a file with both streams enabled and, frame
 // by frame, checks the decoded video image, collects the audio samples and
 // validates the audio position against the video timestamp.
 class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam <paramCombination>
 {
 public:
    // Unpacks the test parameters: <0> file name, <1> number of audio channels,
    // <2> epsilon, <3> video type, <4> height, <5> width, <6> number of frames,
    // <7> number of audio samples, <8> fps, <9> PSNR threshold, <10> backend.
    MediaTestFixture():
        videoType(get<3>(GetParam())),
        height(get<4>(GetParam())),
        width(get<5>(GetParam())),
        numberOfFrames(get<6>(GetParam())),
        fps(get<8>(GetParam())),
        psnrThreshold(get<9>(GetParam()))
        {
            fileName = get<0>(GetParam());
            expectedNumAudioCh = get<1>(GetParam());
            numberOfSamples = get<7>(GetParam());
            epsilon = get<2>(GetParam());
            backend = get<10>(GetParam());
            root = "audio/";
            params = {  CAP_PROP_AUDIO_STREAM, 0,
                        CAP_PROP_VIDEO_STREAM, 0,
                        CAP_PROP_AUDIO_DATA_DEPTH, CV_16S };
        };
    // Decodes exactly numberOfFrames frames and verifies video content, audio
    // frame sizes, audio/video time alignment and finally the audio samples.
    void doTest()
    {
        ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params));
        const int audioBaseIndex = static_cast<int>(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX));
        const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
        ASSERT_EQ(expectedNumAudioCh, numberOfChannels);
        const int samplePerSecond = (int)cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND);
        ASSERT_EQ(44100, samplePerSecond);
        // Number of audio samples spanning one video frame, and the allowed
        // deviation (half a frame) for a single audio chunk.
        int samplesPerFrame = (int)(1./fps*samplePerSecond);
        int audioSamplesTolerance = samplesPerFrame / 2;
        double audio0_timestamp = 0;
        Mat videoFrame;
        Mat img(height, width, videoType);
        audioData.resize(numberOfChannels);
        for (int frame = 0; frame < numberOfFrames; frame++)
        {
            SCOPED_TRACE(cv::format("frame=%d", frame));
            ASSERT_TRUE(cap.grab());
            if (frame == 0)
            {
                // Start time of the audio stream in seconds: the first video
                // timestamp plus the reported audio shift.
                double audio_shift = cap.get(CAP_PROP_AUDIO_SHIFT_NSEC);
                double video0_timestamp = cap.get(CAP_PROP_POS_MSEC) * 1e-3;
                audio0_timestamp = video0_timestamp + audio_shift * 1e-9;
                std::cout << "video0 timestamp: " << video0_timestamp << "  audio0 timestamp: " << audio0_timestamp << " (audio shift nanoseconds: " << audio_shift << " , seconds: " << audio_shift * 1e-9 << ")" << std::endl;
            }
            ASSERT_TRUE(cap.retrieve(videoFrame));
            // A negative epsilon disables the content checks (the video PSNR
            // check here and the audio comparison after the loop).
            if (epsilon >= 0)
            {
                // generateFrame() is defined outside this file; presumably it
                // renders the expected image for this frame index - TODO confirm.
                generateFrame(frame, numberOfFrames, img);
                ASSERT_EQ(img.size, videoFrame.size);
                double psnr = cvtest::PSNR(img, videoFrame);
                EXPECT_GE(psnr, psnrThreshold);
            }
            // Size of the audio chunk of channel 0; stays 0 when that chunk is empty.
            int audioFrameCols = 0;
            for (int nCh = 0; nCh < numberOfChannels; nCh++)
            {
                ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex+nCh));
                if (audioFrame.empty())
                    continue;
                ASSERT_EQ(CV_16SC1, audioFrame.type());
                // All non-empty channels must deliver the same number of samples.
                if (nCh == 0)
                    audioFrameCols = audioFrame.cols;
                else
                    ASSERT_EQ(audioFrameCols, audioFrame.cols) << "channel "<< nCh;
                for (int i = 0; i < audioFrame.cols; i++)
                {
                    // Scale the 16-bit sample by 1/32768.
                    double f = audioFrame.at<signed short>(0,i) / 32768.0;
                    audioData[nCh].push_back(f);
                }
            }
            // Log only the first and the last five frames to keep the output short.
            if (frame < 5 || frame >= numberOfFrames-5)
                std::cout << "frame=" << frame << ":  audioFrameSize=" << audioFrameCols << "  videoTimestamp=" << cap.get(CAP_PROP_POS_MSEC) << " ms" << std::endl;
            else if (frame == 6)
                std::cout << "frame..." << std::endl;
            // Nothing to validate when channel 0 delivered no samples for this frame.
            if (audioFrameCols == 0)
                continue;
            // The first and the last frame are excluded from the checks below.
            if (frame != 0 && frame != numberOfFrames-1)
            {
                // validate audio position
                EXPECT_NEAR(
                        cap.get(CAP_PROP_AUDIO_POS) / samplePerSecond + audio0_timestamp,
                        cap.get(CAP_PROP_POS_MSEC) * 1e-3,
                        (1.0 / fps) * 0.3)
                    << "CAP_PROP_AUDIO_POS=" << cap.get(CAP_PROP_AUDIO_POS) << " CAP_PROP_POS_MSEC=" << cap.get(CAP_PROP_POS_MSEC);
            }
            // The size check is also skipped once channel 0 holds exactly the
            // expected total number of samples.
            if (frame != 0 && frame != numberOfFrames-1 && audioData[0].size() != (size_t)numberOfSamples)
            {
                // validate audio frame size
                EXPECT_NEAR(audioFrame.cols, samplesPerFrame, audioSamplesTolerance);
            }
        }
        // The stream must be exhausted after numberOfFrames successful grabs.
        ASSERT_FALSE(cap.grab());
        ASSERT_FALSE(audioData.empty());
        std::cout << "Total audio samples=" << audioData[0].size() << std::endl;
        if (epsilon >= 0)
            checkAudio();
    }
 protected:
    const int videoType;         // Mat type used for the reference image
    const int height;            // reference image rows
    const int width;             // reference image columns
    const int numberOfFrames;    // number of frames that must be grabbed successfully
    const int fps;               // frame rate used to derive samples per frame and time tolerance
    const double psnrThreshold;  // minimal accepted PSNR between reference and decoded frame
 };
 // Combined audio+video test cases. Fields per entry:
 // file name, number of audio channels, epsilon, video type, height, width,
 // number of frames, number of audio samples, fps, PSNR threshold, backend.
 const paramCombination mediaParams[] =
 {
    paramCombination{"test_audio.mp4", 1, 0.15, CV_8UC3, 240, 320, 90, 131819, 30, 30., cv::CAP_MSMF}
 #if 0
    // Disabled entry for an external sample file:
    // https://filesamples.com/samples/video/mp4/sample_960x400_ocean_with_audio.mp4
    , paramCombination{"sample_960x400_ocean_with_audio.mp4", 2, -1/*epsilon*/, CV_8UC3, 400, 960, 1116, 2056588, 30, 30., cv::CAP_MSMF}
 #endif
 };
 // Value-parameterized suite running the audio+video fixture over mediaParams.
 class Media : public MediaTestFixture{};
 TEST_P(Media, audio)
 {
    // A missing backend is reported as a skipped test instead of a failure.
    const bool backendAvailable = videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend));
    if (!backendAvailable)
    {
        throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " backend was not found");
    }
    doTest();
 }
 INSTANTIATE_TEST_CASE_P(/**/, Media, testing::ValuesIn(mediaParams));
 }} //namespace
--- a/modules/videoio/test/test_microphone.cpp
+++ b/modules/videoio/test/test_microphone.cpp
@ -0,0 +1,41 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 // Usage: opencv_test_videoio --gtest_also_run_disabled_tests
 #include "test_precomp.hpp"
 namespace opencv_test { namespace {
 // Manual microphone check (disabled by default): captures audio from device 0
 // for about 10 seconds and verifies that the reported audio position is not
 // more than 5% short of the sample count expected for the elapsed time.
 TEST(DISABLED_videoio_micro, basic)
 {
    std::vector<int> params { CAP_PROP_AUDIO_STREAM, 0, CAP_PROP_VIDEO_STREAM, -1 };
    VideoCapture cap(0, cv::CAP_MSMF, params);
    ASSERT_TRUE(cap.isOpened());
    const int samplesPerSecond = (int)cap.get(cv::CAP_PROP_AUDIO_SAMPLES_PER_SECOND);
    const int audio_base_index = (int)cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX);
    const double cvTickFreq = cv::getTickFrequency();
    const int64 sysTimePrev = cv::getTickCount();
    int64 sysTimeCurr = sysTimePrev;
    Mat frame;
    cout << "Audio would be captured for the next 10 seconds" << endl;
    while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10)
    {
        if (cap.grab())
        {
            ASSERT_TRUE(cap.retrieve(frame, audio_base_index));
        }
        // Refresh the clock on every iteration, not only after a successful
        // grab; otherwise a persistently failing grab() never lets the
        // 10-second limit expire and the test hangs.
        sysTimeCurr = cv::getTickCount();
    }
    // Expected sample count for the elapsed time, truncated to whole seconds.
    const int validSize = samplesPerSecond*(int)((sysTimeCurr-sysTimePrev)/cvTickFreq);
    const int cursize = (int)cap.get(cv::CAP_PROP_AUDIO_POS);
    ASSERT_LT(validSize - cursize, cursize*0.05);
 }
 }} // namespace
--- a/samples/cpp/videocapture_audio.cpp
+++ b/samples/cpp/videocapture_audio.cpp
@ -0,0 +1,59 @@
 #include <opencv2/core.hpp>
 #include <opencv2/videoio.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 // Decodes all audio channels of the media file given on the command line using
 // the MSMF backend (video stream disabled), prints the audio properties and the
 // total number of retrieved samples (summed over all channels).
 // Returns 1 when no file is given, -1 when the file cannot be opened, 0 otherwise.
 int main(int argc, char** argv)
 {
    CommandLineParser parser(argc, argv, "{@audio||}");
    string file = parser.get<string>("@audio");
    if (file.empty())
    {
        return 1;
    }
    Mat frame;
    vector<vector<Mat>> audioData;
    VideoCapture cap;
    vector<int> params {    CAP_PROP_AUDIO_STREAM, 0,
                            CAP_PROP_VIDEO_STREAM, -1,
                            CAP_PROP_AUDIO_DATA_DEPTH, CV_16S   };
    cap.open(file, CAP_MSMF, params);
    if (!cap.isOpened())
    {
        cerr << "ERROR! Can't to open file: " + file << endl;
        return -1;
    }
    const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
    const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
    cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
    cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
    int numberOfSamples = 0;
    audioData.resize(numberOfChannels);
    while (cap.grab())
    {
        for (int nCh = 0; nCh < numberOfChannels; nCh++)
        {
            cap.retrieve(frame, audioBaseIndex+nCh);
            // Copying a Mat only copies its header, so storing `frame` itself
            // would keep references to a buffer that later retrieve() calls may
            // overwrite. Store a deep copy of every chunk instead.
            audioData[nCh].push_back(frame.clone());
            numberOfSamples+=frame.cols;
        }
    }
    cout << "Number of samples: " << numberOfSamples << endl;
    return 0;
 }
--- a/samples/cpp/videocapture_audio_combination.cpp
+++ b/samples/cpp/videocapture_audio_combination.cpp
@ -0,0 +1,69 @@
 #include <opencv2/core.hpp>
 #include <opencv2/videoio.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 // Plays the video stream of the media file given on the command line while
 // collecting its audio channels (MSMF backend), then prints the number of
 // retrieved audio samples (summed over all channels) and of shown video frames.
 // Returns 1 when no file is given, -1 when the file cannot be opened, 0 otherwise.
 int main(int argc, char** argv)
 {
    cv::CommandLineParser parser(argc, argv, "{@audio||}");
    string file = parser.get<string>("@audio");
    if (file.empty())
    {
        return 1;
    }
    Mat videoFrame;
    Mat audioFrame;
    vector<vector<Mat>> audioData;
    VideoCapture cap;
    vector<int> params {    CAP_PROP_AUDIO_STREAM, 0,
                            CAP_PROP_VIDEO_STREAM, 0,
                            CAP_PROP_AUDIO_DATA_DEPTH, CV_16S   };
    cap.open(file, CAP_MSMF, params);
    if (!cap.isOpened())
    {
        cerr << "ERROR! Can't to open file: " + file << endl;
        return -1;
    }
    const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
    const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
    cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
    cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS) << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
    int numberOfSamples = 0;
    int numberOfFrames = 0;
    audioData.resize(numberOfChannels);
    while (cap.grab())
    {
        cap.retrieve(videoFrame);
        for (int nCh = 0; nCh < numberOfChannels; nCh++)
        {
            cap.retrieve(audioFrame, audioBaseIndex+nCh);
            // Copying a Mat only copies its header, so storing `audioFrame`
            // itself would keep references to a buffer that later retrieve()
            // calls may overwrite. Store a deep copy of every chunk instead.
            if (!audioFrame.empty())
                audioData[nCh].push_back(audioFrame.clone());
            numberOfSamples+=audioFrame.cols;
        }
        if (!videoFrame.empty())
        {
            numberOfFrames++;
            imshow("Live", videoFrame);
            // Any key press stops the playback.
            if (waitKey(5) >= 0)
                break;
        }
    }
    cout << "Number of audio samples: " << numberOfSamples << endl
         << "Number of video frames: " << numberOfFrames << endl;
    return 0;
 }
--- a/samples/cpp/videocapture_microphone.cpp
+++ b/samples/cpp/videocapture_microphone.cpp
@ -0,0 +1,57 @@
 #include <opencv2/core.hpp>
 #include <opencv2/videoio.hpp>
 #include <opencv2/highgui.hpp>
 #include <iostream>
 using namespace cv;
 using namespace std;
 int main(int, char**)
 {
    Mat frame;
    vector<Mat> audioData;
    VideoCapture cap;
    vector<int> params {    CAP_PROP_AUDIO_STREAM, 0,
                            CAP_PROP_VIDEO_STREAM, -1   };
    cap.open(0, CAP_MSMF, params);
    if (!cap.isOpened())
    {
        cerr << "ERROR! Can't to open microphone" << endl;
        return -1;
    }
    const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX);
    const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS);
    cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl;
    cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl;
    cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl;
    const double cvTickFreq = getTickFrequency();
    int64 sysTimeCurr = getTickCount();
    int64 sysTimePrev = sysTimeCurr;
    while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10)
    {
        if (cap.grab())
        {
            for (int nCh = 0; nCh < numberOfChannels; nCh++)
            {
                cap.retrieve(frame, audioBaseIndex+nCh);
                audioData.push_back(frame);
                sysTimeCurr = getTickCount();
            }
        }
        else
        {
            cerr << "Grab error" << endl;
            break;
        }
    }
    int numberOfSamles = 0;
    for (auto item : audioData)
        numberOfSamles+=item.cols;
    cout << "Number of samples: " << numberOfSamles << endl;
    return 0;
 }