diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp index 3276d0d5e4..4b5bc135bc 100644 --- a/modules/videoio/include/opencv2/videoio.hpp +++ b/modules/videoio/include/opencv2/videoio.hpp @@ -189,6 +189,17 @@ enum VideoCaptureProperties { CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg back-end only) CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg back-end only) CAP_PROP_STREAM_OPEN_TIME_USEC =55, // #include #include +#include #include #include #include @@ -69,7 +70,6 @@ static void init_MFCreateDXGIDeviceManager() #endif #include - #include #include // QISearch @@ -108,6 +108,13 @@ public: { } + void swap(_In_ ComPtr& lp) + { + ComPtr tmp(p); + p = lp.p; + lp.p = tmp.p; + tmp = NULL; + } T** operator&() { CV_Assert(p == NULL); @@ -155,6 +162,7 @@ template inline T absDiff(T a, T b) { return a >= b ? 
a - b : b - a // Structure for collecting info about types of video which are supported by current video device struct MediaType { + //video param UINT32 width; UINT32 height; INT32 stride; // stride is negative if image is bottom-up @@ -165,9 +173,17 @@ struct MediaType UINT32 aspectRatioDenom; UINT32 sampleSize; UINT32 interlaceMode; + //audio param + UINT32 bit_per_sample; + UINT32 nChannels; + UINT32 nAvgBytesPerSec; + UINT32 nSamplesPerSec; + GUID majorType; // video or audio GUID subType; // fourCC + _ComPtr Type; MediaType(IMFMediaType *pType = 0) : + Type(pType), width(0), height(0), stride(0), isFixedSize(true), @@ -175,23 +191,38 @@ struct MediaType aspectRatioNum(1), aspectRatioDenom(1), sampleSize(0), interlaceMode(0), - majorType(MFMediaType_Video), + bit_per_sample(0), + nChannels(0), + nAvgBytesPerSec(0), + nSamplesPerSec(0), + majorType({ 0 }),//MFMediaType_Video subType({ 0 }) { if (pType) { - MFGetAttributeSize(pType, MF_MT_FRAME_SIZE, &width, &height); - pType->GetUINT32(MF_MT_DEFAULT_STRIDE, (UINT32*)&stride); // value is stored as UINT32 but should be casted to INT3) - pType->GetUINT32(MF_MT_FIXED_SIZE_SAMPLES, &isFixedSize); - MFGetAttributeRatio(pType, MF_MT_FRAME_RATE, &frameRateNum, &frameRateDenom); - MFGetAttributeRatio(pType, MF_MT_PIXEL_ASPECT_RATIO, &aspectRatioNum, &aspectRatioDenom); - pType->GetUINT32(MF_MT_SAMPLE_SIZE, &sampleSize); - pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); pType->GetGUID(MF_MT_MAJOR_TYPE, &majorType); pType->GetGUID(MF_MT_SUBTYPE, &subType); + if (majorType == MFMediaType_Audio) + { + pType->GetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, &bit_per_sample); + pType->GetUINT32(MF_MT_AUDIO_NUM_CHANNELS, &nChannels); + pType->GetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, &nAvgBytesPerSec); + pType->GetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, &nSamplesPerSec); + } + else if (majorType == MFMediaType_Video) + { + MFGetAttributeSize(pType, MF_MT_FRAME_SIZE, &width, &height); + 
pType->GetUINT32(MF_MT_DEFAULT_STRIDE, (UINT32*)&stride); // value is stored as UINT32 but should be casted to INT3) + pType->GetUINT32(MF_MT_FIXED_SIZE_SAMPLES, &isFixedSize); + MFGetAttributeRatio(pType, MF_MT_FRAME_RATE, &frameRateNum, &frameRateDenom); + MFGetAttributeRatio(pType, MF_MT_PIXEL_ASPECT_RATIO, &aspectRatioNum, &aspectRatioDenom); + pType->GetUINT32(MF_MT_SAMPLE_SIZE, &sampleSize); + pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); + pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); + } } } - static MediaType createDefault() + static MediaType createDefault_Video() { MediaType res; res.width = 640; @@ -199,11 +230,24 @@ struct MediaType res.setFramerate(30.0); return res; } - inline bool isEmpty() const + static MediaType createDefault_Audio() + { + MediaType res; + res.majorType = MFMediaType_Audio; + res.subType = MFAudioFormat_PCM; + res.bit_per_sample = 16; + res.nChannels = 1; + res.nSamplesPerSec = 44100; + return res; + } + inline bool isEmpty(bool flag = false) const { - return width == 0 && height == 0; + if (!flag) + return width == 0 && height == 0; + else + return nChannels == 0; } - _ComPtr createMediaType() const + _ComPtr createMediaType_Video() const { _ComPtr res; MFCreateMediaType(&res); @@ -225,6 +269,22 @@ struct MediaType res->SetGUID(MF_MT_SUBTYPE, subType); return res; } + _ComPtr createMediaType_Audio() const + { + _ComPtr res; + MFCreateMediaType(&res); + if (majorType != GUID()) + res->SetGUID(MF_MT_MAJOR_TYPE, majorType); + if (subType != GUID()) + res->SetGUID(MF_MT_SUBTYPE, subType); + if (bit_per_sample != 0) + res->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, bit_per_sample); + if (nChannels != 0) + res->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, nChannels); + if (nSamplesPerSec != 0) + res->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, nSamplesPerSec); + return res; + } void setFramerate(double fps) { frameRateNum = (UINT32)cvRound(fps * 1000.0); @@ -246,7 +306,7 @@ struct MediaType return wdiff + hdiff; } // check if 
'this' is better than 'other' comparing to reference - bool isBetterThan(const MediaType& other, const MediaType& ref) const + bool VideoIsBetterThan(const MediaType& other, const MediaType& ref) const { const unsigned long thisDiff = resolutionDiff(ref); const unsigned long otherDiff = other.resolutionDiff(ref); @@ -268,6 +328,24 @@ struct MediaType } return false; } + bool AudioIsBetterThan(const MediaType& other, const MediaType& ref) const + { + double thisDiff = absDiff(nChannels, ref.nChannels); + double otherDiff = absDiff(other.nChannels, ref.nChannels); + if (otherDiff < thisDiff) + { + thisDiff = absDiff(bit_per_sample, ref.bit_per_sample); + otherDiff = absDiff(other.bit_per_sample, ref.bit_per_sample); + if (otherDiff < thisDiff) + { + thisDiff = absDiff(nSamplesPerSec, ref.nSamplesPerSec); + otherDiff = absDiff(other.nSamplesPerSec, ref.nSamplesPerSec); + if (otherDiff < thisDiff) + return true; + } + } + return false; + } }; void printFormat(std::ostream& out, const GUID& fmt) @@ -405,7 +483,7 @@ public: return S_OK; } - HRESULT Wait(DWORD dwMilliseconds, _ComPtr& videoSample, BOOL& pbEOS) + HRESULT Wait(DWORD dwMilliseconds, _ComPtr& mediaSample, BOOL& pbEOS) { pbEOS = FALSE; @@ -423,14 +501,14 @@ public: if (!pbEOS) { cv::AutoLock lock(m_mutex); - videoSample = m_lastSample; - CV_Assert(videoSample); + mediaSample = m_lastSample; + CV_Assert(mediaSample); m_lastSample.Release(); ResetEvent(m_hEvent); // event is auto-reset, but we need this forced reset due time gap between wait() and mutex hold. } - return m_hrStatus; } + private: // Destructor is private. Caller should call Release. 
virtual ~SourceReaderCB() @@ -496,22 +574,67 @@ public: } } } + void countNumberOfAudioStreams(DWORD &numberOfAudioStreams) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + for (; i != formats.end(); ++i) + { + if(i->second.majorType == MFMediaType_Audio) + { + if(best.second.isEmpty() || i->first.stream != best.first.stream) + { + numberOfAudioStreams++; + best = *i; + } + } + } + } std::pair findBestVideoFormat(const MediaType& newType) { std::pair best; std::map::const_iterator i = formats.begin(); for (; i != formats.end(); ++i) { - if (i->second.majorType != MFMediaType_Video) - continue; - if (newType.isEmpty()) // file input - choose first returned media type + if (i->second.majorType == MFMediaType_Video) { - best = *i; - break; + if (best.second.isEmpty() || i->second.VideoIsBetterThan(best.second, newType)) + { + best = *i; + } } - if (best.second.isEmpty() || i->second.isBetterThan(best.second, newType)) + } + return best; + } + std::pair findBestAudioFormat(const MediaType& newType) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + best = *i; + for (; i != formats.end(); ++i) + { + if (i->second.majorType == MFMediaType_Audio) + { + if ( i->second.AudioIsBetterThan(best.second, newType)) + { + best = *i; + } + } + } + return best; + } + std::pair findAudioFormatByStream(const DWORD StreamIndex) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + for (; i != formats.end(); ++i) + { + if (i->second.majorType == MFMediaType_Audio) { - best = *i; + if ((*i).first.stream == StreamIndex) + { + best = *i; + } } } return best; @@ -586,21 +709,30 @@ public: virtual void close(); virtual double getProperty(int) const CV_OVERRIDE; virtual bool setProperty(int, double) CV_OVERRIDE; + bool grabAudioFrame(); + bool grabVideoFrame(); virtual bool grabFrame() CV_OVERRIDE; + bool retrieveAudioFrame(int, OutputArray); + bool retrieveVideoFrame(OutputArray); virtual bool retrieveFrame(int, cv::OutputArray) 
CV_OVERRIDE; virtual bool isOpened() const CV_OVERRIDE { return isOpen; } virtual int getCaptureDomain() CV_OVERRIDE { return CV_CAP_MSMF; } protected: - bool configureOutput(MediaType newType, cv::uint32_t outFormat); + bool configureOutput(); + bool configureAudioOutput(MediaType newType); + bool configureVideoOutput(MediaType newType, cv::uint32_t outFormat); bool setTime(double time, bool rough); + bool setTime(int numberFrame); bool configureHW(bool enable); + bool configureStreams(const cv::VideoCaptureParameters&); + bool setAudioProperties(const cv::VideoCaptureParameters&); template bool readComplexPropery(long prop, long& val) const; template bool writeComplexProperty(long prop, double val, long flags); _ComPtr getDefaultSourceConfig(UINT32 num = 10); - bool initStream(DWORD streamID, const MediaType& mt); + bool initStream(DWORD streamID, const MediaType mt); bool openFinalize_(const VideoCaptureParameters* params); @@ -615,17 +747,49 @@ protected: _ComPtr D3DMgr; #endif _ComPtr videoFileSource; - _ComPtr videoSample; _ComPtr readCallback; // non-NULL for "live" streams (camera capture) - DWORD dwStreamIndex; + std::vector dwStreamIndices; + std::vector<_ComPtr> audioSamples; + _ComPtr impendingVideoSample; + _ComPtr usedVideoSample; + DWORD dwVideoStreamIndex; + DWORD dwAudioStreamIndex; MediaType nativeFormat; - MediaType captureFormat; - int outputFormat; + MediaType captureVideoFormat; + MediaType captureAudioFormat; + bool device_status; //on or off + int videoStream; // look at CAP_PROP_VIDEO_STREAM + int audioStream; // look at CAP_PROP_AUDIO_STREAM + bool vEOS; + bool aEOS; + unsigned int audioBaseIndex; + int outputVideoFormat; + int outputAudioFormat; bool convertFormat; MFTIME duration; LONGLONG frameStep; - LONGLONG sampleTime; + LONGLONG nFrame; + LONGLONG impendingVideoSampleTime; + LONGLONG usedVideoSampleTime; + LONGLONG videoStartOffset; + LONGLONG videoSampleDuration; + LONGLONG requiredAudioTime; + LONGLONG audioSampleTime; + LONGLONG 
audioStartOffset; + LONGLONG audioSampleDuration; + LONGLONG audioTime; + LONGLONG chunkLengthOfBytes; + LONGLONG givenAudioTime; + LONGLONG numberOfAdditionalAudioBytes; // the number of additional bytes required to align the audio chunk + double bufferedAudioDuration; + LONGLONG audioSamplePos; + DWORD numberOfAudioStreams; + Mat audioFrame; + std::deque bufferAudioData; bool isOpen; + bool grabIsDone; + bool syncLastFrame; + bool lastFrame; }; CvCapture_MSMF::CvCapture_MSMF(): @@ -640,15 +804,42 @@ CvCapture_MSMF::CvCapture_MSMF(): D3DMgr(NULL), #endif videoFileSource(NULL), - videoSample(NULL), readCallback(NULL), - dwStreamIndex(0), - outputFormat(CV_CAP_MODE_BGR), + impendingVideoSample(NULL), + usedVideoSample(NULL), + dwVideoStreamIndex(0), + dwAudioStreamIndex(0), + device_status(false), + videoStream(0), + audioStream(-1), + vEOS(false), + aEOS(false), + audioBaseIndex(1), + outputVideoFormat(CV_CAP_MODE_BGR), + outputAudioFormat(CV_16S), convertFormat(true), duration(0), frameStep(0), - sampleTime(0), - isOpen(false) + nFrame(0), + impendingVideoSampleTime(0), + usedVideoSampleTime(0), + videoStartOffset(-1), + videoSampleDuration(0), + requiredAudioTime(0), + audioSampleTime(0), + audioStartOffset(-1), + audioSampleDuration(0), + audioTime(0), + chunkLengthOfBytes(0), + givenAudioTime(0), + numberOfAdditionalAudioBytes(0), + bufferedAudioDuration(0), + audioSamplePos(0), + numberOfAudioStreams(0), + isOpen(false), + grabIsDone(false), + syncLastFrame(true), + lastFrame(false) { } @@ -663,29 +854,37 @@ void CvCapture_MSMF::close() if (isOpen) { isOpen = false; - videoSample.Release(); + usedVideoSample.Release(); + for (auto item : audioSamples) + item.Release(); videoFileSource.Release(); + device_status = false; camid = -1; filename.clear(); } readCallback.Release(); } -bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType& mt) +bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType mt) { CV_LOG_DEBUG(NULL, "Init stream " << streamID 
<< " with MediaType " << mt); - _ComPtr mediaTypeOut = mt.createMediaType(); - if (FAILED(videoFileSource->SetStreamSelection((DWORD)MF_SOURCE_READER_ALL_STREAMS, false))) + _ComPtr mediaTypesOut; + if (mt.majorType == MFMediaType_Audio) { - CV_LOG_WARNING(NULL, "Failed to reset streams"); - return false; + captureAudioFormat = mt; + mediaTypesOut = mt.createMediaType_Audio(); + } + if (mt.majorType == MFMediaType_Video) + { + captureVideoFormat = mt; + mediaTypesOut = mt.createMediaType_Video(); } if (FAILED(videoFileSource->SetStreamSelection(streamID, true))) { CV_LOG_WARNING(NULL, "Failed to select stream " << streamID); return false; } - HRESULT hr = videoFileSource->SetCurrentMediaType(streamID, NULL, mediaTypeOut.Get()); + HRESULT hr = videoFileSource->SetCurrentMediaType(streamID, NULL, mediaTypesOut.Get()); if (hr == MF_E_TOPO_CODEC_NOT_FOUND) { CV_LOG_WARNING(NULL, "Failed to set mediaType (stream " << streamID << ", " << mt << "(codec not found)"); @@ -701,7 +900,7 @@ bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType& mt) CV_LOG_WARNING(NULL, "Failed to set mediaType (stream " << streamID << ", " << mt << "(HRESULT " << hr << ")"); return false; } - captureFormat = mt; + return true; } @@ -826,7 +1025,52 @@ bool CvCapture_MSMF::configureHW(const VideoCaptureParameters& params) return configureHW(va_type == VIDEO_ACCELERATION_D3D11 || va_type == VIDEO_ACCELERATION_ANY); } -bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) +bool CvCapture_MSMF::configureAudioOutput(MediaType newType) +{ + FormatStorage formats; + formats.read(videoFileSource.Get()); + std::pair bestMatch; + formats.countNumberOfAudioStreams(numberOfAudioStreams); + if (device_status) + bestMatch = formats.findBestAudioFormat(newType); + else + bestMatch = formats.findAudioFormatByStream(audioStream); + if (bestMatch.second.isEmpty(true)) + { + CV_LOG_DEBUG(NULL, "Can not find audio stream with requested parameters"); + return false; + } + 
dwAudioStreamIndex = bestMatch.first.stream; + dwStreamIndices.push_back(dwAudioStreamIndex); + MediaType newFormat = bestMatch.second; + + newFormat.majorType = MFMediaType_Audio; + newFormat.nSamplesPerSec = 44100; + switch (outputAudioFormat) + { + case CV_8S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 8; + break; + case CV_16S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 16; + break; + case CV_32S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 32; break; + case CV_32F: + newFormat.subType = MFAudioFormat_Float; + newFormat.bit_per_sample = 32; + break; + default: + break; + } + + return initStream(dwAudioStreamIndex, newFormat); +} + +bool CvCapture_MSMF::configureVideoOutput(MediaType newType, cv::uint32_t outFormat) { FormatStorage formats; formats.read(videoFileSource.Get()); @@ -836,9 +1080,11 @@ bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) CV_LOG_DEBUG(NULL, "Can not find video stream with requested parameters"); return false; } - dwStreamIndex = bestMatch.first.stream; + dwVideoStreamIndex = bestMatch.first.stream; + dwStreamIndices.push_back(dwVideoStreamIndex); nativeFormat = bestMatch.second; MediaType newFormat = nativeFormat; + if (convertFormat) { switch (outFormat) @@ -869,8 +1115,25 @@ bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) } // we select native format first and then our requested format (related issue #12822) if (!newType.isEmpty()) // camera input - initStream(dwStreamIndex, nativeFormat); - return initStream(dwStreamIndex, newFormat); + { + initStream(dwVideoStreamIndex, nativeFormat); + } + return initStream(dwVideoStreamIndex, newFormat); +} + +bool CvCapture_MSMF::configureOutput() +{ + if (FAILED(videoFileSource->SetStreamSelection((DWORD)MF_SOURCE_READER_ALL_STREAMS, false))) + { + CV_LOG_WARNING(NULL, "Failed to reset streams"); + return false; + } + bool tmp = true; + if (videoStream != -1) + 
tmp = (!device_status)? configureVideoOutput(MediaType(), outputVideoFormat) : configureVideoOutput(MediaType::createDefault_Video(), outputVideoFormat); + if (audioStream != -1) + tmp &= (!device_status)? configureAudioOutput(MediaType()) : configureAudioOutput(MediaType::createDefault_Audio()); + return tmp; } bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) @@ -882,10 +1145,19 @@ bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) if (params) { configureHW(*params); + configureStreams(*params); + } + if (videoStream != -1 && audioStream != -1 || videoStream == -1 && audioStream == -1) + { + CV_LOG_DEBUG(NULL, "Only one of the properties CAP_PROP_AUDIO_STREAM " << audioStream << " and " << CAP_PROP_VIDEO_STREAM << " must be different from -1"); + return false; } - DeviceList devices; - UINT32 count = devices.read(); + UINT32 count = 0; + if (audioStream != -1) + count = devices.read(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID); + if (videoStream != -1) + count = devices.read(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID); if (count == 0 || static_cast(index) > count) { CV_LOG_DEBUG(NULL, "Device " << index << " not found (total " << count << " devices)"); @@ -902,14 +1174,14 @@ bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) } isOpen = true; + device_status = true; camid = index; readCallback = cb; duration = 0; - if (configureOutput(MediaType::createDefault(), outputFormat)) + if (configureOutput()) { - frameStep = captureFormat.getFrameStep(); + frameStep = captureVideoFormat.getFrameStep(); } - if (isOpen && !openFinalize_(params)) { close(); @@ -928,8 +1200,9 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar if (params) { configureHW(*params); + configureStreams(*params); + setAudioProperties(*params); } - // Set source reader parameters _ComPtr attr = getDefaultSourceConfig(); cv::AutoBuffer unicodeFileName(_filename.length() + 1); @@ 
-937,11 +1210,11 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar if (SUCCEEDED(MFCreateSourceReaderFromURL(unicodeFileName.data(), attr.Get(), &videoFileSource))) { isOpen = true; - sampleTime = 0; - if (configureOutput(MediaType(), outputFormat)) + usedVideoSampleTime = 0; + if (configureOutput()) { - frameStep = captureFormat.getFrameStep(); filename = _filename; + frameStep = captureVideoFormat.getFrameStep(); PROPVARIANT var; HRESULT hr; if (SUCCEEDED(hr = videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_PD_DURATION, &var)) && @@ -954,13 +1227,18 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar duration = 0; } } - if (isOpen && !openFinalize_(params)) { close(); return false; } - + if (isOpen) + if (audioStream != -1 && videoStream != -1) + { + isOpen = grabFrame(); + if (isOpen) + grabIsDone = true; + } return isOpen; } @@ -997,71 +1275,212 @@ bool CvCapture_MSMF::openFinalize_(const VideoCaptureParameters* params) return true; } -bool CvCapture_MSMF::grabFrame() +bool CvCapture_MSMF::configureStreams(const cv::VideoCaptureParameters& params) { - CV_TRACE_FUNCTION(); - if (readCallback) // async "live" capture mode + if (params.has(CAP_PROP_VIDEO_STREAM)) { - HRESULT hr = 0; - SourceReaderCB* reader = ((SourceReaderCB*)readCallback.Get()); - if (!reader->m_reader) + double value = params.get(CAP_PROP_VIDEO_STREAM); + if (value == -1 || value == 0) + videoStream = static_cast(value); + else { - // Initiate capturing with async callback - reader->m_reader = videoFileSource.Get(); - reader->m_dwStreamIndex = dwStreamIndex; - if (FAILED(hr = videoFileSource->ReadSample(dwStreamIndex, 0, NULL, NULL, NULL, NULL))) - { - CV_LOG_ERROR(NULL, "videoio(MSMF): can't grab frame - initial async ReadSample() call failed: " << hr); - reader->m_reader = NULL; - return false; - } + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_VIDEO_STREAM parameter value is invalid/unsupported: 
" << value); + return false; } - BOOL bEOS = false; - if (FAILED(hr = reader->Wait(10000, videoSample, bEOS))) // 10 sec + } + if (params.has(CAP_PROP_AUDIO_STREAM)) + { + double value = params.get(CAP_PROP_AUDIO_STREAM); + if (value == -1 || value > -1) + audioStream = static_cast(value); + else { - CV_LOG_WARNING(NULL, "videoio(MSMF): can't grab frame. Error: " << hr); + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_AUDIO_STREAM parameter value is invalid/unsupported: " << value); return false; } - if (bEOS) + } + return true; +} +bool CvCapture_MSMF::setAudioProperties(const cv::VideoCaptureParameters& params) +{ + if (params.has(CAP_PROP_AUDIO_DATA_DEPTH)) + { + int value = static_cast(params.get(CAP_PROP_AUDIO_DATA_DEPTH)); + if (value != CV_8S && value != CV_16S && value != CV_32S && value != CV_32F) { - CV_LOG_WARNING(NULL, "videoio(MSMF): EOS signal. Capture stream is lost"); + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_AUDIO_DATA_DEPTH parameter value is invalid/unsupported: " << value); return false; } - sampleTime = reader->m_lastSampleTimestamp; - return true; + else + { + outputAudioFormat = value; + } } - else if (isOpen) + if (params.has(CAP_PROP_AUDIO_SYNCHRONIZE)) + { + int value = static_cast(params.get(CAP_PROP_AUDIO_SYNCHRONIZE)); + syncLastFrame = (value != 0) ? true : false; + } + return true; +} + +bool CvCapture_MSMF::grabVideoFrame() +{ + DWORD streamIndex, flags; + HRESULT hr; + usedVideoSample.Release(); + + bool returnFlag = false; + bool stopFlag = false; + if (audioStream != -1) { - DWORD streamIndex, flags; - videoSample.Release(); - HRESULT hr; - for(;;) + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + } + while (!stopFlag) + { + for (;;) { CV_TRACE_REGION("ReadSample"); if (!SUCCEEDED(hr = videoFileSource->ReadSample( - dwStreamIndex, // Stream index. + dwVideoStreamIndex, // Stream index. 0, // Flags. &streamIndex, // Receives the actual stream index. 
&flags, // Receives status flags. - &sampleTime, // Receives the time stamp. - &videoSample // Receives the sample or NULL. + &impendingVideoSampleTime, // Receives the time stamp. + &impendingVideoSample // Receives the sample or NULL. ))) break; - if (streamIndex != dwStreamIndex) + if (streamIndex != dwVideoStreamIndex) break; if (flags & (MF_SOURCE_READERF_ERROR | MF_SOURCE_READERF_ALLEFFECTSREMOVED | MF_SOURCE_READERF_ENDOFSTREAM)) break; - if (videoSample) + if (impendingVideoSample) break; if (flags & MF_SOURCE_READERF_STREAMTICK) { CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream tick detected. Retrying to grab the frame"); } } + if (SUCCEEDED(hr)) + { + if (streamIndex != dwVideoStreamIndex) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Wrong stream read. Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ERROR) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream reading error. Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ALLEFFECTSREMOVED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream decoding error. 
Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ENDOFSTREAM) + { + vEOS = true; + lastFrame = true; + stopFlag = true; + if (audioStream == -1) + returnFlag = false; + else if (usedVideoSample) + returnFlag = true; + CV_LOG_DEBUG(NULL, "videoio(MSMF): End of video stream detected"); + } + else + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): got video frame with timestamp=" << impendingVideoSampleTime); + if (audioStream != -1) + { + if (!usedVideoSample) + { + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + videoStartOffset = usedVideoSampleTime; + } + else + { + stopFlag = true; + } + if (impendingVideoSample) + { + nFrame++; + videoSampleDuration = impendingVideoSampleTime - usedVideoSampleTime; + requiredAudioTime = impendingVideoSampleTime - givenAudioTime; + givenAudioTime += requiredAudioTime; + } + } + else + { + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + stopFlag = true; + nFrame++; + } + if (flags & MF_SOURCE_READERF_NEWSTREAM) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): New stream detected"); + } + if (flags & MF_SOURCE_READERF_NATIVEMEDIATYPECHANGED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream native media type changed"); + } + if (flags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream current media type changed"); + } + returnFlag = true; + } + } + } + return returnFlag; +} +bool CvCapture_MSMF::grabAudioFrame() +{ + DWORD streamIndex, flags; + HRESULT hr; + _ComPtr audioSample = NULL; + audioSamples.clear(); + + bool returnFlag = false; + audioTime = 0; + int numberOfSamples = -1; + if (bufferedAudioDuration*1e7 > requiredAudioTime) + return true; + while ((!vEOS) ? 
audioTime <= requiredAudioTime : !aEOS) + { + if (audioStartOffset - usedVideoSampleTime > videoSampleDuration) + return true; + for (;;) + { + CV_TRACE_REGION("ReadSample"); + if (!SUCCEEDED(hr = videoFileSource->ReadSample( + dwAudioStreamIndex, // Stream index. + 0, // Flags. + &streamIndex, // Receives the actual stream index. + &flags, // Receives status flags. + &audioSampleTime, // Receives the time stamp. + &audioSample // Receives the sample or NULL. + ))) + break; + if (streamIndex != dwAudioStreamIndex) + break; + if (flags & (MF_SOURCE_READERF_ERROR | MF_SOURCE_READERF_ALLEFFECTSREMOVED | MF_SOURCE_READERF_ENDOFSTREAM)) + break; + if (audioSample) + break; + if (flags & MF_SOURCE_READERF_STREAMTICK) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream tick detected. Retrying to grab the frame"); + } + } if (SUCCEEDED(hr)) { - if (streamIndex != dwStreamIndex) + if (streamIndex != dwAudioStreamIndex) { CV_LOG_DEBUG(NULL, "videoio(MSMF): Wrong stream read. Abort capturing"); close(); @@ -1078,12 +1497,25 @@ bool CvCapture_MSMF::grabFrame() } else if (flags & MF_SOURCE_READERF_ENDOFSTREAM) { - sampleTime += frameStep; - CV_LOG_DEBUG(NULL, "videoio(MSMF): End of stream detected"); + aEOS = true; + if (videoStream != -1 && !vEOS) + returnFlag = true; + CV_LOG_DEBUG(NULL, "videoio(MSMF): End of audio stream detected"); + break; } else { - sampleTime += frameStep; + audioSamples.push_back(audioSample); + audioSample = NULL; + numberOfSamples++; + audioSamples[numberOfSamples]->GetSampleDuration(&audioSampleDuration); + CV_LOG_DEBUG(NULL, "videoio(MSMF): got audio frame with timestamp=" << audioSampleTime << " duration=" << audioSampleDuration); + audioTime += (LONGLONG)(audioSampleDuration + bufferedAudioDuration*1e7); + if (nFrame == 1 && audioStartOffset == -1) + { + audioStartOffset = audioSampleTime - audioSampleDuration; + requiredAudioTime -= audioStartOffset; + } if (flags & MF_SOURCE_READERF_NEWSTREAM) { CV_LOG_DEBUG(NULL, "videoio(MSMF): New stream 
detected"); @@ -1096,33 +1528,189 @@ bool CvCapture_MSMF::grabFrame() { CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream current media type changed"); } - return true; + returnFlag = true; + } + } + else + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): ReadSample() method is not succeeded"); + return false; + } + } + + if (!audioSamples.empty() || !bufferAudioData.empty() && aEOS) + { + _ComPtr buf = NULL; + std::vector audioDataInUse; + BYTE* ptr = NULL; + DWORD maxsize = 0, cursize = 0; + CV_TRACE_REGION("get_contiguous_buffer"); + for (auto item : audioSamples) + { + if (!SUCCEEDED(item->ConvertToContiguousBuffer(&buf))) + { + CV_TRACE_REGION("get_buffer"); + DWORD bcnt = 0; + if (!SUCCEEDED(item->GetBufferCount(&bcnt))) + break; + if (bcnt == 0) + break; + if (!SUCCEEDED(item->GetBufferByIndex(0, &buf))) + break; + } + if (!SUCCEEDED(buf->Lock(&ptr, &maxsize, &cursize))) + break; + size_t lastSize = bufferAudioData.size(); + bufferAudioData.resize(lastSize+cursize); + for (unsigned int i = 0; i < cursize; i++) + { + bufferAudioData[lastSize+i]=*(ptr+i); + } + CV_TRACE_REGION_NEXT("unlock"); + buf->Unlock(); + buf = NULL; + } + audioSamples.clear(); + + audioSamplePos += chunkLengthOfBytes/((captureAudioFormat.bit_per_sample/8)*captureAudioFormat.nChannels); + chunkLengthOfBytes = (videoStream != -1) ? 
(LONGLONG)((requiredAudioTime*captureAudioFormat.nSamplesPerSec*captureAudioFormat.nChannels*(captureAudioFormat.bit_per_sample)/8)/1e7) : cursize; + if ((videoStream != -1) && (chunkLengthOfBytes % ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels) != 0)) + { + if ( (double)audioSamplePos/captureAudioFormat.nSamplesPerSec + audioStartOffset * 1e-7 - usedVideoSampleTime * 1e-7 >= 0 ) + chunkLengthOfBytes -= numberOfAdditionalAudioBytes; + numberOfAdditionalAudioBytes = ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels) + - chunkLengthOfBytes % ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels); + chunkLengthOfBytes += numberOfAdditionalAudioBytes; + } + if (lastFrame && !syncLastFrame|| aEOS && !vEOS) + { + chunkLengthOfBytes = bufferAudioData.size(); + } + CV_Check((double)chunkLengthOfBytes, chunkLengthOfBytes >= INT_MIN && chunkLengthOfBytes <= INT_MAX, "MSMF: The chunkLengthOfBytes is out of the allowed range"); + copy(bufferAudioData.begin(), bufferAudioData.begin() + (int)chunkLengthOfBytes, std::back_inserter(audioDataInUse)); + bufferAudioData.erase(bufferAudioData.begin(), bufferAudioData.begin() + (int)chunkLengthOfBytes); + if (audioFrame.empty()) + { + switch (outputAudioFormat) + { + case CV_8S: + cv::Mat((int)chunkLengthOfBytes/(captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_8S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_16S: + cv::Mat((int)chunkLengthOfBytes/(2*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_16S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_32S: + cv::Mat((int)chunkLengthOfBytes/(4*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_32S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_32F: + cv::Mat((int)chunkLengthOfBytes/(4*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_32F, audioDataInUse.data()).copyTo(audioFrame); + break; + default: 
+ break; + } + } + audioDataInUse.clear(); + audioDataInUse.shrink_to_fit(); + } + + return returnFlag; +} + +bool CvCapture_MSMF::grabFrame() +{ + CV_TRACE_FUNCTION(); + + if (grabIsDone) + { + grabIsDone = false; + return true; + } + + audioFrame = Mat(); + if (readCallback) // async "live" capture mode + { + audioSamples.push_back(NULL); + HRESULT hr = 0; + SourceReaderCB* reader = ((SourceReaderCB*)readCallback.Get()); + DWORD dwStreamIndex = 0; + if (videoStream != -1) + dwStreamIndex = dwVideoStreamIndex; + if (audioStream != -1) + dwStreamIndex = dwAudioStreamIndex; + if (!reader->m_reader) + { + // Initiate capturing with async callback + reader->m_reader = videoFileSource.Get(); + reader->m_dwStreamIndex = dwStreamIndex; + if (FAILED(hr = videoFileSource->ReadSample(dwStreamIndex, 0, NULL, NULL, NULL, NULL))) + { + CV_LOG_ERROR(NULL, "videoio(MSMF): can't grab frame - initial async ReadSample() call failed: " << hr); + reader->m_reader = NULL; + return false; } } + BOOL bEOS = false; + if (FAILED(hr = reader->Wait( videoStream == -1 ? INFINITE : 10000, (videoStream != -1) ? usedVideoSample : audioSamples[0], bEOS))) // 10 sec + { + CV_LOG_WARNING(NULL, "videoio(MSMF): can't grab frame. Error: " << hr); + return false; + } + if (bEOS) + { + CV_LOG_WARNING(NULL, "videoio(MSMF): EOS signal. 
Capture stream is lost"); + return false; + } + if (videoStream != -1) + usedVideoSampleTime = reader->m_lastSampleTimestamp; + return true; + } + else if (isOpen) + { + if (vEOS) + return false; + + bool returnFlag = true; + + if (videoStream != -1) + { + if (!vEOS) + returnFlag &= grabVideoFrame(); + if (!returnFlag) + return false; + } + + if (audioStream != -1) + { + bufferedAudioDuration = (double)(bufferAudioData.size()/((captureAudioFormat.bit_per_sample/8)*captureAudioFormat.nChannels))/captureAudioFormat.nSamplesPerSec; + audioFrame.release(); + if (!aEOS) + returnFlag &= grabAudioFrame(); + } + + return returnFlag; } return false; } -bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) +bool CvCapture_MSMF::retrieveVideoFrame(cv::OutputArray frame) { CV_TRACE_FUNCTION(); do { - if (!videoSample) + if (!usedVideoSample) break; _ComPtr buf = NULL; - CV_TRACE_REGION("get_contiguous_buffer"); - if (!SUCCEEDED(videoSample->ConvertToContiguousBuffer(&buf))) + if (!SUCCEEDED(usedVideoSample->ConvertToContiguousBuffer(&buf))) { CV_TRACE_REGION("get_buffer"); DWORD bcnt = 0; - if (!SUCCEEDED(videoSample->GetBufferCount(&bcnt))) + if (!SUCCEEDED(usedVideoSample->GetBufferCount(&bcnt))) break; if (bcnt == 0) break; - if (!SUCCEEDED(videoSample->GetBufferByIndex(0, &buf))) + if (!SUCCEEDED(usedVideoSample->GetBufferByIndex(0, &buf))) break; } @@ -1158,27 +1746,27 @@ bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) break; if (convertFormat) { - if (lock2d || (unsigned int)cursize == captureFormat.sampleSize) + if (lock2d || (unsigned int)cursize == captureVideoFormat.sampleSize) { - switch (outputFormat) + switch (outputVideoFormat) { case CV_CAP_MODE_YUYV: - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC2, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC2, ptr, pitch).copyTo(frame); break; case CV_CAP_MODE_BGR: if (captureMode == MODE_HW) - cv::cvtColor(cv::Mat(captureFormat.height, 
captureFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); else - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC3, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC3, ptr, pitch).copyTo(frame); break; case CV_CAP_MODE_RGB: if (captureMode == MODE_HW) - cv::cvtColor(cv::Mat(captureFormat.height, captureFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); else - cv::cvtColor(cv::Mat(captureFormat.height, captureFormat.width, CV_8UC3, ptr, pitch), frame, cv::COLOR_BGR2RGB); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC3, ptr, pitch), frame, cv::COLOR_BGR2RGB); break; case CV_CAP_MODE_GRAY: - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC1, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC1, ptr, pitch).copyTo(frame); break; default: frame.release(); @@ -1204,30 +1792,142 @@ bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) return false; } +bool CvCapture_MSMF::retrieveAudioFrame(int index, cv::OutputArray frame) +{ + CV_TRACE_FUNCTION(); + if (audioStartOffset - usedVideoSampleTime > videoSampleDuration) + { + frame.release(); + return true; + } + do + { + if (audioFrame.empty()) + { + frame.release(); + if (aEOS) + return true; + } + cv::Mat data; + switch (outputAudioFormat) + { + case CV_8S: + data = cv::Mat(1, audioFrame.rows, CV_8S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_16S: + data = cv::Mat(1, audioFrame.rows, CV_16S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_32S: + data = 
cv::Mat(1, audioFrame.rows, CV_32S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_32F: + data = cv::Mat(1, audioFrame.rows, CV_32F); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + default: + frame.release(); + break; + } + if (!data.empty()) + data.copyTo(frame); + + return !frame.empty(); + } while (0); + + return false; +} + +bool CvCapture_MSMF::retrieveFrame(int index, cv::OutputArray frame) +{ + CV_TRACE_FUNCTION(); + if (index < 0) + return false; + if ((unsigned int)index < audioBaseIndex) + { + if (videoStream == -1) + { + frame.release(); + return false; + } + else + return retrieveVideoFrame(frame); + } + else + { + if (audioStream == -1) + { + frame.release(); + return false; + } + else + return retrieveAudioFrame(index, frame); + } +} + bool CvCapture_MSMF::setTime(double time, bool rough) { + if (videoStream == -1) + return false; + if (videoStream != -1 && audioStream != -1) + if (time != 0) + return false; PROPVARIANT var; if (SUCCEEDED(videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_SOURCE_READER_MEDIASOURCE_CHARACTERISTICS, &var)) && var.vt == VT_UI4 && var.ulVal & MFMEDIASOURCE_CAN_SEEK) { - videoSample.Release(); + usedVideoSample.Release(); bool useGrabbing = time > 0 && !rough && !(var.ulVal & MFMEDIASOURCE_HAS_SLOW_SEEK); PropVariantClear(&var); - sampleTime = (useGrabbing && time >= frameStep) ? (LONGLONG)floor(time + 0.5) - frameStep : (LONGLONG)floor(time + 0.5); + usedVideoSampleTime = (useGrabbing) ? 0 : (LONGLONG)floor(time + 0.5); + nFrame = (useGrabbing) ? 0 : usedVideoSampleTime/frameStep; + givenAudioTime = (useGrabbing) ? 
0 : nFrame*frameStep; var.vt = VT_I8; - var.hVal.QuadPart = sampleTime; + var.hVal.QuadPart = usedVideoSampleTime; bool resOK = SUCCEEDED(videoFileSource->SetCurrentPosition(GUID_NULL, var)); PropVariantClear(&var); if (resOK && useGrabbing) { LONGLONG timeborder = (LONGLONG)floor(time + 0.5) - frameStep / 2; - do { resOK = grabFrame(); videoSample.Release(); } while (resOK && sampleTime < timeborder); + do { resOK = grabFrame(); usedVideoSample.Release(); } while (resOK && usedVideoSampleTime < timeborder); } return resOK; } return false; } +bool CvCapture_MSMF::setTime(int numberFrame) +{ + if (videoStream == -1) + return false; + if (videoStream != -1 && audioStream != -1) + if (numberFrame != 0) + return false; + PROPVARIANT var; + if (SUCCEEDED(videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_SOURCE_READER_MEDIASOURCE_CHARACTERISTICS, &var)) && + var.vt == VT_UI4 && var.ulVal & MFMEDIASOURCE_CAN_SEEK) + { + usedVideoSample.Release(); + PropVariantClear(&var); + usedVideoSampleTime = 0; + nFrame = 0; + givenAudioTime = 0; + var.vt = VT_I8; + var.hVal.QuadPart = usedVideoSampleTime; + bool resOK = SUCCEEDED(videoFileSource->SetCurrentPosition(GUID_NULL, var)); + PropVariantClear(&var); + while (resOK && nFrame < numberFrame) { resOK = grabFrame(); usedVideoSample.Release(); }; + return resOK; + } + return false; +} + template bool CvCapture_MSMF::readComplexPropery(long prop, long & val) const { @@ -1269,29 +1969,31 @@ double CvCapture_MSMF::getProperty( int property_id ) const case CV_CAP_PROP_CONVERT_RGB: return convertFormat ? 
1 : 0; case CV_CAP_PROP_SAR_NUM: - return captureFormat.aspectRatioNum; + return captureVideoFormat.aspectRatioNum; case CV_CAP_PROP_SAR_DEN: - return captureFormat.aspectRatioDenom; + return captureVideoFormat.aspectRatioDenom; case CV_CAP_PROP_FRAME_WIDTH: - return captureFormat.width; + return captureVideoFormat.width; case CV_CAP_PROP_FRAME_HEIGHT: - return captureFormat.height; + return captureVideoFormat.height; case CV_CAP_PROP_FOURCC: - return captureFormat.subType.Data1; + return captureVideoFormat.subType.Data1; case CV_CAP_PROP_FPS: - return captureFormat.getFramerate(); + return captureVideoFormat.getFramerate(); case CV_CAP_PROP_FRAME_COUNT: if (duration != 0) - return floor(((double)duration / 1e7)* captureFormat.getFramerate() + 0.5); + return floor(((double)duration / 1e7)* captureVideoFormat.getFramerate() + 0.5); else break; case CV_CAP_PROP_POS_FRAMES: - return floor(((double)sampleTime / 1e7)* captureFormat.getFramerate() + 0.5); + return (double)nFrame; case CV_CAP_PROP_POS_MSEC: - return (double)sampleTime / 1e4; + return (double)usedVideoSampleTime / 1e4; + case CAP_PROP_AUDIO_POS: + return (double)audioSamplePos; case CV_CAP_PROP_POS_AVI_RATIO: if (duration != 0) - return (double)sampleTime / duration; + return (double)usedVideoSampleTime / duration; else break; case CV_CAP_PROP_BRIGHTNESS: @@ -1383,6 +2085,18 @@ double CvCapture_MSMF::getProperty( int property_id ) const case CV_CAP_PROP_ISO_SPEED: case CV_CAP_PROP_SETTINGS: case CV_CAP_PROP_BUFFERSIZE: + case CAP_PROP_AUDIO_BASE_INDEX: + return audioBaseIndex; + case CAP_PROP_AUDIO_TOTAL_STREAMS: + return numberOfAudioStreams; + case CAP_PROP_AUDIO_TOTAL_CHANNELS: + return captureAudioFormat.nChannels; + case CAP_PROP_AUDIO_SAMPLES_PER_SECOND: + return captureAudioFormat.nSamplesPerSec; + case CAP_PROP_AUDIO_DATA_DEPTH: + return outputAudioFormat; + case CAP_PROP_AUDIO_SHIFT_NSEC: + return (double)(audioStartOffset - videoStartOffset)*1e2; default: break; } @@ -1408,7 +2122,7 @@ bool 
CvCapture_MSMF::writeComplexProperty(long prop, double val, long flags) bool CvCapture_MSMF::setProperty( int property_id, double value ) { - MediaType newFormat = captureFormat; + MediaType newFormat = captureVideoFormat; if (isOpen) switch (property_id) { @@ -1423,45 +2137,45 @@ bool CvCapture_MSMF::setProperty( int property_id, double value ) return false; } case CV_CAP_PROP_FOURCC: - return configureOutput(newFormat, (int)cvRound(value)); + return configureVideoOutput(newFormat, (int)cvRound(value)); case CV_CAP_PROP_FORMAT: - return configureOutput(newFormat, (int)cvRound(value)); + return configureVideoOutput(newFormat, (int)cvRound(value)); case CV_CAP_PROP_CONVERT_RGB: convertFormat = (value != 0); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); case CV_CAP_PROP_SAR_NUM: if (value > 0) { newFormat.aspectRatioNum = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_SAR_DEN: if (value > 0) { newFormat.aspectRatioDenom = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_WIDTH: if (value >= 0) { newFormat.width = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_HEIGHT: if (value >= 0) { newFormat.height = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FPS: if (value >= 0) { newFormat.setFramerate(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_COUNT: @@ -1471,8 +2185,8 @@ bool CvCapture_MSMF::setProperty( int property_id, double value ) 
return setTime(duration * value, true); break; case CV_CAP_PROP_POS_FRAMES: - if (std::fabs(captureFormat.getFramerate()) > 0) - return setTime(value * 1e7 / captureFormat.getFramerate(), false); + if (std::fabs(captureVideoFormat.getFramerate()) > 0) + return setTime((int)value); break; case CV_CAP_PROP_POS_MSEC: return setTime(value * 1e4, false); diff --git a/modules/videoio/test/test_audio.cpp b/modules/videoio/test/test_audio.cpp new file mode 100644 index 0000000000..3ff51e2613 --- /dev/null +++ b/modules/videoio/test/test_audio.cpp @@ -0,0 +1,273 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +//file name, number of audio channels, epsilon, video type, weight, height, number of frame, number of audio samples, fps, psnr Threshold, backend +typedef std::tuple paramCombination; +//file name, number of audio channels, number of audio samples, epsilon, backend +typedef std::tuple param; + +class AudioBaseTest +{ +protected: + AudioBaseTest(){}; + void getValidAudioData() + { + const double step = 3.14/22050; + double value = 0; + validAudioData.resize(expectedNumAudioCh); + for (int nCh = 0; nCh < expectedNumAudioCh; nCh++) + { + value = 0; + for(unsigned int i = 0; i < numberOfSamples; i++) + { + if (i != 0 && i % 44100 == 0) + value = 0; + validAudioData[nCh].push_back(sin(value)); + value += step; + } + } + } + void checkAudio() + { + getValidAudioData(); + + ASSERT_EQ(expectedNumAudioCh, (int)audioData.size()); + for (unsigned int nCh = 0; nCh < audioData.size(); nCh++) + { + ASSERT_EQ(numberOfSamples, audioData[nCh].size()) << "nCh=" << nCh; + for (unsigned int i = 0; i < numberOfSamples; i++) + { + EXPECT_NEAR(validAudioData[nCh][i], audioData[nCh][i], epsilon) << "sample index=" << i << " nCh=" << nCh; + } + } + } +protected: + int 
expectedNumAudioCh; + unsigned int numberOfSamples; + double epsilon; + VideoCaptureAPIs backend; + std::string root; + std::string fileName; + + std::vector> validAudioData; + std::vector> audioData; + std::vector params; + + Mat audioFrame; + VideoCapture cap; +}; + +class AudioTestFixture : public AudioBaseTest, public testing::TestWithParam +{ +public: + AudioTestFixture() + { + fileName = get<0>(GetParam()); + expectedNumAudioCh = get<1>(GetParam()); + numberOfSamples = get<2>(GetParam()); + epsilon = get<3>(GetParam()); + backend = get<4>(GetParam()); + root = "audio/"; + params = { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + } + + void doTest() + { + ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params)); + const int audioBaseIndex = static_cast(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + ASSERT_EQ(expectedNumAudioCh, numberOfChannels); + double f = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex)); + ASSERT_EQ(CV_16SC1, audioFrame.type()) << audioData[nCh].size(); + for (int i = 0; i < audioFrame.cols; i++) + { + f = ((double) audioFrame.at(0,i)) / (double) 32768; + audioData[nCh].push_back(f); + } + } + } + else { break; } + } + ASSERT_FALSE(audioData.empty()); + + checkAudio(); + } +}; + +const param audioParams[] = +{ + param("test_audio.wav", 1, 132300, 0.0001, cv::CAP_MSMF), + param("test_mono_audio.mp3", 1, 133104, 0.12, cv::CAP_MSMF), + param("test_stereo_audio.mp3", 2, 133104, 0.12, cv::CAP_MSMF), + param("test_audio.mp4", 1, 133104, 0.15, cv::CAP_MSMF) +}; + +class Audio : public AudioTestFixture{}; + +TEST_P(Audio, audio) +{ + if (!videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend))) + throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " 
backend was not found"); + + doTest(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Audio, testing::ValuesIn(audioParams)); + +class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam +{ +public: + MediaTestFixture(): + videoType(get<3>(GetParam())), + height(get<4>(GetParam())), + width(get<5>(GetParam())), + numberOfFrames(get<6>(GetParam())), + fps(get<8>(GetParam())), + psnrThreshold(get<9>(GetParam())) + { + fileName = get<0>(GetParam()); + expectedNumAudioCh = get<1>(GetParam()); + numberOfSamples = get<7>(GetParam()); + epsilon = get<2>(GetParam()); + backend = get<10>(GetParam()); + root = "audio/"; + params = { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, 0, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + }; + + void doTest() + { + ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params)); + + const int audioBaseIndex = static_cast(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + ASSERT_EQ(expectedNumAudioCh, numberOfChannels); + + const int samplePerSecond = (int)cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND); + ASSERT_EQ(44100, samplePerSecond); + int samplesPerFrame = (int)(1./fps*samplePerSecond); + int audioSamplesTolerance = samplesPerFrame / 2; + + double audio0_timestamp = 0; + + Mat videoFrame; + Mat img(height, width, videoType); + audioData.resize(numberOfChannels); + for (int frame = 0; frame < numberOfFrames; frame++) + { + SCOPED_TRACE(cv::format("frame=%d", frame)); + + ASSERT_TRUE(cap.grab()); + + if (frame == 0) + { + double audio_shift = cap.get(CAP_PROP_AUDIO_SHIFT_NSEC); + double video0_timestamp = cap.get(CAP_PROP_POS_MSEC) * 1e-3; + audio0_timestamp = video0_timestamp + audio_shift * 1e-9; + std::cout << "video0 timestamp: " << video0_timestamp << " audio0 timestamp: " << audio0_timestamp << " (audio shift nanoseconds: " << audio_shift << " , seconds: " << audio_shift * 1e-9 << ")" << std::endl; + } + + ASSERT_TRUE(cap.retrieve(videoFrame)); + if 
(epsilon >= 0) + { + generateFrame(frame, numberOfFrames, img); + ASSERT_EQ(img.size, videoFrame.size); + double psnr = cvtest::PSNR(img, videoFrame); + EXPECT_GE(psnr, psnrThreshold); + } + + int audioFrameCols = 0; + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex+nCh)); + if (audioFrame.empty()) + continue; + ASSERT_EQ(CV_16SC1, audioFrame.type()); + if (nCh == 0) + audioFrameCols = audioFrame.cols; + else + ASSERT_EQ(audioFrameCols, audioFrame.cols) << "channel "<< nCh; + for (int i = 0; i < audioFrame.cols; i++) + { + double f = audioFrame.at(0,i) / 32768.0; + audioData[nCh].push_back(f); + } + } + + if (frame < 5 || frame >= numberOfFrames-5) + std::cout << "frame=" << frame << ": audioFrameSize=" << audioFrameCols << " videoTimestamp=" << cap.get(CAP_PROP_POS_MSEC) << " ms" << std::endl; + else if (frame == 6) + std::cout << "frame..." << std::endl; + + if (audioFrameCols == 0) + continue; + if (frame != 0 && frame != numberOfFrames-1) + { + // validate audio position + EXPECT_NEAR( + cap.get(CAP_PROP_AUDIO_POS) / samplePerSecond + audio0_timestamp, + cap.get(CAP_PROP_POS_MSEC) * 1e-3, + (1.0 / fps) * 0.3) + << "CAP_PROP_AUDIO_POS=" << cap.get(CAP_PROP_AUDIO_POS) << " CAP_PROP_POS_MSEC=" << cap.get(CAP_PROP_POS_MSEC); + } + if (frame != 0 && frame != numberOfFrames-1 && audioData[0].size() != (size_t)numberOfSamples) + { + // validate audio frame size + EXPECT_NEAR(audioFrame.cols, samplesPerFrame, audioSamplesTolerance); + } + } + ASSERT_FALSE(cap.grab()); + ASSERT_FALSE(audioData.empty()); + + std::cout << "Total audio samples=" << audioData[0].size() << std::endl; + + if (epsilon >= 0) + checkAudio(); + } +protected: + const int videoType; + const int height; + const int width; + const int numberOfFrames; + const int fps; + const double psnrThreshold; +}; + +const paramCombination mediaParams[] = +{ + paramCombination("test_audio.mp4", 1, 0.15, CV_8UC3, 240, 320, 90, 131819, 30, 30., cv::CAP_MSMF) 
+#if 0 + // https://filesamples.com/samples/video/mp4/sample_960x400_ocean_with_audio.mp4 + , paramCombination("sample_960x400_ocean_with_audio.mp4", 2, -1/*eplsilon*/, CV_8UC3, 400, 960, 1116, 2056588, 30, 30., cv::CAP_MSMF) +#endif +}; + +class Media : public MediaTestFixture{}; + +TEST_P(Media, audio) +{ + if (!videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend))) + throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " backend was not found"); + + doTest(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Media, testing::ValuesIn(mediaParams)); + +}} //namespace diff --git a/modules/videoio/test/test_microphone.cpp b/modules/videoio/test/test_microphone.cpp new file mode 100644 index 0000000000..c82a7c4eda --- /dev/null +++ b/modules/videoio/test/test_microphone.cpp @@ -0,0 +1,41 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// Usage: opencv_test_videoio --gtest_also_run_disabled_tests + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +TEST(DISABLED_videoio_micro, basic) +{ + int cursize = 0; + int validSize = 0; + Mat frame; + + std::vector params { CAP_PROP_AUDIO_STREAM, 0, CAP_PROP_VIDEO_STREAM, -1 }; + VideoCapture cap(0, cv::CAP_MSMF, params); + ASSERT_TRUE(cap.isOpened()); + + int samplesPerSecond = (int)cap.get(cv::CAP_PROP_AUDIO_SAMPLES_PER_SECOND); + const int audio_base_index = (int)cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX); + + const double cvTickFreq = cv::getTickFrequency(); + int64 sysTimePrev = cv::getTickCount(); + int64 sysTimeCurr = cv::getTickCount(); + + cout << "Audio would be captured for the next 10 seconds" << endl; + while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10) + { + if (cap.grab()) + { + ASSERT_TRUE(cap.retrieve(frame, audio_base_index)); + sysTimeCurr = cv::getTickCount(); + } + } + validSize = samplesPerSecond*(int)((sysTimeCurr-sysTimePrev)/cvTickFreq); + cursize = (int)cap.get(cv::CAP_PROP_AUDIO_POS); + ASSERT_LT(validSize - cursize, cursize*0.05); +} + +}} // namespace diff --git a/samples/cpp/videocapture_audio.cpp b/samples/cpp/videocapture_audio.cpp new file mode 100644 index 0000000000..c9f1ec94ce --- /dev/null +++ b/samples/cpp/videocapture_audio.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int argc, char** argv) +{ + CommandLineParser parser(argc, argv, "{@audio||}"); + string file = parser.get("@audio"); + + if (file.empty()) + { + return 1; + } + + Mat frame; + vector> audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open file: " + file << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + int numberOfSamples = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(frame, audioBaseIndex+nCh); + audioData[nCh].push_back(frame); + numberOfSamples+=frame.cols; + } + } + else { break; } + } + + cout << "Number of samples: " << numberOfSamples << endl; + + return 0; +} diff --git a/samples/cpp/videocapture_audio_combination.cpp b/samples/cpp/videocapture_audio_combination.cpp new file mode 100644 index 0000000000..7f0deecf16 --- /dev/null +++ b/samples/cpp/videocapture_audio_combination.cpp @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int argc, char** argv) +{ + cv::CommandLineParser parser(argc, argv, "{@audio||}"); + string file = parser.get("@audio"); + + if (file.empty()) + { + return 1; + } + + Mat videoFrame; + Mat audioFrame; + vector> audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, 0, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open file: " + file << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + int numberOfSamples = 0; + int numberOfFrames = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + cap.retrieve(videoFrame); + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(audioFrame, audioBaseIndex+nCh); + if (!audioFrame.empty()) + audioData[nCh].push_back(audioFrame); + numberOfSamples+=audioFrame.cols; + } + if (!videoFrame.empty()) + { + numberOfFrames++; + imshow("Live", videoFrame); + if (waitKey(5) >= 0) + break; + } + } else { break; } + } + + cout << "Number of audio samples: " << numberOfSamples << endl + << "Number of video frames: " << numberOfFrames << endl; + return 0; +} diff --git a/samples/cpp/videocapture_microphone.cpp b/samples/cpp/videocapture_microphone.cpp new file mode 100644 index 0000000000..0c69ec929d --- /dev/null +++ b/samples/cpp/videocapture_microphone.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int, char**) +{ + Mat frame; + vector audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1 }; + + cap.open(0, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open microphone" << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + const double cvTickFreq = getTickFrequency(); + int64 sysTimeCurr = getTickCount(); + int64 sysTimePrev = sysTimeCurr; + while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(frame, audioBaseIndex+nCh); + audioData.push_back(frame); + sysTimeCurr = getTickCount(); + } + } + else + { + cerr << "Grab error" << endl; + break; + } + } + int numberOfSamles = 0; + for (auto item : audioData) + numberOfSamles+=item.cols; + cout << "Number of samples: " << numberOfSamles << endl; + + return 0; +}