diff --git a/modules/videoio/include/opencv2/videoio.hpp b/modules/videoio/include/opencv2/videoio.hpp index 3276d0d5e4..4b5bc135bc 100644 --- a/modules/videoio/include/opencv2/videoio.hpp +++ b/modules/videoio/include/opencv2/videoio.hpp @@ -189,6 +189,17 @@ enum VideoCaptureProperties { CAP_PROP_OPEN_TIMEOUT_MSEC=53, //!< (**open-only**) timeout in milliseconds for opening a video capture (applicable for FFmpeg back-end only) CAP_PROP_READ_TIMEOUT_MSEC=54, //!< (**open-only**) timeout in milliseconds for reading from a video capture (applicable for FFmpeg back-end only) CAP_PROP_STREAM_OPEN_TIME_USEC =55, // #include #include +#include #include #include #include @@ -69,7 +70,6 @@ static void init_MFCreateDXGIDeviceManager() #endif #include - #include #include // QISearch @@ -108,6 +108,13 @@ public: { } + void swap(_In_ ComPtr& lp) + { + ComPtr tmp(p); + p = lp.p; + lp.p = tmp.p; + tmp = NULL; + } T** operator&() { CV_Assert(p == NULL); @@ -155,6 +162,7 @@ template inline T absDiff(T a, T b) { return a >= b ? 
a - b : b - a // Structure for collecting info about types of video which are supported by current video device struct MediaType { + //video param UINT32 width; UINT32 height; INT32 stride; // stride is negative if image is bottom-up @@ -165,9 +173,17 @@ struct MediaType UINT32 aspectRatioDenom; UINT32 sampleSize; UINT32 interlaceMode; + //audio param + UINT32 bit_per_sample; + UINT32 nChannels; + UINT32 nAvgBytesPerSec; + UINT32 nSamplesPerSec; + GUID majorType; // video or audio GUID subType; // fourCC + _ComPtr Type; MediaType(IMFMediaType *pType = 0) : + Type(pType), width(0), height(0), stride(0), isFixedSize(true), @@ -175,23 +191,38 @@ struct MediaType aspectRatioNum(1), aspectRatioDenom(1), sampleSize(0), interlaceMode(0), - majorType(MFMediaType_Video), + bit_per_sample(0), + nChannels(0), + nAvgBytesPerSec(0), + nSamplesPerSec(0), + majorType({ 0 }),//MFMediaType_Video subType({ 0 }) { if (pType) { - MFGetAttributeSize(pType, MF_MT_FRAME_SIZE, &width, &height); - pType->GetUINT32(MF_MT_DEFAULT_STRIDE, (UINT32*)&stride); // value is stored as UINT32 but should be casted to INT3) - pType->GetUINT32(MF_MT_FIXED_SIZE_SAMPLES, &isFixedSize); - MFGetAttributeRatio(pType, MF_MT_FRAME_RATE, &frameRateNum, &frameRateDenom); - MFGetAttributeRatio(pType, MF_MT_PIXEL_ASPECT_RATIO, &aspectRatioNum, &aspectRatioDenom); - pType->GetUINT32(MF_MT_SAMPLE_SIZE, &sampleSize); - pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); pType->GetGUID(MF_MT_MAJOR_TYPE, &majorType); pType->GetGUID(MF_MT_SUBTYPE, &subType); + if (majorType == MFMediaType_Audio) + { + pType->GetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, &bit_per_sample); + pType->GetUINT32(MF_MT_AUDIO_NUM_CHANNELS, &nChannels); + pType->GetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, &nAvgBytesPerSec); + pType->GetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, &nSamplesPerSec); + } + else if (majorType == MFMediaType_Video) + { + MFGetAttributeSize(pType, MF_MT_FRAME_SIZE, &width, &height); + 
pType->GetUINT32(MF_MT_DEFAULT_STRIDE, (UINT32*)&stride); // value is stored as UINT32 but should be casted to INT3) + pType->GetUINT32(MF_MT_FIXED_SIZE_SAMPLES, &isFixedSize); + MFGetAttributeRatio(pType, MF_MT_FRAME_RATE, &frameRateNum, &frameRateDenom); + MFGetAttributeRatio(pType, MF_MT_PIXEL_ASPECT_RATIO, &aspectRatioNum, &aspectRatioDenom); + pType->GetUINT32(MF_MT_SAMPLE_SIZE, &sampleSize); + pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); + pType->GetUINT32(MF_MT_INTERLACE_MODE, &interlaceMode); + } } } - static MediaType createDefault() + static MediaType createDefault_Video() { MediaType res; res.width = 640; @@ -199,11 +230,24 @@ struct MediaType res.setFramerate(30.0); return res; } - inline bool isEmpty() const + static MediaType createDefault_Audio() + { + MediaType res; + res.majorType = MFMediaType_Audio; + res.subType = MFAudioFormat_PCM; + res.bit_per_sample = 16; + res.nChannels = 1; + res.nSamplesPerSec = 44100; + return res; + } + inline bool isEmpty(bool flag = false) const { - return width == 0 && height == 0; + if (!flag) + return width == 0 && height == 0; + else + return nChannels == 0; } - _ComPtr createMediaType() const + _ComPtr createMediaType_Video() const { _ComPtr res; MFCreateMediaType(&res); @@ -225,6 +269,22 @@ struct MediaType res->SetGUID(MF_MT_SUBTYPE, subType); return res; } + _ComPtr createMediaType_Audio() const + { + _ComPtr res; + MFCreateMediaType(&res); + if (majorType != GUID()) + res->SetGUID(MF_MT_MAJOR_TYPE, majorType); + if (subType != GUID()) + res->SetGUID(MF_MT_SUBTYPE, subType); + if (bit_per_sample != 0) + res->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, bit_per_sample); + if (nChannels != 0) + res->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, nChannels); + if (nSamplesPerSec != 0) + res->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, nSamplesPerSec); + return res; + } void setFramerate(double fps) { frameRateNum = (UINT32)cvRound(fps * 1000.0); @@ -246,7 +306,7 @@ struct MediaType return wdiff + hdiff; } // check if 
'this' is better than 'other' comparing to reference - bool isBetterThan(const MediaType& other, const MediaType& ref) const + bool VideoIsBetterThan(const MediaType& other, const MediaType& ref) const { const unsigned long thisDiff = resolutionDiff(ref); const unsigned long otherDiff = other.resolutionDiff(ref); @@ -268,6 +328,24 @@ struct MediaType } return false; } + bool AudioIsBetterThan(const MediaType& other, const MediaType& ref) const + { + double thisDiff = absDiff(nChannels, ref.nChannels); + double otherDiff = absDiff(other.nChannels, ref.nChannels); + if (otherDiff < thisDiff) + { + thisDiff = absDiff(bit_per_sample, ref.bit_per_sample); + otherDiff = absDiff(other.bit_per_sample, ref.bit_per_sample); + if (otherDiff < thisDiff) + { + thisDiff = absDiff(nSamplesPerSec, ref.nSamplesPerSec); + otherDiff = absDiff(other.nSamplesPerSec, ref.nSamplesPerSec); + if (otherDiff < thisDiff) + return true; + } + } + return false; + } }; void printFormat(std::ostream& out, const GUID& fmt) @@ -405,7 +483,7 @@ public: return S_OK; } - HRESULT Wait(DWORD dwMilliseconds, _ComPtr& videoSample, BOOL& pbEOS) + HRESULT Wait(DWORD dwMilliseconds, _ComPtr& mediaSample, BOOL& pbEOS) { pbEOS = FALSE; @@ -423,14 +501,14 @@ public: if (!pbEOS) { cv::AutoLock lock(m_mutex); - videoSample = m_lastSample; - CV_Assert(videoSample); + mediaSample = m_lastSample; + CV_Assert(mediaSample); m_lastSample.Release(); ResetEvent(m_hEvent); // event is auto-reset, but we need this forced reset due time gap between wait() and mutex hold. } - return m_hrStatus; } + private: // Destructor is private. Caller should call Release. 
virtual ~SourceReaderCB() @@ -496,22 +574,67 @@ public: } } } + void countNumberOfAudioStreams(DWORD &numberOfAudioStreams) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + for (; i != formats.end(); ++i) + { + if(i->second.majorType == MFMediaType_Audio) + { + if(best.second.isEmpty() || i->first.stream != best.first.stream) + { + numberOfAudioStreams++; + best = *i; + } + } + } + } std::pair findBestVideoFormat(const MediaType& newType) { std::pair best; std::map::const_iterator i = formats.begin(); for (; i != formats.end(); ++i) { - if (i->second.majorType != MFMediaType_Video) - continue; - if (newType.isEmpty()) // file input - choose first returned media type + if (i->second.majorType == MFMediaType_Video) { - best = *i; - break; + if (best.second.isEmpty() || i->second.VideoIsBetterThan(best.second, newType)) + { + best = *i; + } } - if (best.second.isEmpty() || i->second.isBetterThan(best.second, newType)) + } + return best; + } + std::pair findBestAudioFormat(const MediaType& newType) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + best = *i; + for (; i != formats.end(); ++i) + { + if (i->second.majorType == MFMediaType_Audio) + { + if ( i->second.AudioIsBetterThan(best.second, newType)) + { + best = *i; + } + } + } + return best; + } + std::pair findAudioFormatByStream(const DWORD StreamIndex) + { + std::pair best; + std::map::const_iterator i = formats.begin(); + for (; i != formats.end(); ++i) + { + if (i->second.majorType == MFMediaType_Audio) { - best = *i; + if ((*i).first.stream == StreamIndex) + { + best = *i; + } } } return best; @@ -586,21 +709,30 @@ public: virtual void close(); virtual double getProperty(int) const CV_OVERRIDE; virtual bool setProperty(int, double) CV_OVERRIDE; + bool grabAudioFrame(); + bool grabVideoFrame(); virtual bool grabFrame() CV_OVERRIDE; + bool retrieveAudioFrame(int, OutputArray); + bool retrieveVideoFrame(OutputArray); virtual bool retrieveFrame(int, cv::OutputArray) 
CV_OVERRIDE; virtual bool isOpened() const CV_OVERRIDE { return isOpen; } virtual int getCaptureDomain() CV_OVERRIDE { return CV_CAP_MSMF; } protected: - bool configureOutput(MediaType newType, cv::uint32_t outFormat); + bool configureOutput(); + bool configureAudioOutput(MediaType newType); + bool configureVideoOutput(MediaType newType, cv::uint32_t outFormat); bool setTime(double time, bool rough); + bool setTime(int numberFrame); bool configureHW(bool enable); + bool configureStreams(const cv::VideoCaptureParameters&); + bool setAudioProperties(const cv::VideoCaptureParameters&); template bool readComplexPropery(long prop, long& val) const; template bool writeComplexProperty(long prop, double val, long flags); _ComPtr getDefaultSourceConfig(UINT32 num = 10); - bool initStream(DWORD streamID, const MediaType& mt); + bool initStream(DWORD streamID, const MediaType mt); bool openFinalize_(const VideoCaptureParameters* params); @@ -615,17 +747,49 @@ protected: _ComPtr D3DMgr; #endif _ComPtr videoFileSource; - _ComPtr videoSample; _ComPtr readCallback; // non-NULL for "live" streams (camera capture) - DWORD dwStreamIndex; + std::vector dwStreamIndices; + std::vector<_ComPtr> audioSamples; + _ComPtr impendingVideoSample; + _ComPtr usedVideoSample; + DWORD dwVideoStreamIndex; + DWORD dwAudioStreamIndex; MediaType nativeFormat; - MediaType captureFormat; - int outputFormat; + MediaType captureVideoFormat; + MediaType captureAudioFormat; + bool device_status; //on or off + int videoStream; // look at CAP_PROP_VIDEO_STREAM + int audioStream; // look at CAP_PROP_AUDIO_STREAM + bool vEOS; + bool aEOS; + unsigned int audioBaseIndex; + int outputVideoFormat; + int outputAudioFormat; bool convertFormat; MFTIME duration; LONGLONG frameStep; - LONGLONG sampleTime; + LONGLONG nFrame; + LONGLONG impendingVideoSampleTime; + LONGLONG usedVideoSampleTime; + LONGLONG videoStartOffset; + LONGLONG videoSampleDuration; + LONGLONG requiredAudioTime; + LONGLONG audioSampleTime; + LONGLONG 
audioStartOffset; + LONGLONG audioSampleDuration; + LONGLONG audioTime; + LONGLONG chunkLengthOfBytes; + LONGLONG givenAudioTime; + LONGLONG numberOfAdditionalAudioBytes; // the number of additional bytes required to align the audio chunk + double bufferedAudioDuration; + LONGLONG audioSamplePos; + DWORD numberOfAudioStreams; + Mat audioFrame; + std::deque bufferAudioData; bool isOpen; + bool grabIsDone; + bool syncLastFrame; + bool lastFrame; }; CvCapture_MSMF::CvCapture_MSMF(): @@ -640,15 +804,42 @@ CvCapture_MSMF::CvCapture_MSMF(): D3DMgr(NULL), #endif videoFileSource(NULL), - videoSample(NULL), readCallback(NULL), - dwStreamIndex(0), - outputFormat(CV_CAP_MODE_BGR), + impendingVideoSample(NULL), + usedVideoSample(NULL), + dwVideoStreamIndex(0), + dwAudioStreamIndex(0), + device_status(false), + videoStream(0), + audioStream(-1), + vEOS(false), + aEOS(false), + audioBaseIndex(1), + outputVideoFormat(CV_CAP_MODE_BGR), + outputAudioFormat(CV_16S), convertFormat(true), duration(0), frameStep(0), - sampleTime(0), - isOpen(false) + nFrame(0), + impendingVideoSampleTime(0), + usedVideoSampleTime(0), + videoStartOffset(-1), + videoSampleDuration(0), + requiredAudioTime(0), + audioSampleTime(0), + audioStartOffset(-1), + audioSampleDuration(0), + audioTime(0), + chunkLengthOfBytes(0), + givenAudioTime(0), + numberOfAdditionalAudioBytes(0), + bufferedAudioDuration(0), + audioSamplePos(0), + numberOfAudioStreams(0), + isOpen(false), + grabIsDone(false), + syncLastFrame(true), + lastFrame(false) { } @@ -663,29 +854,37 @@ void CvCapture_MSMF::close() if (isOpen) { isOpen = false; - videoSample.Release(); + usedVideoSample.Release(); + for (auto item : audioSamples) + item.Release(); videoFileSource.Release(); + device_status = false; camid = -1; filename.clear(); } readCallback.Release(); } -bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType& mt) +bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType mt) { CV_LOG_DEBUG(NULL, "Init stream " << streamID 
<< " with MediaType " << mt); - _ComPtr mediaTypeOut = mt.createMediaType(); - if (FAILED(videoFileSource->SetStreamSelection((DWORD)MF_SOURCE_READER_ALL_STREAMS, false))) + _ComPtr mediaTypesOut; + if (mt.majorType == MFMediaType_Audio) { - CV_LOG_WARNING(NULL, "Failed to reset streams"); - return false; + captureAudioFormat = mt; + mediaTypesOut = mt.createMediaType_Audio(); + } + if (mt.majorType == MFMediaType_Video) + { + captureVideoFormat = mt; + mediaTypesOut = mt.createMediaType_Video(); } if (FAILED(videoFileSource->SetStreamSelection(streamID, true))) { CV_LOG_WARNING(NULL, "Failed to select stream " << streamID); return false; } - HRESULT hr = videoFileSource->SetCurrentMediaType(streamID, NULL, mediaTypeOut.Get()); + HRESULT hr = videoFileSource->SetCurrentMediaType(streamID, NULL, mediaTypesOut.Get()); if (hr == MF_E_TOPO_CODEC_NOT_FOUND) { CV_LOG_WARNING(NULL, "Failed to set mediaType (stream " << streamID << ", " << mt << "(codec not found)"); @@ -701,7 +900,7 @@ bool CvCapture_MSMF::initStream(DWORD streamID, const MediaType& mt) CV_LOG_WARNING(NULL, "Failed to set mediaType (stream " << streamID << ", " << mt << "(HRESULT " << hr << ")"); return false; } - captureFormat = mt; + return true; } @@ -826,7 +1025,52 @@ bool CvCapture_MSMF::configureHW(const VideoCaptureParameters& params) return configureHW(va_type == VIDEO_ACCELERATION_D3D11 || va_type == VIDEO_ACCELERATION_ANY); } -bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) +bool CvCapture_MSMF::configureAudioOutput(MediaType newType) +{ + FormatStorage formats; + formats.read(videoFileSource.Get()); + std::pair bestMatch; + formats.countNumberOfAudioStreams(numberOfAudioStreams); + if (device_status) + bestMatch = formats.findBestAudioFormat(newType); + else + bestMatch = formats.findAudioFormatByStream(audioStream); + if (bestMatch.second.isEmpty(true)) + { + CV_LOG_DEBUG(NULL, "Can not find audio stream with requested parameters"); + return false; + } + 
dwAudioStreamIndex = bestMatch.first.stream; + dwStreamIndices.push_back(dwAudioStreamIndex); + MediaType newFormat = bestMatch.second; + + newFormat.majorType = MFMediaType_Audio; + newFormat.nSamplesPerSec = 44100; + switch (outputAudioFormat) + { + case CV_8S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 8; + break; + case CV_16S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 16; + break; + case CV_32S: + newFormat.subType = MFAudioFormat_PCM; + newFormat.bit_per_sample = 32; break; + case CV_32F: + newFormat.subType = MFAudioFormat_Float; + newFormat.bit_per_sample = 32; + break; + default: + break; + } + + return initStream(dwAudioStreamIndex, newFormat); +} + +bool CvCapture_MSMF::configureVideoOutput(MediaType newType, cv::uint32_t outFormat) { FormatStorage formats; formats.read(videoFileSource.Get()); @@ -836,9 +1080,11 @@ bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) CV_LOG_DEBUG(NULL, "Can not find video stream with requested parameters"); return false; } - dwStreamIndex = bestMatch.first.stream; + dwVideoStreamIndex = bestMatch.first.stream; + dwStreamIndices.push_back(dwVideoStreamIndex); nativeFormat = bestMatch.second; MediaType newFormat = nativeFormat; + if (convertFormat) { switch (outFormat) @@ -869,8 +1115,25 @@ bool CvCapture_MSMF::configureOutput(MediaType newType, cv::uint32_t outFormat) } // we select native format first and then our requested format (related issue #12822) if (!newType.isEmpty()) // camera input - initStream(dwStreamIndex, nativeFormat); - return initStream(dwStreamIndex, newFormat); + { + initStream(dwVideoStreamIndex, nativeFormat); + } + return initStream(dwVideoStreamIndex, newFormat); +} + +bool CvCapture_MSMF::configureOutput() +{ + if (FAILED(videoFileSource->SetStreamSelection((DWORD)MF_SOURCE_READER_ALL_STREAMS, false))) + { + CV_LOG_WARNING(NULL, "Failed to reset streams"); + return false; + } + bool tmp = true; + if (videoStream != -1) + 
tmp = (!device_status)? configureVideoOutput(MediaType(), outputVideoFormat) : configureVideoOutput(MediaType::createDefault_Video(), outputVideoFormat); + if (audioStream != -1) + tmp &= (!device_status)? configureAudioOutput(MediaType()) : configureAudioOutput(MediaType::createDefault_Audio()); + return tmp; } bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) @@ -882,10 +1145,19 @@ bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) if (params) { configureHW(*params); + configureStreams(*params); + } + if (videoStream != -1 && audioStream != -1 || videoStream == -1 && audioStream == -1) + { + CV_LOG_DEBUG(NULL, "Only one of the properties CAP_PROP_AUDIO_STREAM " << audioStream << " and " << CAP_PROP_VIDEO_STREAM << " must be different from -1"); + return false; } - DeviceList devices; - UINT32 count = devices.read(); + UINT32 count = 0; + if (audioStream != -1) + count = devices.read(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_AUDCAP_GUID); + if (videoStream != -1) + count = devices.read(MF_DEVSOURCE_ATTRIBUTE_SOURCE_TYPE_VIDCAP_GUID); if (count == 0 || static_cast(index) > count) { CV_LOG_DEBUG(NULL, "Device " << index << " not found (total " << count << " devices)"); @@ -902,14 +1174,14 @@ bool CvCapture_MSMF::open(int index, const cv::VideoCaptureParameters* params) } isOpen = true; + device_status = true; camid = index; readCallback = cb; duration = 0; - if (configureOutput(MediaType::createDefault(), outputFormat)) + if (configureOutput()) { - frameStep = captureFormat.getFrameStep(); + frameStep = captureVideoFormat.getFrameStep(); } - if (isOpen && !openFinalize_(params)) { close(); @@ -928,8 +1200,9 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar if (params) { configureHW(*params); + configureStreams(*params); + setAudioProperties(*params); } - // Set source reader parameters _ComPtr attr = getDefaultSourceConfig(); cv::AutoBuffer unicodeFileName(_filename.length() + 1); @@ 
-937,11 +1210,11 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar if (SUCCEEDED(MFCreateSourceReaderFromURL(unicodeFileName.data(), attr.Get(), &videoFileSource))) { isOpen = true; - sampleTime = 0; - if (configureOutput(MediaType(), outputFormat)) + usedVideoSampleTime = 0; + if (configureOutput()) { - frameStep = captureFormat.getFrameStep(); filename = _filename; + frameStep = captureVideoFormat.getFrameStep(); PROPVARIANT var; HRESULT hr; if (SUCCEEDED(hr = videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_PD_DURATION, &var)) && @@ -954,13 +1227,18 @@ bool CvCapture_MSMF::open(const cv::String& _filename, const cv::VideoCapturePar duration = 0; } } - if (isOpen && !openFinalize_(params)) { close(); return false; } - + if (isOpen) + if (audioStream != -1 && videoStream != -1) + { + isOpen = grabFrame(); + if (isOpen) + grabIsDone = true; + } return isOpen; } @@ -997,71 +1275,212 @@ bool CvCapture_MSMF::openFinalize_(const VideoCaptureParameters* params) return true; } -bool CvCapture_MSMF::grabFrame() +bool CvCapture_MSMF::configureStreams(const cv::VideoCaptureParameters& params) { - CV_TRACE_FUNCTION(); - if (readCallback) // async "live" capture mode + if (params.has(CAP_PROP_VIDEO_STREAM)) { - HRESULT hr = 0; - SourceReaderCB* reader = ((SourceReaderCB*)readCallback.Get()); - if (!reader->m_reader) + double value = params.get(CAP_PROP_VIDEO_STREAM); + if (value == -1 || value == 0) + videoStream = static_cast(value); + else { - // Initiate capturing with async callback - reader->m_reader = videoFileSource.Get(); - reader->m_dwStreamIndex = dwStreamIndex; - if (FAILED(hr = videoFileSource->ReadSample(dwStreamIndex, 0, NULL, NULL, NULL, NULL))) - { - CV_LOG_ERROR(NULL, "videoio(MSMF): can't grab frame - initial async ReadSample() call failed: " << hr); - reader->m_reader = NULL; - return false; - } + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_VIDEO_STREAM parameter value is invalid/unsupported: 
" << value); + return false; } - BOOL bEOS = false; - if (FAILED(hr = reader->Wait(10000, videoSample, bEOS))) // 10 sec + } + if (params.has(CAP_PROP_AUDIO_STREAM)) + { + double value = params.get(CAP_PROP_AUDIO_STREAM); + if (value == -1 || value > -1) + audioStream = static_cast(value); + else { - CV_LOG_WARNING(NULL, "videoio(MSMF): can't grab frame. Error: " << hr); + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_AUDIO_STREAM parameter value is invalid/unsupported: " << value); return false; } - if (bEOS) + } + return true; +} +bool CvCapture_MSMF::setAudioProperties(const cv::VideoCaptureParameters& params) +{ + if (params.has(CAP_PROP_AUDIO_DATA_DEPTH)) + { + int value = static_cast(params.get(CAP_PROP_AUDIO_DATA_DEPTH)); + if (value != CV_8S && value != CV_16S && value != CV_32S && value != CV_32F) { - CV_LOG_WARNING(NULL, "videoio(MSMF): EOS signal. Capture stream is lost"); + CV_LOG_ERROR(NULL, "VIDEOIO/MSMF: CAP_PROP_AUDIO_DATA_DEPTH parameter value is invalid/unsupported: " << value); return false; } - sampleTime = reader->m_lastSampleTimestamp; - return true; + else + { + outputAudioFormat = value; + } } - else if (isOpen) + if (params.has(CAP_PROP_AUDIO_SYNCHRONIZE)) + { + int value = static_cast(params.get(CAP_PROP_AUDIO_SYNCHRONIZE)); + syncLastFrame = (value != 0) ? true : false; + } + return true; +} + +bool CvCapture_MSMF::grabVideoFrame() +{ + DWORD streamIndex, flags; + HRESULT hr; + usedVideoSample.Release(); + + bool returnFlag = false; + bool stopFlag = false; + if (audioStream != -1) { - DWORD streamIndex, flags; - videoSample.Release(); - HRESULT hr; - for(;;) + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + } + while (!stopFlag) + { + for (;;) { CV_TRACE_REGION("ReadSample"); if (!SUCCEEDED(hr = videoFileSource->ReadSample( - dwStreamIndex, // Stream index. + dwVideoStreamIndex, // Stream index. 0, // Flags. &streamIndex, // Receives the actual stream index. 
&flags, // Receives status flags. - &sampleTime, // Receives the time stamp. - &videoSample // Receives the sample or NULL. + &impendingVideoSampleTime, // Receives the time stamp. + &impendingVideoSample // Receives the sample or NULL. ))) break; - if (streamIndex != dwStreamIndex) + if (streamIndex != dwVideoStreamIndex) break; if (flags & (MF_SOURCE_READERF_ERROR | MF_SOURCE_READERF_ALLEFFECTSREMOVED | MF_SOURCE_READERF_ENDOFSTREAM)) break; - if (videoSample) + if (impendingVideoSample) break; if (flags & MF_SOURCE_READERF_STREAMTICK) { CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream tick detected. Retrying to grab the frame"); } } + if (SUCCEEDED(hr)) + { + if (streamIndex != dwVideoStreamIndex) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Wrong stream read. Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ERROR) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream reading error. Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ALLEFFECTSREMOVED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream decoding error. 
Abort capturing"); + close(); + } + else if (flags & MF_SOURCE_READERF_ENDOFSTREAM) + { + vEOS = true; + lastFrame = true; + stopFlag = true; + if (audioStream == -1) + returnFlag = false; + else if (usedVideoSample) + returnFlag = true; + CV_LOG_DEBUG(NULL, "videoio(MSMF): End of video stream detected"); + } + else + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): got video frame with timestamp=" << impendingVideoSampleTime); + if (audioStream != -1) + { + if (!usedVideoSample) + { + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + videoStartOffset = usedVideoSampleTime; + } + else + { + stopFlag = true; + } + if (impendingVideoSample) + { + nFrame++; + videoSampleDuration = impendingVideoSampleTime - usedVideoSampleTime; + requiredAudioTime = impendingVideoSampleTime - givenAudioTime; + givenAudioTime += requiredAudioTime; + } + } + else + { + usedVideoSample.swap(impendingVideoSample); + std::swap(usedVideoSampleTime, impendingVideoSampleTime); + stopFlag = true; + nFrame++; + } + if (flags & MF_SOURCE_READERF_NEWSTREAM) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): New stream detected"); + } + if (flags & MF_SOURCE_READERF_NATIVEMEDIATYPECHANGED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream native media type changed"); + } + if (flags & MF_SOURCE_READERF_CURRENTMEDIATYPECHANGED) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream current media type changed"); + } + returnFlag = true; + } + } + } + return returnFlag; +} +bool CvCapture_MSMF::grabAudioFrame() +{ + DWORD streamIndex, flags; + HRESULT hr; + _ComPtr audioSample = NULL; + audioSamples.clear(); + + bool returnFlag = false; + audioTime = 0; + int numberOfSamples = -1; + if (bufferedAudioDuration*1e7 > requiredAudioTime) + return true; + while ((!vEOS) ? 
audioTime <= requiredAudioTime : !aEOS) + { + if (audioStartOffset - usedVideoSampleTime > videoSampleDuration) + return true; + for (;;) + { + CV_TRACE_REGION("ReadSample"); + if (!SUCCEEDED(hr = videoFileSource->ReadSample( + dwAudioStreamIndex, // Stream index. + 0, // Flags. + &streamIndex, // Receives the actual stream index. + &flags, // Receives status flags. + &audioSampleTime, // Receives the time stamp. + &audioSample // Receives the sample or NULL. + ))) + break; + if (streamIndex != dwAudioStreamIndex) + break; + if (flags & (MF_SOURCE_READERF_ERROR | MF_SOURCE_READERF_ALLEFFECTSREMOVED | MF_SOURCE_READERF_ENDOFSTREAM)) + break; + if (audioSample) + break; + if (flags & MF_SOURCE_READERF_STREAMTICK) + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream tick detected. Retrying to grab the frame"); + } + } if (SUCCEEDED(hr)) { - if (streamIndex != dwStreamIndex) + if (streamIndex != dwAudioStreamIndex) { CV_LOG_DEBUG(NULL, "videoio(MSMF): Wrong stream read. Abort capturing"); close(); @@ -1078,12 +1497,25 @@ bool CvCapture_MSMF::grabFrame() } else if (flags & MF_SOURCE_READERF_ENDOFSTREAM) { - sampleTime += frameStep; - CV_LOG_DEBUG(NULL, "videoio(MSMF): End of stream detected"); + aEOS = true; + if (videoStream != -1 && !vEOS) + returnFlag = true; + CV_LOG_DEBUG(NULL, "videoio(MSMF): End of audio stream detected"); + break; } else { - sampleTime += frameStep; + audioSamples.push_back(audioSample); + audioSample = NULL; + numberOfSamples++; + audioSamples[numberOfSamples]->GetSampleDuration(&audioSampleDuration); + CV_LOG_DEBUG(NULL, "videoio(MSMF): got audio frame with timestamp=" << audioSampleTime << " duration=" << audioSampleDuration); + audioTime += (LONGLONG)(audioSampleDuration + bufferedAudioDuration*1e7); + if (nFrame == 1 && audioStartOffset == -1) + { + audioStartOffset = audioSampleTime - audioSampleDuration; + requiredAudioTime -= audioStartOffset; + } if (flags & MF_SOURCE_READERF_NEWSTREAM) { CV_LOG_DEBUG(NULL, "videoio(MSMF): New stream 
detected"); @@ -1096,33 +1528,189 @@ bool CvCapture_MSMF::grabFrame() { CV_LOG_DEBUG(NULL, "videoio(MSMF): Stream current media type changed"); } - return true; + returnFlag = true; + } + } + else + { + CV_LOG_DEBUG(NULL, "videoio(MSMF): ReadSample() method is not succeeded"); + return false; + } + } + + if (!audioSamples.empty() || !bufferAudioData.empty() && aEOS) + { + _ComPtr buf = NULL; + std::vector audioDataInUse; + BYTE* ptr = NULL; + DWORD maxsize = 0, cursize = 0; + CV_TRACE_REGION("get_contiguous_buffer"); + for (auto item : audioSamples) + { + if (!SUCCEEDED(item->ConvertToContiguousBuffer(&buf))) + { + CV_TRACE_REGION("get_buffer"); + DWORD bcnt = 0; + if (!SUCCEEDED(item->GetBufferCount(&bcnt))) + break; + if (bcnt == 0) + break; + if (!SUCCEEDED(item->GetBufferByIndex(0, &buf))) + break; + } + if (!SUCCEEDED(buf->Lock(&ptr, &maxsize, &cursize))) + break; + size_t lastSize = bufferAudioData.size(); + bufferAudioData.resize(lastSize+cursize); + for (unsigned int i = 0; i < cursize; i++) + { + bufferAudioData[lastSize+i]=*(ptr+i); + } + CV_TRACE_REGION_NEXT("unlock"); + buf->Unlock(); + buf = NULL; + } + audioSamples.clear(); + + audioSamplePos += chunkLengthOfBytes/((captureAudioFormat.bit_per_sample/8)*captureAudioFormat.nChannels); + chunkLengthOfBytes = (videoStream != -1) ? 
(LONGLONG)((requiredAudioTime*captureAudioFormat.nSamplesPerSec*captureAudioFormat.nChannels*(captureAudioFormat.bit_per_sample)/8)/1e7) : cursize; + if ((videoStream != -1) && (chunkLengthOfBytes % ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels) != 0)) + { + if ( (double)audioSamplePos/captureAudioFormat.nSamplesPerSec + audioStartOffset * 1e-7 - usedVideoSampleTime * 1e-7 >= 0 ) + chunkLengthOfBytes -= numberOfAdditionalAudioBytes; + numberOfAdditionalAudioBytes = ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels) + - chunkLengthOfBytes % ((int)(captureAudioFormat.bit_per_sample)/8* (int)captureAudioFormat.nChannels); + chunkLengthOfBytes += numberOfAdditionalAudioBytes; + } + if (lastFrame && !syncLastFrame|| aEOS && !vEOS) + { + chunkLengthOfBytes = bufferAudioData.size(); + } + CV_Check((double)chunkLengthOfBytes, chunkLengthOfBytes >= INT_MIN && chunkLengthOfBytes <= INT_MAX, "MSMF: The chunkLengthOfBytes is out of the allowed range"); + copy(bufferAudioData.begin(), bufferAudioData.begin() + (int)chunkLengthOfBytes, std::back_inserter(audioDataInUse)); + bufferAudioData.erase(bufferAudioData.begin(), bufferAudioData.begin() + (int)chunkLengthOfBytes); + if (audioFrame.empty()) + { + switch (outputAudioFormat) + { + case CV_8S: + cv::Mat((int)chunkLengthOfBytes/(captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_8S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_16S: + cv::Mat((int)chunkLengthOfBytes/(2*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_16S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_32S: + cv::Mat((int)chunkLengthOfBytes/(4*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_32S, audioDataInUse.data()).copyTo(audioFrame); + break; + case CV_32F: + cv::Mat((int)chunkLengthOfBytes/(4*captureAudioFormat.nChannels), captureAudioFormat.nChannels, CV_32F, audioDataInUse.data()).copyTo(audioFrame); + break; + default: 
+ break; + } + } + audioDataInUse.clear(); + audioDataInUse.shrink_to_fit(); + } + + return returnFlag; +} + +bool CvCapture_MSMF::grabFrame() +{ + CV_TRACE_FUNCTION(); + + if (grabIsDone) + { + grabIsDone = false; + return true; + } + + audioFrame = Mat(); + if (readCallback) // async "live" capture mode + { + audioSamples.push_back(NULL); + HRESULT hr = 0; + SourceReaderCB* reader = ((SourceReaderCB*)readCallback.Get()); + DWORD dwStreamIndex = 0; + if (videoStream != -1) + dwStreamIndex = dwVideoStreamIndex; + if (audioStream != -1) + dwStreamIndex = dwAudioStreamIndex; + if (!reader->m_reader) + { + // Initiate capturing with async callback + reader->m_reader = videoFileSource.Get(); + reader->m_dwStreamIndex = dwStreamIndex; + if (FAILED(hr = videoFileSource->ReadSample(dwStreamIndex, 0, NULL, NULL, NULL, NULL))) + { + CV_LOG_ERROR(NULL, "videoio(MSMF): can't grab frame - initial async ReadSample() call failed: " << hr); + reader->m_reader = NULL; + return false; } } + BOOL bEOS = false; + if (FAILED(hr = reader->Wait( videoStream == -1 ? INFINITE : 10000, (videoStream != -1) ? usedVideoSample : audioSamples[0], bEOS))) // 10 sec + { + CV_LOG_WARNING(NULL, "videoio(MSMF): can't grab frame. Error: " << hr); + return false; + } + if (bEOS) + { + CV_LOG_WARNING(NULL, "videoio(MSMF): EOS signal. 
Capture stream is lost"); + return false; + } + if (videoStream != -1) + usedVideoSampleTime = reader->m_lastSampleTimestamp; + return true; + } + else if (isOpen) + { + if (vEOS) + return false; + + bool returnFlag = true; + + if (videoStream != -1) + { + if (!vEOS) + returnFlag &= grabVideoFrame(); + if (!returnFlag) + return false; + } + + if (audioStream != -1) + { + bufferedAudioDuration = (double)(bufferAudioData.size()/((captureAudioFormat.bit_per_sample/8)*captureAudioFormat.nChannels))/captureAudioFormat.nSamplesPerSec; + audioFrame.release(); + if (!aEOS) + returnFlag &= grabAudioFrame(); + } + + return returnFlag; } return false; } -bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) +bool CvCapture_MSMF::retrieveVideoFrame(cv::OutputArray frame) { CV_TRACE_FUNCTION(); do { - if (!videoSample) + if (!usedVideoSample) break; _ComPtr buf = NULL; - CV_TRACE_REGION("get_contiguous_buffer"); - if (!SUCCEEDED(videoSample->ConvertToContiguousBuffer(&buf))) + if (!SUCCEEDED(usedVideoSample->ConvertToContiguousBuffer(&buf))) { CV_TRACE_REGION("get_buffer"); DWORD bcnt = 0; - if (!SUCCEEDED(videoSample->GetBufferCount(&bcnt))) + if (!SUCCEEDED(usedVideoSample->GetBufferCount(&bcnt))) break; if (bcnt == 0) break; - if (!SUCCEEDED(videoSample->GetBufferByIndex(0, &buf))) + if (!SUCCEEDED(usedVideoSample->GetBufferByIndex(0, &buf))) break; } @@ -1158,27 +1746,27 @@ bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) break; if (convertFormat) { - if (lock2d || (unsigned int)cursize == captureFormat.sampleSize) + if (lock2d || (unsigned int)cursize == captureVideoFormat.sampleSize) { - switch (outputFormat) + switch (outputVideoFormat) { case CV_CAP_MODE_YUYV: - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC2, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC2, ptr, pitch).copyTo(frame); break; case CV_CAP_MODE_BGR: if (captureMode == MODE_HW) - cv::cvtColor(cv::Mat(captureFormat.height, 
captureFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); else - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC3, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC3, ptr, pitch).copyTo(frame); break; case CV_CAP_MODE_RGB: if (captureMode == MODE_HW) - cv::cvtColor(cv::Mat(captureFormat.height, captureFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC4, ptr, pitch), frame, cv::COLOR_BGRA2BGR); else - cv::cvtColor(cv::Mat(captureFormat.height, captureFormat.width, CV_8UC3, ptr, pitch), frame, cv::COLOR_BGR2RGB); + cv::cvtColor(cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC3, ptr, pitch), frame, cv::COLOR_BGR2RGB); break; case CV_CAP_MODE_GRAY: - cv::Mat(captureFormat.height, captureFormat.width, CV_8UC1, ptr, pitch).copyTo(frame); + cv::Mat(captureVideoFormat.height, captureVideoFormat.width, CV_8UC1, ptr, pitch).copyTo(frame); break; default: frame.release(); @@ -1204,30 +1792,142 @@ bool CvCapture_MSMF::retrieveFrame(int, cv::OutputArray frame) return false; } +bool CvCapture_MSMF::retrieveAudioFrame(int index, cv::OutputArray frame) +{ + CV_TRACE_FUNCTION(); + if (audioStartOffset - usedVideoSampleTime > videoSampleDuration) + { + frame.release(); + return true; + } + do + { + if (audioFrame.empty()) + { + frame.release(); + if (aEOS) + return true; + } + cv::Mat data; + switch (outputAudioFormat) + { + case CV_8S: + data = cv::Mat(1, audioFrame.rows, CV_8S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_16S: + data = cv::Mat(1, audioFrame.rows, CV_16S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_32S: + data = 
cv::Mat(1, audioFrame.rows, CV_32S); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + case CV_32F: + data = cv::Mat(1, audioFrame.rows, CV_32F); + for (int i = 0; i < audioFrame.rows; i++) + data.at(0,i) = audioFrame.at(i,index-audioBaseIndex); + break; + default: + frame.release(); + break; + } + if (!data.empty()) + data.copyTo(frame); + + return !frame.empty(); + } while (0); + + return false; +} + +bool CvCapture_MSMF::retrieveFrame(int index, cv::OutputArray frame) +{ + CV_TRACE_FUNCTION(); + if (index < 0) + return false; + if ((unsigned int)index < audioBaseIndex) + { + if (videoStream == -1) + { + frame.release(); + return false; + } + else + return retrieveVideoFrame(frame); + } + else + { + if (audioStream == -1) + { + frame.release(); + return false; + } + else + return retrieveAudioFrame(index, frame); + } +} + bool CvCapture_MSMF::setTime(double time, bool rough) { + if (videoStream == -1) + return false; + if (videoStream != -1 && audioStream != -1) + if (time != 0) + return false; PROPVARIANT var; if (SUCCEEDED(videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_SOURCE_READER_MEDIASOURCE_CHARACTERISTICS, &var)) && var.vt == VT_UI4 && var.ulVal & MFMEDIASOURCE_CAN_SEEK) { - videoSample.Release(); + usedVideoSample.Release(); bool useGrabbing = time > 0 && !rough && !(var.ulVal & MFMEDIASOURCE_HAS_SLOW_SEEK); PropVariantClear(&var); - sampleTime = (useGrabbing && time >= frameStep) ? (LONGLONG)floor(time + 0.5) - frameStep : (LONGLONG)floor(time + 0.5); + usedVideoSampleTime = (useGrabbing) ? 0 : (LONGLONG)floor(time + 0.5); + nFrame = (useGrabbing) ? 0 : usedVideoSampleTime/frameStep; + givenAudioTime = (useGrabbing) ? 
0 : nFrame*frameStep; var.vt = VT_I8; - var.hVal.QuadPart = sampleTime; + var.hVal.QuadPart = usedVideoSampleTime; bool resOK = SUCCEEDED(videoFileSource->SetCurrentPosition(GUID_NULL, var)); PropVariantClear(&var); if (resOK && useGrabbing) { LONGLONG timeborder = (LONGLONG)floor(time + 0.5) - frameStep / 2; - do { resOK = grabFrame(); videoSample.Release(); } while (resOK && sampleTime < timeborder); + do { resOK = grabFrame(); usedVideoSample.Release(); } while (resOK && usedVideoSampleTime < timeborder); } return resOK; } return false; } +bool CvCapture_MSMF::setTime(int numberFrame) +{ + if (videoStream == -1) + return false; + if (videoStream != -1 && audioStream != -1) + if (numberFrame != 0) + return false; + PROPVARIANT var; + if (SUCCEEDED(videoFileSource->GetPresentationAttribute((DWORD)MF_SOURCE_READER_MEDIASOURCE, MF_SOURCE_READER_MEDIASOURCE_CHARACTERISTICS, &var)) && + var.vt == VT_UI4 && var.ulVal & MFMEDIASOURCE_CAN_SEEK) + { + usedVideoSample.Release(); + PropVariantClear(&var); + usedVideoSampleTime = 0; + nFrame = 0; + givenAudioTime = 0; + var.vt = VT_I8; + var.hVal.QuadPart = usedVideoSampleTime; + bool resOK = SUCCEEDED(videoFileSource->SetCurrentPosition(GUID_NULL, var)); + PropVariantClear(&var); + while (resOK && nFrame < numberFrame) { resOK = grabFrame(); usedVideoSample.Release(); }; + return resOK; + } + return false; +} + template bool CvCapture_MSMF::readComplexPropery(long prop, long & val) const { @@ -1269,29 +1969,31 @@ double CvCapture_MSMF::getProperty( int property_id ) const case CV_CAP_PROP_CONVERT_RGB: return convertFormat ? 
1 : 0; case CV_CAP_PROP_SAR_NUM: - return captureFormat.aspectRatioNum; + return captureVideoFormat.aspectRatioNum; case CV_CAP_PROP_SAR_DEN: - return captureFormat.aspectRatioDenom; + return captureVideoFormat.aspectRatioDenom; case CV_CAP_PROP_FRAME_WIDTH: - return captureFormat.width; + return captureVideoFormat.width; case CV_CAP_PROP_FRAME_HEIGHT: - return captureFormat.height; + return captureVideoFormat.height; case CV_CAP_PROP_FOURCC: - return captureFormat.subType.Data1; + return captureVideoFormat.subType.Data1; case CV_CAP_PROP_FPS: - return captureFormat.getFramerate(); + return captureVideoFormat.getFramerate(); case CV_CAP_PROP_FRAME_COUNT: if (duration != 0) - return floor(((double)duration / 1e7)* captureFormat.getFramerate() + 0.5); + return floor(((double)duration / 1e7)* captureVideoFormat.getFramerate() + 0.5); else break; case CV_CAP_PROP_POS_FRAMES: - return floor(((double)sampleTime / 1e7)* captureFormat.getFramerate() + 0.5); + return (double)nFrame; case CV_CAP_PROP_POS_MSEC: - return (double)sampleTime / 1e4; + return (double)usedVideoSampleTime / 1e4; + case CAP_PROP_AUDIO_POS: + return (double)audioSamplePos; case CV_CAP_PROP_POS_AVI_RATIO: if (duration != 0) - return (double)sampleTime / duration; + return (double)usedVideoSampleTime / duration; else break; case CV_CAP_PROP_BRIGHTNESS: @@ -1383,6 +2085,18 @@ double CvCapture_MSMF::getProperty( int property_id ) const case CV_CAP_PROP_ISO_SPEED: case CV_CAP_PROP_SETTINGS: case CV_CAP_PROP_BUFFERSIZE: + case CAP_PROP_AUDIO_BASE_INDEX: + return audioBaseIndex; + case CAP_PROP_AUDIO_TOTAL_STREAMS: + return numberOfAudioStreams; + case CAP_PROP_AUDIO_TOTAL_CHANNELS: + return captureAudioFormat.nChannels; + case CAP_PROP_AUDIO_SAMPLES_PER_SECOND: + return captureAudioFormat.nSamplesPerSec; + case CAP_PROP_AUDIO_DATA_DEPTH: + return outputAudioFormat; + case CAP_PROP_AUDIO_SHIFT_NSEC: + return (double)(audioStartOffset - videoStartOffset)*1e2; default: break; } @@ -1408,7 +2122,7 @@ bool 
CvCapture_MSMF::writeComplexProperty(long prop, double val, long flags) bool CvCapture_MSMF::setProperty( int property_id, double value ) { - MediaType newFormat = captureFormat; + MediaType newFormat = captureVideoFormat; if (isOpen) switch (property_id) { @@ -1423,45 +2137,45 @@ bool CvCapture_MSMF::setProperty( int property_id, double value ) return false; } case CV_CAP_PROP_FOURCC: - return configureOutput(newFormat, (int)cvRound(value)); + return configureVideoOutput(newFormat, (int)cvRound(value)); case CV_CAP_PROP_FORMAT: - return configureOutput(newFormat, (int)cvRound(value)); + return configureVideoOutput(newFormat, (int)cvRound(value)); case CV_CAP_PROP_CONVERT_RGB: convertFormat = (value != 0); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); case CV_CAP_PROP_SAR_NUM: if (value > 0) { newFormat.aspectRatioNum = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_SAR_DEN: if (value > 0) { newFormat.aspectRatioDenom = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_WIDTH: if (value >= 0) { newFormat.width = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_HEIGHT: if (value >= 0) { newFormat.height = (UINT32)cvRound(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FPS: if (value >= 0) { newFormat.setFramerate(value); - return configureOutput(newFormat, outputFormat); + return configureVideoOutput(newFormat, outputVideoFormat); } break; case CV_CAP_PROP_FRAME_COUNT: @@ -1471,8 +2185,8 @@ bool CvCapture_MSMF::setProperty( int property_id, double value ) 
return setTime(duration * value, true); break; case CV_CAP_PROP_POS_FRAMES: - if (std::fabs(captureFormat.getFramerate()) > 0) - return setTime(value * 1e7 / captureFormat.getFramerate(), false); + if (std::fabs(captureVideoFormat.getFramerate()) > 0) + return setTime((int)value); break; case CV_CAP_PROP_POS_MSEC: return setTime(value * 1e4, false); diff --git a/modules/videoio/test/test_audio.cpp b/modules/videoio/test/test_audio.cpp new file mode 100644 index 0000000000..3ff51e2613 --- /dev/null +++ b/modules/videoio/test/test_audio.cpp @@ -0,0 +1,273 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +//file name, number of audio channels, epsilon, video type, weight, height, number of frame, number of audio samples, fps, psnr Threshold, backend +typedef std::tuple paramCombination; +//file name, number of audio channels, number of audio samples, epsilon, backend +typedef std::tuple param; + +class AudioBaseTest +{ +protected: + AudioBaseTest(){}; + void getValidAudioData() + { + const double step = 3.14/22050; + double value = 0; + validAudioData.resize(expectedNumAudioCh); + for (int nCh = 0; nCh < expectedNumAudioCh; nCh++) + { + value = 0; + for(unsigned int i = 0; i < numberOfSamples; i++) + { + if (i != 0 && i % 44100 == 0) + value = 0; + validAudioData[nCh].push_back(sin(value)); + value += step; + } + } + } + void checkAudio() + { + getValidAudioData(); + + ASSERT_EQ(expectedNumAudioCh, (int)audioData.size()); + for (unsigned int nCh = 0; nCh < audioData.size(); nCh++) + { + ASSERT_EQ(numberOfSamples, audioData[nCh].size()) << "nCh=" << nCh; + for (unsigned int i = 0; i < numberOfSamples; i++) + { + EXPECT_NEAR(validAudioData[nCh][i], audioData[nCh][i], epsilon) << "sample index=" << i << " nCh=" << nCh; + } + } + } +protected: + int 
expectedNumAudioCh; + unsigned int numberOfSamples; + double epsilon; + VideoCaptureAPIs backend; + std::string root; + std::string fileName; + + std::vector> validAudioData; + std::vector> audioData; + std::vector params; + + Mat audioFrame; + VideoCapture cap; +}; + +class AudioTestFixture : public AudioBaseTest, public testing::TestWithParam +{ +public: + AudioTestFixture() + { + fileName = get<0>(GetParam()); + expectedNumAudioCh = get<1>(GetParam()); + numberOfSamples = get<2>(GetParam()); + epsilon = get<3>(GetParam()); + backend = get<4>(GetParam()); + root = "audio/"; + params = { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + } + + void doTest() + { + ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params)); + const int audioBaseIndex = static_cast(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + ASSERT_EQ(expectedNumAudioCh, numberOfChannels); + double f = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex)); + ASSERT_EQ(CV_16SC1, audioFrame.type()) << audioData[nCh].size(); + for (int i = 0; i < audioFrame.cols; i++) + { + f = ((double) audioFrame.at(0,i)) / (double) 32768; + audioData[nCh].push_back(f); + } + } + } + else { break; } + } + ASSERT_FALSE(audioData.empty()); + + checkAudio(); + } +}; + +const param audioParams[] = +{ + param("test_audio.wav", 1, 132300, 0.0001, cv::CAP_MSMF), + param("test_mono_audio.mp3", 1, 133104, 0.12, cv::CAP_MSMF), + param("test_stereo_audio.mp3", 2, 133104, 0.12, cv::CAP_MSMF), + param("test_audio.mp4", 1, 133104, 0.15, cv::CAP_MSMF) +}; + +class Audio : public AudioTestFixture{}; + +TEST_P(Audio, audio) +{ + if (!videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend))) + throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " 
backend was not found"); + + doTest(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Audio, testing::ValuesIn(audioParams)); + +class MediaTestFixture : public AudioBaseTest, public testing::TestWithParam +{ +public: + MediaTestFixture(): + videoType(get<3>(GetParam())), + height(get<4>(GetParam())), + width(get<5>(GetParam())), + numberOfFrames(get<6>(GetParam())), + fps(get<8>(GetParam())), + psnrThreshold(get<9>(GetParam())) + { + fileName = get<0>(GetParam()); + expectedNumAudioCh = get<1>(GetParam()); + numberOfSamples = get<7>(GetParam()); + epsilon = get<2>(GetParam()); + backend = get<10>(GetParam()); + root = "audio/"; + params = { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, 0, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + }; + + void doTest() + { + ASSERT_TRUE(cap.open(findDataFile(root + fileName), backend, params)); + + const int audioBaseIndex = static_cast(cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX)); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + ASSERT_EQ(expectedNumAudioCh, numberOfChannels); + + const int samplePerSecond = (int)cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND); + ASSERT_EQ(44100, samplePerSecond); + int samplesPerFrame = (int)(1./fps*samplePerSecond); + int audioSamplesTolerance = samplesPerFrame / 2; + + double audio0_timestamp = 0; + + Mat videoFrame; + Mat img(height, width, videoType); + audioData.resize(numberOfChannels); + for (int frame = 0; frame < numberOfFrames; frame++) + { + SCOPED_TRACE(cv::format("frame=%d", frame)); + + ASSERT_TRUE(cap.grab()); + + if (frame == 0) + { + double audio_shift = cap.get(CAP_PROP_AUDIO_SHIFT_NSEC); + double video0_timestamp = cap.get(CAP_PROP_POS_MSEC) * 1e-3; + audio0_timestamp = video0_timestamp + audio_shift * 1e-9; + std::cout << "video0 timestamp: " << video0_timestamp << " audio0 timestamp: " << audio0_timestamp << " (audio shift nanoseconds: " << audio_shift << " , seconds: " << audio_shift * 1e-9 << ")" << std::endl; + } + + ASSERT_TRUE(cap.retrieve(videoFrame)); + if 
(epsilon >= 0) + { + generateFrame(frame, numberOfFrames, img); + ASSERT_EQ(img.size, videoFrame.size); + double psnr = cvtest::PSNR(img, videoFrame); + EXPECT_GE(psnr, psnrThreshold); + } + + int audioFrameCols = 0; + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + ASSERT_TRUE(cap.retrieve(audioFrame, audioBaseIndex+nCh)); + if (audioFrame.empty()) + continue; + ASSERT_EQ(CV_16SC1, audioFrame.type()); + if (nCh == 0) + audioFrameCols = audioFrame.cols; + else + ASSERT_EQ(audioFrameCols, audioFrame.cols) << "channel "<< nCh; + for (int i = 0; i < audioFrame.cols; i++) + { + double f = audioFrame.at(0,i) / 32768.0; + audioData[nCh].push_back(f); + } + } + + if (frame < 5 || frame >= numberOfFrames-5) + std::cout << "frame=" << frame << ": audioFrameSize=" << audioFrameCols << " videoTimestamp=" << cap.get(CAP_PROP_POS_MSEC) << " ms" << std::endl; + else if (frame == 6) + std::cout << "frame..." << std::endl; + + if (audioFrameCols == 0) + continue; + if (frame != 0 && frame != numberOfFrames-1) + { + // validate audio position + EXPECT_NEAR( + cap.get(CAP_PROP_AUDIO_POS) / samplePerSecond + audio0_timestamp, + cap.get(CAP_PROP_POS_MSEC) * 1e-3, + (1.0 / fps) * 0.3) + << "CAP_PROP_AUDIO_POS=" << cap.get(CAP_PROP_AUDIO_POS) << " CAP_PROP_POS_MSEC=" << cap.get(CAP_PROP_POS_MSEC); + } + if (frame != 0 && frame != numberOfFrames-1 && audioData[0].size() != (size_t)numberOfSamples) + { + // validate audio frame size + EXPECT_NEAR(audioFrame.cols, samplesPerFrame, audioSamplesTolerance); + } + } + ASSERT_FALSE(cap.grab()); + ASSERT_FALSE(audioData.empty()); + + std::cout << "Total audio samples=" << audioData[0].size() << std::endl; + + if (epsilon >= 0) + checkAudio(); + } +protected: + const int videoType; + const int height; + const int width; + const int numberOfFrames; + const int fps; + const double psnrThreshold; +}; + +const paramCombination mediaParams[] = +{ + paramCombination("test_audio.mp4", 1, 0.15, CV_8UC3, 240, 320, 90, 131819, 30, 30., cv::CAP_MSMF) 
+#if 0 + // https://filesamples.com/samples/video/mp4/sample_960x400_ocean_with_audio.mp4 + , paramCombination("sample_960x400_ocean_with_audio.mp4", 2, -1/*eplsilon*/, CV_8UC3, 400, 960, 1116, 2056588, 30, 30., cv::CAP_MSMF) +#endif +}; + +class Media : public MediaTestFixture{}; + +TEST_P(Media, audio) +{ + if (!videoio_registry::hasBackend(cv::VideoCaptureAPIs(backend))) + throw SkipTestException(cv::videoio_registry::getBackendName(backend) + " backend was not found"); + + doTest(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Media, testing::ValuesIn(mediaParams)); + +}} //namespace diff --git a/modules/videoio/test/test_microphone.cpp b/modules/videoio/test/test_microphone.cpp new file mode 100644 index 0000000000..c82a7c4eda --- /dev/null +++ b/modules/videoio/test/test_microphone.cpp @@ -0,0 +1,41 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
+// Usage: opencv_test_videoio --gtest_also_run_disabled_tests + +#include "test_precomp.hpp" + +namespace opencv_test { namespace { + +TEST(DISABLED_videoio_micro, basic) +{ + int cursize = 0; + int validSize = 0; + Mat frame; + + std::vector params { CAP_PROP_AUDIO_STREAM, 0, CAP_PROP_VIDEO_STREAM, -1 }; + VideoCapture cap(0, cv::CAP_MSMF, params); + ASSERT_TRUE(cap.isOpened()); + + int samplesPerSecond = (int)cap.get(cv::CAP_PROP_AUDIO_SAMPLES_PER_SECOND); + const int audio_base_index = (int)cap.get(cv::CAP_PROP_AUDIO_BASE_INDEX); + + const double cvTickFreq = cv::getTickFrequency(); + int64 sysTimePrev = cv::getTickCount(); + int64 sysTimeCurr = cv::getTickCount(); + + cout << "Audio would be captured for the next 10 seconds" << endl; + while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10) + { + if (cap.grab()) + { + ASSERT_TRUE(cap.retrieve(frame, audio_base_index)); + sysTimeCurr = cv::getTickCount(); + } + } + validSize = samplesPerSecond*(int)((sysTimeCurr-sysTimePrev)/cvTickFreq); + cursize = (int)cap.get(cv::CAP_PROP_AUDIO_POS); + ASSERT_LT(validSize - cursize, cursize*0.05); +} + +}} // namespace diff --git a/samples/cpp/videocapture_audio.cpp b/samples/cpp/videocapture_audio.cpp new file mode 100644 index 0000000000..c9f1ec94ce --- /dev/null +++ b/samples/cpp/videocapture_audio.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int argc, char** argv) +{ + CommandLineParser parser(argc, argv, "{@audio||}"); + string file = parser.get("@audio"); + + if (file.empty()) + { + return 1; + } + + Mat frame; + vector> audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open file: " + file << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + int numberOfSamples = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(frame, audioBaseIndex+nCh); + audioData[nCh].push_back(frame); + numberOfSamples+=frame.cols; + } + } + else { break; } + } + + cout << "Number of samples: " << numberOfSamples << endl; + + return 0; +} diff --git a/samples/cpp/videocapture_audio_combination.cpp b/samples/cpp/videocapture_audio_combination.cpp new file mode 100644 index 0000000000..7f0deecf16 --- /dev/null +++ b/samples/cpp/videocapture_audio_combination.cpp @@ -0,0 +1,69 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int argc, char** argv) +{ + cv::CommandLineParser parser(argc, argv, "{@audio||}"); + string file = parser.get("@audio"); + + if (file.empty()) + { + return 1; + } + + Mat videoFrame; + Mat audioFrame; + vector> audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, 0, + CAP_PROP_AUDIO_DATA_DEPTH, CV_16S }; + + cap.open(file, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open file: " + file << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + int numberOfSamples = 0; + int numberOfFrames = 0; + audioData.resize(numberOfChannels); + for (;;) + { + if (cap.grab()) + { + cap.retrieve(videoFrame); + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(audioFrame, audioBaseIndex+nCh); + if (!audioFrame.empty()) + audioData[nCh].push_back(audioFrame); + numberOfSamples+=audioFrame.cols; + } + if (!videoFrame.empty()) + { + numberOfFrames++; + imshow("Live", videoFrame); + if (waitKey(5) >= 0) + break; + } + } else { break; } + } + + cout << "Number of audio samples: " << numberOfSamples << endl + << "Number of video frames: " << numberOfFrames << endl; + return 0; +} diff --git a/samples/cpp/videocapture_microphone.cpp b/samples/cpp/videocapture_microphone.cpp new file mode 100644 index 0000000000..0c69ec929d --- /dev/null +++ b/samples/cpp/videocapture_microphone.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +using namespace cv; +using namespace std; + +int main(int, char**) +{ + Mat frame; + vector audioData; + VideoCapture cap; + vector params { CAP_PROP_AUDIO_STREAM, 0, + CAP_PROP_VIDEO_STREAM, -1 }; + + cap.open(0, CAP_MSMF, params); + if (!cap.isOpened()) + { + cerr << "ERROR! 
Can't to open microphone" << endl; + return -1; + } + + const int audioBaseIndex = (int)cap.get(CAP_PROP_AUDIO_BASE_INDEX); + const int numberOfChannels = (int)cap.get(CAP_PROP_AUDIO_TOTAL_CHANNELS); + cout << "CAP_PROP_AUDIO_DATA_DEPTH: " << depthToString((int)cap.get(CAP_PROP_AUDIO_DATA_DEPTH)) << endl; + cout << "CAP_PROP_AUDIO_SAMPLES_PER_SECOND: " << cap.get(CAP_PROP_AUDIO_SAMPLES_PER_SECOND) << endl; + cout << "CAP_PROP_AUDIO_TOTAL_CHANNELS: " << numberOfChannels << endl; + cout << "CAP_PROP_AUDIO_TOTAL_STREAMS: " << cap.get(CAP_PROP_AUDIO_TOTAL_STREAMS) << endl; + + const double cvTickFreq = getTickFrequency(); + int64 sysTimeCurr = getTickCount(); + int64 sysTimePrev = sysTimeCurr; + while ((sysTimeCurr-sysTimePrev)/cvTickFreq < 10) + { + if (cap.grab()) + { + for (int nCh = 0; nCh < numberOfChannels; nCh++) + { + cap.retrieve(frame, audioBaseIndex+nCh); + audioData.push_back(frame); + sysTimeCurr = getTickCount(); + } + } + else + { + cerr << "Grab error" << endl; + break; + } + } + int numberOfSamles = 0; + for (auto item : audioData) + numberOfSamles+=item.cols; + cout << "Number of samples: " << numberOfSamles << endl; + + return 0; +}