Downsample from 48000 to 44100 for Bluetooth Windows Media Foundation
I've got a video/audio playback app working on Windows 10. The output wave supported by IsFormatSupported is almost always:
Sample Rate: 48000
Bits Per Sample: 32
Subformat: KSDATAFORMAT_SUBTYPE_PCM
The playback is working robustly. We have a request to add in Bluetooth playback functionality and the audio is terrible (crackling sound). The device is created as per usual but the difference between normal playback and Bluetooth is that it is being downsampled to 44100. The sample rate is what Windows returns to me calling the IsFormatSupported method (returned as a closest format pointer).
Is there some trick I am missing? On a more performant PC, the Bluetooth will play smoothly for a few seconds and then start stuttering, on less performant it will stutter immediately. Is the resampler incredibly slow or is there something I am missing?
As mentioned above, for normal speaker playback the playback is robust and there is no crackling whatsoever.
If anyone has any idea on this, it'll be much appreciated as I'm pulling my hair out!
Thanks in advance, Peter
Edit:
It's a bit difficult to show a lot of code, because it's all proprietary. I'll add what I can get away with.
Audio renderer setup:
// --- Audio renderer setup (WASAPI shared mode + Media Foundation resampler) ---
// Gets the engine mix format, overrides sample rate / bit depth, negotiates
// via IsFormatSupported, initialises the client in event-driven mode, then
// creates and configures the CLSID_AudioResamplerMediaObject MFT.
WAVEFORMATEX* mWaveformat;
// NOTE(review): GetMixFormat allocates mWaveformat with CoTaskMemAlloc; no
// CoTaskMemFree is visible in this snippet -- confirm it is released somewhere.
hResult = mAudioClient->GetMixFormat(&mWaveformat);
if (S_OK != hResult) {
throw std::exception("Failed to get mix format...");
}
// Force the desired bit depth and 48 kHz onto the mix format before negotiation.
mWaveformat->wBitsPerSample = mBitsPerSample;
mSamplesPerSecond = 48000;
mWaveformat->nSamplesPerSec = mSamplesPerSecond;
DWORD frameSize = mWaveformat->nChannels * (mWaveformat->wBitsPerSample / 8);
mWaveformat->nBlockAlign = frameSize;
mWaveformat->nAvgBytesPerSec = mWaveformat->nSamplesPerSec * frameSize;
// NOTE(review): mFrameSize is captured here, BEFORE IsFormatSupported may
// substitute a closest-fit format below, and is never recomputed after the
// closest fit is adopted. If the closest fit changes channels or bit depth,
// the byte accounting in the render callback (mFrameSize * frames) is wrong
// -- a likely source of the Bluetooth crackling described above.
mFrameSize = static_cast<uint32_t>(frameSize);
mOutputChannelCount = mWaveformat->nChannels;
// Map the channel count to a speaker-position mask for WAVEFORMATEXTENSIBLE.
DWORD channelMask = 0;
switch (mOutputChannelCount) {
case 1:
channelMask = (DWORD)ChannelMask::Mono;
break;
case 2:
channelMask = (DWORD)ChannelMask::Stereo;
break;
case 6:
channelMask = (DWORD)ChannelMask::Surround_5;
break;
case 8:
channelMask = (DWORD)ChannelMask::Surround_7;
break;
default:
LogError("Unsupported channel format " << mWaveformat->nChannels);
// NOTE(review): a bare `throw;` with no exception in flight calls
// std::terminate -- throw a real exception object instead.
throw;
}
// NOTE(review): waveFormatExtensible is never completed (wFormatTag is not
// set to WAVE_FORMAT_EXTENSIBLE, Format.cbSize is not set, and dwChannelMask
// is never assigned from channelMask computed above), and it is not the
// struct passed to IsFormatSupported below -- the plain mWaveformat is.
// Either finish and use the extensible struct or remove it.
WAVEFORMATEXTENSIBLE waveFormatExtensible{};
waveFormatExtensible.Format = *mWaveformat;
waveFormatExtensible.SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
waveFormatExtensible.Samples.wValidBitsPerSample = mWaveformat->wBitsPerSample;
WAVEFORMATEX* pClosestFit;
WAVEFORMATEXTENSIBLE* closest; // NOTE(review): unused -- remove.
hResult = mAudioClient->IsFormatSupported(AUDCLNT_SHAREMODE_SHARED, (WAVEFORMATEX*)mWaveformat, &pClosestFit);
if (S_OK != hResult) {
if (S_FALSE == hResult) {
// S_FALSE: the shared-mode engine offered a closest match (e.g. the
// 44.1 kHz Bluetooth path on Windows 10 described above).
if (pClosestFit->wBitsPerSample != mWaveformat->wBitsPerSample ||
pClosestFit->nChannels != mWaveformat->nChannels ||
pClosestFit->nSamplesPerSec != mWaveformat->nSamplesPerSec)
{
LogInfo("Closest fit format provided");
}
// Adopt the closest-fit format wholesale.
// NOTE(review): the original GetMixFormat allocation is dropped here
// with no visible free, and mFrameSize / nBlockAlign are NOT
// recomputed from the adopted format (see note above).
mWaveformat = pClosestFit;
mBitsPerSample = mWaveformat->wBitsPerSample;
mOutputChannelCount = mWaveformat->nChannels;
mSamplesPerSecond = mWaveformat->nSamplesPerSec;
// Samples produced per video frame at the (possibly new) output rate.
mSamplesPerVideoFrame = std::ceil(static_cast<float>(mSamplesPerSecond) / static_cast<float>(mVideoFramePerSecond));
if (mSamplesPerVideoFrame == 0) {
mSamplesPerVideoFrame = 2000; // fallback when the division yields zero
}
}
else {
LogError("Wave format is not supported.");
// NOTE(review): bare `throw;` here also terminates (see note above).
throw;
}
}
// Reference time is in units of 100 nano seconds.
// 10000 == 1ms
REFERENCE_TIME defaultPeriod, minPeriod;
hResult = mAudioClient->GetDevicePeriod(&defaultPeriod, &minPeriod);
assert(hResult == S_OK);
// Event-driven shared-mode stream with a buffer of 4x the minimum period.
// NOTE(review): Bluetooth A2DP sinks typically need far more buffering than
// wired endpoints -- consider defaultPeriod-based or larger sizing when the
// endpoint is Bluetooth; a too-small buffer shows up exactly as the
// stuttering described above.
hResult = mAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_EVENTCALLBACK | AUDCLNT_STREAMFLAGS_NOPERSIST, minPeriod * 4, 0, mWaveformat, NULL);
if (S_OK != hResult)
throw std::exception("Failed to initialise audio client...");
}
// GetBufferSize is a misnomer, it's the number of frames the audio card is expecting.
// frameSize = nChannels * (bitsPerSample / 8)
// totalFrameSize = frameSize * bufferSize;
hResult = mAudioClient->GetBufferSize(&mBufferSizeInFrames);
if (S_OK != hResult) {
throw std::exception("Failed to get buffer size from audio client");
}
// Get the render client
hResult = mAudioClient->GetService(__uuidof(IAudioRenderClient), (void**)&mAudioRenderClient);
if (S_OK != hResult) {
throw std::exception("Failed to get render client from audio client");
}
// Create Async callback for sample events
hResult = MFCreateAsyncResult(nullptr, &mSampleRequestCallback, nullptr, &mSampleRequestAsyncResult);
if (hResult != S_OK) {
throw std::exception("Failed to register sample ready callback.");
}
mSampleRequestEvent = CreateEventEx(nullptr, nullptr, 0, EVENT_ALL_ACCESS);
// NOTE(review): per the Win32 docs CreateEventEx returns NULL on failure,
// not INVALID_HANDLE_VALUE -- this check can never fire.
if (mSampleRequestEvent == INVALID_HANDLE_VALUE) {
throw std::exception("Failed to create event handle");
}
hResult = mAudioClient->SetEventHandle(mSampleRequestEvent);
if (hResult != S_OK) {
throw std::exception("Failed to set the eevent handle."); // (message typo: "eevent")
}
// Set up the resampler
ComPtr<IUnknown> pTransformUnk = nullptr;
hResult = CoCreateInstance(CLSID_AudioResamplerMediaObject, nullptr, CLSCTX_INPROC_SERVER, IID_IUnknown, &pTransformUnk);
if (S_OK != hResult) {
throw std::exception("Creating resampler failed.");
}
hResult = pTransformUnk->QueryInterface(IID_PPV_ARGS(&mResampler));
if (S_OK != hResult) {
throw std::exception("Creating resampler failed.");
}
// Resampler input: interleaved 32-bit float at the app's internal rate/layout.
ComPtr<IMFMediaType> pResamplerInputType;
hResult = MFCreateMediaType(&pResamplerInputType);
if (S_OK != hResult) {
throw std::exception("Creating empty media type failed.");
}
pResamplerInputType->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
pResamplerInputType->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_Float);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, mInternalChannelCount);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, 32);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, mInternalSampleRate);
channelMask = getChannelMask(mInternalChannelCount);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_CHANNEL_MASK, channelMask);
frameSize = mInternalChannelCount * sizeof(float);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, frameSize);
uint32_t bytesPerSecond = frameSize * static_cast<uint32_t>(mInternalSampleRate);
pResamplerInputType->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, bytesPerSecond);
hResult = mResampler->SetInputType(0, pResamplerInputType.Get(), 0);
if (S_OK != hResult) {
throw std::exception("Setting resampler input failed.");
}
// Resampler output: the negotiated device rate and channel count.
// NOTE(review): the subtype is MFAudioFormat_Float while the question says
// the device reported KSDATAFORMAT_SUBTYPE_PCM -- confirm the endpoint
// actually consumes float; feeding float bytes to a PCM endpoint (or vice
// versa) produces exactly the crackling described above.
ComPtr<IMFMediaType> pResamplerOutput = nullptr;
if (S_OK != MFCreateMediaType(&pResamplerOutput)) {
throw std::exception("Creating resampler output failed.");
}
pResamplerOutput->SetGUID(MF_MT_MAJOR_TYPE, MFMediaType_Audio);
pResamplerOutput->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_Float);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, mBitsPerSample);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, mOutputChannelCount);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, mSamplesPerSecond);
channelMask = getChannelMask(mOutputChannelCount);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_CHANNEL_MASK, channelMask);
frameSize = mOutputChannelCount * sizeof(float);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, frameSize);
pResamplerOutput->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, frameSize * mSamplesPerSecond);
if (S_OK != mResampler->SetOutputType(0, pResamplerOutput.Get(), 0)) {
throw std::exception("Setting resampler output failed.");
}
// Notify the resampler.
mResampler->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, NULL);
mResampler->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, NULL);
mResampler->ProcessMessage(MFT_MESSAGE_NOTIFY_START_OF_STREAM, NULL);
Resample Audio:
// --- Resample one video-frame's worth of audio ---
// Interleaves the planar float source into an IMFMediaBuffer, pushes it
// through the resampler MFT, then drains every output sample into the ring
// buffer.
uint32_t bufferSize = mImpl->mInternalChannelCount * mImpl->mSamplesPerVideoFrame * sizeof(float);
ComPtr<IMFMediaBuffer> pSampleBuffer;
if (S_OK != MFCreateMemoryBuffer(bufferSize, &pSampleBuffer)) {
LogError("Couldn't create a buffer to push into resampler...");
return;
}
uint8_t* pRawBuffer;
DWORD maxSize, allocatedSize;
pSampleBuffer->Lock(&pRawBuffer, &maxSize, &allocatedSize);
// We need to make it interleaved again after processing
// (channel-major planar source -> frame-major interleaved destination).
uint32_t offset = 0;
for (int channel = 0; channel < mImpl->mInternalChannelCount; ++channel) {
uint32_t* pSampleData = (uint32_t*)audioSample->getData()[channel];
for (int i = 0; i < mImpl->mSamplesPerVideoFrame; ++i) {
offset = channel + (i * mImpl->mInternalChannelCount);
// 4-byte float copied as raw bits; uint32_t* is used purely for pointer math.
memcpy((uint32_t*)pRawBuffer + offset, pSampleData + i, sizeof(float));
}
}
pSampleBuffer->Unlock();
pSampleBuffer->SetCurrentLength(bufferSize);
ComPtr<IMFSample> pInputSample;
if (S_OK != MFCreateSample(&pInputSample)) {
LogError("Couldn't create a sample to hold the buffer...");
return;
}
// NOTE(review): no sample time/duration is set on the input sample --
// timestamps would help diagnose rate drift between 48000 and 44100.
pInputSample->AddBuffer(pSampleBuffer.Get());
HRESULT hResult = mImpl->mResampler->ProcessInput(0, pInputSample.Get(), 0);
if (S_OK != hResult) {
// Flush and try again.
// NOTE(review): MFT_MESSAGE_COMMAND_FLUSH throws away any audio still
// queued inside the MFT, so this path audibly DROPS samples (a click /
// crackle). MF_E_NOTACCEPTING means pending output must be drained --
// call ProcessOutput until empty instead of flushing.
if (MF_E_NOTACCEPTING == hResult) {
mImpl->mResampler->ProcessMessage(MFT_MESSAGE_COMMAND_FLUSH, NULL);
hResult = mImpl->mResampler->ProcessInput(0, pInputSample.Get(), 0);
}
if (S_OK != hResult) {
LogError("Failed to input into resampler...");
return;
}
}
// Drain every output sample the MFT can produce for this input.
MFT_OUTPUT_DATA_BUFFER outputDataBuffer{};
DWORD dwStatus;
DWORD maxLength = 0, allocatedLength = 0;
do {
ComPtr<IMFSample> pOutputSample;
hResult = MFCreateSample(&pOutputSample);
ComPtr<IMFMediaBuffer> pOutputBuffer = NULL;
// NOTE(review): the output buffer is sized with the INPUT bufferSize; fine
// while the output rate is <= the input rate (48k -> 44.1k), fragile if
// that relationship ever flips.
hResult = MFCreateMemoryBuffer(bufferSize, &pOutputBuffer);
hResult = pOutputSample->AddBuffer(pOutputBuffer.Get());
outputDataBuffer.pSample = pOutputSample.Get();
hResult = mImpl->mResampler->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (hResult == MF_E_TRANSFORM_NEED_MORE_INPUT) {
//LogInfo("End of sample processing");
break;
}
// NOTE(review): assigning the raw pSample back into the ComPtr AddRefs a
// pointer this scope already owns (refcount leak per iteration); also
// outputDataBuffer.pEvents, if set by the MFT, is never released.
pOutputSample = outputDataBuffer.pSample;
pOutputSample->GetBufferByIndex(0, &pOutputBuffer);
uint8_t* pRawTransformBuffer = nullptr;
// Lock's out-params are (ptr, max length, current length); the variable
// named "allocatedLength" is actually the valid-data length used below.
pOutputBuffer->Lock(&pRawTransformBuffer, &maxLength, &allocatedLength);
mImpl->writeToRingBuffer((uint8_t*)pRawTransformBuffer, allocatedLength);
pOutputBuffer->Unlock();
} while (true);
Write to the ring buffer:
// --- Producer side: copy `size` bytes of resampled audio into the ring buffer ---
int32_t remaining = size;
while (remaining > 0) {
std::unique_lock<std::mutex> lck(mAudioMutex);
size_t reserved = 0;
// NOTE(review): reserve() is asked for the ORIGINAL `size` on every pass,
// not the shrinking `remaining` -- if reserve() grants what was requested,
// later passes over-request and over-copy; confirm reserve() clamps, or
// pass `remaining` here.
void* pRingBuffer = mRingBuffer.reserve(size, reserved);
//LogInfo("Size::" << size << " reserved::" << reserved << " committed::" << mRingBuffer.getCommittedSize());
if (nullptr != pRingBuffer && reserved != 0) {
memcpy(pRingBuffer, pData, reserved);
mRingBuffer.commit(reserved);
remaining -= reserved;
pData += reserved;
}
else {
// NOTE(review): a full ring buffer is ordinary flow control when the
// consumer (the audio device -- e.g. a slow Bluetooth sink) drains
// more slowly than the producer fills. abort() kills the process;
// block on a condition variable (or drop-and-log) instead.
LogError("Audio buffer full::" << mRingBuffer.getCommittedSize());
abort();
}
}
Render Event from sound card:
// --- Consumer side: WASAPI event callback, fills the device buffer from the ring ---
if (mShouldRender) {
HRESULT hr = S_OK;
uint32_t writeableFrames = 0, padding = 0;
// How much unread data do we have?
mAudioClient->GetCurrentPadding(&padding);
writeableFrames = mBufferSizeInFrames - padding;
uint8_t* pRenderBuffer;
hr = mAudioRenderClient->GetBuffer(writeableFrames, &pRenderBuffer);
if (S_OK != hr) {
// NOTE(review): if the inner mShouldRender re-check is false we fall
// through and use pRenderBuffer even though GetBuffer failed -- return
// (or at least skip the copy) unconditionally on failure.
if (mShouldRender) {
LogError("Failed to get device audio buffer...");
return S_FALSE;
}
}
// WASAPI returns the number of free frames, but we need the byte equivalent.
// NOTE(review): mFrameSize was computed from the format BEFORE the
// IsFormatSupported closest-fit substitution (see setup snippet), while the
// ring buffer holds the resampler's float output -- any mismatch here
// mis-slices frames and produces crackle on the Bluetooth (closest-fit) path.
uint32_t audioBufferSize = mFrameSize * writeableFrames;
{
//LogInfo("audioConsumer::" << mRingBuffer.getCommittedSize());
std::unique_lock<std::mutex> lck(mAudioMutex);
size_t bufferDataBlockSize = static_cast<size_t>(audioBufferSize);
void* pDataBlock = mRingBuffer.getContiguousBlock(bufferDataBlockSize);
if (nullptr != pDataBlock) {
if (bufferDataBlockSize >= audioBufferSize) {
// One contiguous block covers the whole request: copy and release.
memcpy(pRenderBuffer, pDataBlock, audioBufferSize);
mRingBuffer.decommitBlock(audioBufferSize);
mBytesRendered += audioBufferSize;
hr = mAudioRenderClient->ReleaseBuffer(writeableFrames, 0);
//LogInfo("audio output::" << audioBufferSize);
if (S_OK != hr) {
return 0;
}
}
else {
// Wrap-around: copy the tail block, then the head block.
audioBufferSize -= bufferDataBlockSize;
memcpy(pRenderBuffer, pDataBlock, bufferDataBlockSize);
mRingBuffer.decommitBlock(bufferDataBlockSize);
mBytesRendered += bufferDataBlockSize;
//LogInfo("audio output::" << bufferDataBlockSize);
size_t blockSize2 = audioBufferSize;
pDataBlock = mRingBuffer.getContiguousBlock(blockSize2);
if (blockSize2 >= audioBufferSize) {
memcpy(pRenderBuffer + bufferDataBlockSize, pDataBlock, audioBufferSize);
mRingBuffer.decommitBlock(audioBufferSize);
mBytesRendered += audioBufferSize;
hr = mAudioRenderClient->ReleaseBuffer(writeableFrames, 0);
//LogInfo("audio output::" << audioBufferSize);
}
else {
// Not enough data for the second half: release zero frames.
// NOTE(review): the first half was already copied AND decommitted,
// so those bytes are silently lost here -- an audible glitch.
// Either decommit only on full success, or pad the remainder with
// silence and release the full frame count.
mAudioRenderClient->ReleaseBuffer(0, 0);
}
}
}
else {
// Ring buffer empty (underrun): release zero frames.
// NOTE(review): consider ReleaseBuffer(writeableFrames,
// AUDCLNT_BUFFERFLAGS_SILENT) so underruns render silence -- repeated
// underruns like this are the stutter heard on slower machines.
mAudioRenderClient->ReleaseBuffer(0, 0);
if (mRingBuffer.getCommittedSize() == 0 && mRenderedFrames > 0) {
//LogError("We're getting nothing from the buffer");
if (nullptr != mFinishedCallback) {
mFinishedCallback();
}
}
}
}
// Re-arm the Media Foundation work-queue wait for the next buffer-ready event.
MFPutWaitingWorkItem(mSampleRequestEvent, 0, mSampleRequestAsyncResult.Get(), &mEventKey);
Hopefully this will help. Additional point of note is that on Windows 11 the output sample rate is 48000 when bluetooth is connected. It's only on Windows 10 when the downsample happens.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
