459 lines
17 KiB
C++
459 lines
17 KiB
C++
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
/* vim:set ts=2 sw=2 sts=2 et cindent: */
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
#include "FFmpegAudioEncoder.h"
|
|
|
|
#include "FFmpegRuntimeLinker.h"
|
|
#include "FFmpegLog.h"
|
|
#include "FFmpegUtils.h"
|
|
#include "MediaData.h"
|
|
|
|
#include "AudioSegment.h"
|
|
|
|
namespace mozilla {
|
|
|
|
// Constructs an audio encoder. Every argument is forwarded unchanged to the
// FFmpegDataEncoder base class; this constructor performs no audio-specific
// work of its own.
FFmpegAudioEncoder<LIBAV_VER>::FFmpegAudioEncoder(
    const FFmpegLibWrapper* aLib, AVCodecID aCodecID,
    const RefPtr<TaskQueue>& aTaskQueue, const EncoderConfig& aConfig)
    : FFmpegDataEncoder(aLib, aCodecID, aTaskQueue, aConfig) {
  // Intentionally empty: all state is initialized by the base class.
}
|
|
|
|
// Returns a human-readable name for this encoder.
//
// - With USING_MOZFFVPX defined, a fixed "ffvpx" name is returned.
// - Otherwise the name embeds the library reported by the runtime linker when
//   MOZ_FFMPEG is defined, or a note that ffmpeg was disabled at build time.
nsCString FFmpegAudioEncoder<LIBAV_VER>::GetDescriptionName() const {
#ifdef USING_MOZFFVPX
  return "ffvpx audio encoder"_ns;
#else
#  if defined(MOZ_FFMPEG)
  const char* libraryName = FFmpegRuntimeLinker::LinkStatusLibraryName();
#  else
  const char* libraryName = "no library: ffmpeg disabled during build";
#  endif
  return nsPrintfCString("ffmpeg audio encoder (%s)", libraryName);
#endif
}
|
|
|
|
// Call operator of the ResamplerDestroy functor: hands aResampler back to the
// Speex resampler API for destruction.
void FFmpegAudioEncoder<LIBAV_VER>::ResamplerDestroy::operator()(
    SpeexResamplerState* aResampler) {
  speex_resampler_destroy(aResampler);
}
|
|
|
|
// Performs the audio-specific part of encoder initialization.
//
// In order, this:
//  1. runs the shared FFmpegDataEncoder initialization to obtain the codec;
//  2. picks an encoding sample rate the codec supports, records the rate the
//     instance was configured with in mInputSampleRate, and creates a Speex
//     resampler when the two differ;
//  3. sets the sample rate, channel layout/count and sample format on the
//     codec context;
//  4. applies the Opus-specific options (bitrate mode, complexity, frame
//     duration, packet loss, in-band FEC, DTX);
//  5. sets the time base and finishes the shared initialization.
//
// Must be called on mTaskQueue.
//
// Returns NS_OK on success. Returns NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR when
// the common initialization fails or the configured codec is not handled
// here, NS_ERROR_FAILURE when the resampler or an Opus option cannot be set
// up, and otherwise the error code reported by FinishInitCommon().
nsresult FFmpegAudioEncoder<LIBAV_VER>::InitSpecific() {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());

  FFMPEG_LOG("FFmpegAudioEncoder::InitSpecific");

  // Initialize the common members of the encoder instance
  AVCodec* codec = FFmpegDataEncoder<LIBAV_VER>::InitCommon();
  if (!codec) {
    FFMPEG_LOG("FFmpegDataEncoder::InitCommon failed");
    return NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR;
  }

  // Find a compatible input rate for the codec, update the encoder config, and
  // note the rate at which this instance was configured.
  mInputSampleRate = AssertedCast<int>(mConfig.mSampleRate);
  if (codec->supported_samplerates) {
    // Sort the codec's sample-rate list, then walk it in ascending order:
    // stop at the first rate that is equal to or above the input rate (so no
    // information is lost). If the input rate is above every supported rate,
    // the loop runs to completion and leaves the highest supported rate
    // selected, i.e. the audio is down-sampled. This is the case when encoding
    // high sample-rate audio to opus.
    AutoTArray<int, 16> supportedSampleRates;
    IterateZeroTerminated(codec->supported_samplerates,
                          [&supportedSampleRates](int aRate) {
                            supportedSampleRates.AppendElement(aRate);
                          });
    supportedSampleRates.Sort();

    for (const int rate : supportedSampleRates) {
      mConfig.mSampleRate = rate;
      if (mInputSampleRate <= rate) {
        break;
      }
    }
  }

  if (mConfig.mSampleRate != AssertedCast<uint32_t>(mInputSampleRate)) {
    // The codec runs at a different rate than the input: a resampler is
    // mandatory. Without one, every later input would be rejected by the
    // sample-rate check performed before encoding, so fail here instead of
    // reporting a successful initialization.
    int err = RESAMPLER_ERR_SUCCESS;
    SpeexResamplerState* resampler = speex_resampler_init(
        mConfig.mNumberOfChannels, mInputSampleRate, mConfig.mSampleRate,
        SPEEX_RESAMPLER_QUALITY_DEFAULT, &err);
    if (err != RESAMPLER_ERR_SUCCESS || !resampler) {
      FFMPEG_LOG(
          "Error creating resampler in FFmpegAudioEncoder %dHz -> %dHz (%dch)",
          mInputSampleRate, mConfig.mSampleRate, mConfig.mNumberOfChannels);
      return NS_ERROR_FAILURE;
    }
    mResampler.reset(resampler);
  }

  // And now the audio-specific part
  mCodecContext->sample_rate = AssertedCast<int>(mConfig.mSampleRate);

#if LIBAVCODEC_VERSION_MAJOR >= 60
  // Gecko's ordering intentionally matches ffmpeg's ordering
  mLib->av_channel_layout_default(&mCodecContext->ch_layout,
                                  AssertedCast<int>(mConfig.mNumberOfChannels));
#else
  mCodecContext->channels = AssertedCast<int>(mConfig.mNumberOfChannels);
#endif

  switch (mConfig.mCodec) {
    case CodecType::Opus:
      // When using libopus, ffmpeg supports interleaved float and s16 input.
      mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLT;
      break;
    case CodecType::Vorbis:
      // When using libvorbis, ffmpeg only supports planar f32 input.
      mCodecContext->sample_fmt = AV_SAMPLE_FMT_FLTP;
      break;
    default:
      MOZ_ASSERT_UNREACHABLE("Not supported");
      // In builds where the assertion is compiled out, do not continue with
      // an unset sample format.
      return NS_ERROR_DOM_MEDIA_NOT_SUPPORTED_ERR;
  }

  if (mConfig.mCodec == CodecType::Opus) {
    // Default is VBR
    if (mConfig.mBitrateMode == BitrateMode::Constant) {
      mLib->av_opt_set(mCodecContext->priv_data, "vbr", "off", 0);
    }
    if (mConfig.mCodecSpecific.isSome()) {
      MOZ_ASSERT(mConfig.mCodecSpecific->is<OpusSpecific>());
      const OpusSpecific& specific = mConfig.mCodecSpecific->as<OpusSpecific>();

      // This attribute maps directly to complexity
      mCodecContext->compression_level = specific.mComplexity;
      FFMPEG_LOG("Opus complexity set to %d", specific.mComplexity);

      // NOTE(review): the division by 1000 suggests mFrameDuration is in
      // microseconds and the option expects milliseconds -- confirm.
      float frameDurationMs =
          AssertedCast<float>(specific.mFrameDuration) / 1000.f;
      if (mLib->av_opt_set_double(mCodecContext->priv_data, "frame_duration",
                                  frameDurationMs, 0)) {
        FFMPEG_LOG("Error setting the frame duration on Opus encoder");
        return NS_ERROR_FAILURE;
      }
      FFMPEG_LOG("Opus frame duration set to %0.2f", frameDurationMs);

      if (specific.mPacketLossPerc) {
        if (mLib->av_opt_set_int(
                mCodecContext->priv_data, "packet_loss",
                AssertedCast<int64_t>(specific.mPacketLossPerc), 0)) {
          FFMPEG_LOG("Error setting the packet loss percentage to %" PRIu64
                     " on Opus encoder",
                     specific.mPacketLossPerc);
          return NS_ERROR_FAILURE;
        }
        FFMPEG_LOGV("Packet loss set to %d%% in Opus encoder",
                    AssertedCast<int>(specific.mPacketLossPerc));
      }

      if (specific.mUseInBandFEC) {
        if (mLib->av_opt_set(mCodecContext->priv_data, "fec", "on", 0)) {
          // Only the "enable" path exists here, so the verb is fixed.
          FFMPEG_LOG("Error %s FEC on Opus encoder", "enabling");
          return NS_ERROR_FAILURE;
        }
        FFMPEG_LOGV("In-band FEC enabled for Opus encoder.");
      }

      if (specific.mUseDTX) {
        if (mLib->av_opt_set(mCodecContext->priv_data, "dtx", "on", 0)) {
          // Only the "enable" path exists here, so the verb is fixed.
          FFMPEG_LOG("Error %s DTX on Opus encoder", "enabling");
          return NS_ERROR_FAILURE;
        }
        // DTX packets are a TOC byte, and possibly one byte of length, packets
        // 3 bytes and larger are to be returned.
        mDtxThreshold = 3;
      }
      // TODO: format
      // https://bugzilla.mozilla.org/show_bug.cgi?id=1876066
    }
  }

  // Override the time base: always the sample-rate the encoder is running at
  mCodecContext->time_base =
      AVRational{.num = 1, .den = mCodecContext->sample_rate};

  MediaResult rv = FinishInitCommon(codec);
  if (NS_FAILED(rv)) {
    FFMPEG_LOG("FFmpeg encode initialization failure.");
    return rv.Code();
  }

  return NS_OK;
}
|
|
|
|
// avcodec_send_frame and avcodec_receive_packet were introduced in version 58.
|
|
#if LIBAVCODEC_VERSION_MAJOR >= 58
|
|
|
|
// Encodes a single packet of audio.
//
// aSamples: interleaved float samples for mConfig.mNumberOfChannels channels.
//           The frame count (samples per channel) must not exceed the codec
//           context's frame_size; shorter packets are allowed when draining.
// aPts:     presentation time of the packet; it is converted to ticks at
//           mConfig.mSampleRate and stored on the AVFrame.
//
// The samples are copied into mFrame (as-is for interleaved float codecs,
// deinterleaved for planar float codecs) and the frame is then submitted
// through the shared EncodeWithModernAPIs() path.
//
// Returns whatever EncodeWithModernAPIs() returns, or:
//  - NS_ERROR_OUT_OF_MEMORY if the frame or its buffers cannot be allocated,
//  - NS_ERROR_DOM_MEDIA_FATAL_ERR if the channel layout cannot be copied or
//    the frame cannot be made writable.
Result<MediaDataEncoder::EncodedData, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::EncodeOnePacket(Span<float> aSamples,
                                               media::TimeUnit aPts) {
  // Allocate AVFrame.
  if (!PrepareFrame()) {
    FFMPEG_LOG("failed to allocate frame");
    return Err(NS_ERROR_OUT_OF_MEMORY);
  }

  uint32_t frameCount = aSamples.Length() / mConfig.mNumberOfChannels;

  // This method assumes that the audio has been packetized appropriately --
  // packets smaller than the packet size are allowed when draining.
  MOZ_ASSERT(AssertedCast<int>(frameCount) <= mCodecContext->frame_size);

  ChannelCount(mFrame) = AssertedCast<int>(mConfig.mNumberOfChannels);

#  if LIBAVCODEC_VERSION_MAJOR >= 60
  int rv = mLib->av_channel_layout_copy(&mFrame->ch_layout,
                                        &mCodecContext->ch_layout);
  if (rv < 0) {
    FFMPEG_LOG("channel layout copy error: %s",
               MakeErrorString(mLib, rv).get());
    return Err(NS_ERROR_DOM_MEDIA_FATAL_ERR);
  }
#  endif

  mFrame->sample_rate = AssertedCast<int>(mConfig.mSampleRate);
  // Not a mistake, nb_samples is per channel in ffmpeg
  mFrame->nb_samples = AssertedCast<int>(frameCount);
  // Audio is converted below if needed
  mFrame->format = mCodecContext->sample_fmt;
  // Set presentation timestamp and duration of the AVFrame.
#  if LIBAVCODEC_VERSION_MAJOR >= 59
  mFrame->time_base =
      AVRational{.num = 1, .den = static_cast<int>(mConfig.mSampleRate)};
#  endif
  mFrame->pts = aPts.ToTicksAtRate(mConfig.mSampleRate);
#  if LIBAVCODEC_VERSION_MAJOR >= 60
  mFrame->duration = frameCount;
#  else
  mFrame->pkt_duration = frameCount;
  // Save duration in the time_base unit.
  mDurationMap.Insert(mFrame->pts, mFrame->pkt_duration);
#  endif

  if (int ret = mLib->av_frame_get_buffer(mFrame, 16); ret < 0) {
    FFMPEG_LOG("failed to allocate frame data: %s",
               MakeErrorString(mLib, ret).get());
    return Err(NS_ERROR_OUT_OF_MEMORY);
  }

  // Make sure AVFrame is writable.
  if (int ret = mLib->av_frame_make_writable(mFrame); ret < 0) {
    FFMPEG_LOG("failed to make frame writable: %s",
               MakeErrorString(mLib, ret).get());
    return Err(NS_ERROR_DOM_MEDIA_FATAL_ERR);
  }

  // The input is always in f32 interleaved for now
  if (mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLT) {
    PodCopy(reinterpret_cast<float*>(mFrame->data[0]), aSamples.data(),
            aSamples.Length());
  } else {
    MOZ_ASSERT(mCodecContext->sample_fmt == AV_SAMPLE_FMT_FLTP);
    // A single call is enough: the helper is given the channel count and the
    // whole array of planes, so repeating it once per channel with identical
    // arguments would only redo the same work.
    // NOTE(review): the destination element type is deduced from
    // mFrame->data; confirm the helper writes float samples into each plane.
    DeinterleaveAndConvertBuffer(aSamples.data(), mFrame->nb_samples,
                                 mConfig.mNumberOfChannels, mFrame->data);
  }

  // Now send the AVFrame to ffmpeg for encoding, same code for audio and video.
  return FFmpegDataEncoder<LIBAV_VER>::EncodeWithModernAPIs();
}
|
|
|
|
// Encodes one input MediaData (which must be an AudioData).
//
// The input's sample rate and channel count are validated against the
// configuration, the audio is resampled when a resampler was created at
// initialization, and the result is fed to a packetizer (created on first
// use with the codec's frame_size). Every complete packet the packetizer
// then holds is encoded with EncodeOnePacket().
//
// Must be called on mTaskQueue.
//
// Returns the concatenation of the encoded data of all packets encoded by
// this call (possibly empty), or:
//  - NS_ERROR_DOM_ENCODING_NOT_SUPPORTED_ERR when the input's rate or channel
//    count differs from what was configured,
//  - NS_ERROR_DOM_MEDIA_FATAL_ERR when the resampler reports a failure,
//  - the error returned by EncodeOnePacket() for the first packet that fails.
Result<MediaDataEncoder::EncodedData, nsresult> FFmpegAudioEncoder<
    LIBAV_VER>::EncodeInputWithModernAPIs(RefPtr<const MediaData> aSample) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  MOZ_ASSERT(mCodecContext);
  MOZ_ASSERT(aSample);

  RefPtr<const AudioData> sample(aSample->As<AudioData>());

  FFMPEG_LOG("Encoding %" PRIu32 " frames of audio at pts: %s",
             sample->Frames(), sample->mTime.ToString().get());

  // Without a resampler the input must already be at the encoder's rate; with
  // one it must be at the rate the resampler was created for.
  if ((!mResampler && sample->mRate != mConfig.mSampleRate) ||
      (mResampler &&
       sample->mRate != AssertedCast<uint32_t>(mInputSampleRate)) ||
      sample->mChannels != mConfig.mNumberOfChannels) {
    FFMPEG_LOG(
        "Channel count or sample-rate at the input of the encoder different "
        "from what has been configured initially, erroring out");
    return Result<MediaDataEncoder::EncodedData, nsresult>(
        NS_ERROR_DOM_ENCODING_NOT_SUPPORTED_ERR);
  }

  // ffmpeg expects exactly sized input audio packets most of the time.
  // Packetization is performed if needed, and audio packets of the correct size
  // are fed to ffmpeg, with timestamps extrapolated from the timestamp found on
  // the input MediaData.

  if (!mPacketizer) {
    media::TimeUnit basePts = media::TimeUnit::Zero(mConfig.mSampleRate);
    basePts += sample->mTime;
    mPacketizer.emplace(mCodecContext->frame_size, sample->mChannels,
                        basePts.ToTicksAtRate(mConfig.mSampleRate),
                        mConfig.mSampleRate);
  }

  if (!mFirstPacketPts.IsValid()) {
    mFirstPacketPts = sample->mTime;
  }

  Span<float> audio = sample->Data();

  if (mResampler) {
    // Ensure that all input frames are consumed each time by oversizing the
    // output buffer.
    int bufferLengthGuess = std::ceil(2. * static_cast<float>(audio.size()) *
                                      mConfig.mSampleRate / mInputSampleRate);
    mTempBuffer.SetLength(bufferLengthGuess);
    uint32_t inputFrames = audio.size() / mConfig.mNumberOfChannels;
    uint32_t inputFramesProcessed = inputFrames;
    uint32_t outputFrames = bufferLengthGuess / mConfig.mNumberOfChannels;
    const int resampleStatus = speex_resampler_process_interleaved_float(
        mResampler.get(), audio.data(), &inputFramesProcessed,
        mTempBuffer.Elements(), &outputFrames);
    // Check the status in every build type rather than only asserting on it,
    // so a failing resampler is reported instead of having its output encoded.
    if (resampleStatus != RESAMPLER_ERR_SUCCESS) {
      FFMPEG_LOG("Error resampling audio in FFmpegAudioEncoder: %d",
                 resampleStatus);
      return Err(NS_ERROR_DOM_MEDIA_FATAL_ERR);
    }
    MOZ_ASSERT(inputFrames == inputFramesProcessed,
               "increate the buffer to consume all input each time");
    audio = Span<float>(mTempBuffer.Elements(),
                        outputFrames * mConfig.mNumberOfChannels);
  }

  EncodedData output;

  // The packetizer is fed before mTempBuffer is resized below: `audio` may
  // point into mTempBuffer when resampling took place.
  mPacketizer->Input(audio.data(), audio.Length() / mConfig.mNumberOfChannels);

  // Dequeue and encode each packet
  while (mPacketizer->PacketsAvailable()) {
    mTempBuffer.SetLength(mCodecContext->frame_size *
                          mConfig.mNumberOfChannels);
    const media::TimeUnit pts = mPacketizer->Output(mTempBuffer.Elements());
    auto packet = Span(mTempBuffer.Elements(), mTempBuffer.Length());
    FFMPEG_LOG("Encoding %" PRIu32 " frames, pts: %s",
               mPacketizer->PacketSize(), pts.ToString().get());
    auto encodeResult = EncodeOnePacket(packet, pts);
    if (!encodeResult.isOk()) {
      return encodeResult;
    }
    output.AppendElements(encodeResult.unwrap());
  }
  return Result<MediaDataEncoder::EncodedData, nsresult>(std::move(output));
}
|
|
|
|
// Drains the encoder.
//
// If the packetizer still holds frames (fewer than a full packet's worth is
// possible), they are pulled out as one final packet and encoded first; the
// underlying FFmpegDataEncoder is then drained. When there is no packetizer
// or it is empty, the base-class drain is returned directly.
//
// Returns the encoded data of the final packet followed by the data produced
// by the base-class drain, or the first error encountered.
Result<MediaDataEncoder::EncodedData, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::DrainWithModernAPIs() {
  // If there's no packetizer, or it's empty, we can proceed immediately.
  if (!mPacketizer || mPacketizer->FramesAvailable() == 0) {
    return FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs();
  }

  EncodedData output;

  // Pull whatever is left in the packetizer out as one last packet. The
  // buffer is sized for a full packet; `written` receives the number of
  // frames actually stored in it.
  mTempBuffer.SetLength(mCodecContext->frame_size *
                        mPacketizer->ChannelCount());
  uint32_t written = 0;
  media::TimeUnit pts = mPacketizer->Drain(mTempBuffer.Elements(), written);
  auto audio =
      Span(mTempBuffer.Elements(), written * mPacketizer->ChannelCount());

  auto encodeResult = EncodeOnePacket(audio, pts);
  if (!encodeResult.isOk()) {
    return encodeResult;
  }
  output.AppendElements(encodeResult.unwrap());

  // Now, drain the encoder
  auto drainResult = FFmpegDataEncoder<LIBAV_VER>::DrainWithModernAPIs();
  if (!drainResult.isOk()) {
    return drainResult;
  }
  output.AppendElements(drainResult.unwrap());

  return Result<MediaDataEncoder::EncodedData, nsresult>(std::move(output));
}
|
|
#endif // if LIBAVCODEC_VERSION_MAJOR >= 58
|
|
|
|
// Converts an encoded AVPacket into a MediaRawData.
//
// Packets smaller than mDtxThreshold bytes are dropped and nullptr is
// returned. Otherwise the common conversion is performed and the result gets
// its time, timecode and duration set from the packet pts, the codec frame
// size and the encoder sample rate. Packets whose time precedes the first
// input's pts have their original window recorded and their time moved to
// that first pts. The first delivered packet additionally carries the
// extradata (when available) and a copy of the encoder configuration.
//
// Must be called on mTaskQueue.
RefPtr<MediaRawData> FFmpegAudioEncoder<LIBAV_VER>::ToMediaRawData(
    AVPacket* aPacket) {
  MOZ_ASSERT(mTaskQueue->IsOnCurrentThread());
  MOZ_ASSERT(aPacket);

  if (aPacket->size < mDtxThreshold) {
    FFMPEG_LOG(
        "DTX enabled and packet is %d bytes (threshold %d), not returning.",
        aPacket->size, mDtxThreshold);
    return nullptr;
  }

  RefPtr<MediaRawData> data = ToMediaRawDataCommon(aPacket);

  // Both the time and the timecode start out as the packet's pts, expressed
  // at the encoder sample rate.
  const media::TimeUnit packetTime(aPacket->pts, mConfig.mSampleRate);
  data->mTime = packetTime;
  data->mTimecode = packetTime;
  data->mDuration =
      media::TimeUnit(mCodecContext->frame_size, mConfig.mSampleRate);

  // Handle encoder delay
  // Tracked in https://github.com/w3c/webcodecs/issues/626 because not quite
  // specced yet.
  const bool precedesFirstInput = mFirstPacketPts > data->mTime;
  if (precedesFirstInput) {
    // Remember the window the encoder reported before moving the time.
    data->mOriginalPresentationWindow =
        Some(media::TimeInterval{data->mTime, data->GetEndTime()});
    // Duration is likely to be adjusted when the above spec issue is fixed.
    // For now, leave it as-is
    // data->mDuration -= (mFirstPacketPts - data->mTime);
    // if (data->mDuration.IsNegative()) {
    //   data->mDuration = media::TimeUnit::Zero();
    // }
    data->mTime = mFirstPacketPts;
  }

  const bool isFirstDelivered = (mPacketsDelivered++ == 0);
  if (isFirstDelivered) {
    // Attach extradata, and the config (including any channel / samplerate
    // modification to fit the encoder requirements), if needed.
    auto extraDataResult = GetExtraData(aPacket);
    if (extraDataResult.isOk()) {
      data->mExtraData = extraDataResult.unwrap();
    }
    data->mConfig = MakeUnique<EncoderConfig>(mConfig);
  }

  if (!data->mExtraData) {
    FFMPEG_LOG("FFmpegAudioEncoder out: [%s,%s] (%zu bytes)",
               data->mTime.ToString().get(), data->mDuration.ToString().get(),
               data->Size());
  } else {
    FFMPEG_LOG(
        "FFmpegAudioEncoder out: [%s,%s] (%zu bytes, extradata %zu bytes)",
        data->mTime.ToString().get(), data->mDuration.ToString().get(),
        data->Size(), data->mExtraData->Length());
  }

  return data;
}
|
|
|
|
// Returns a copy of the codec context's extradata.
//
// The packet argument is unused: the extradata is read from mCodecContext.
//
// Returns a newly created MediaByteBuffer holding extradata_size bytes copied
// from the context, or NS_ERROR_NOT_AVAILABLE when the context has no
// extradata (null pointer or non-positive size).
Result<already_AddRefed<MediaByteBuffer>, nsresult>
FFmpegAudioEncoder<LIBAV_VER>::GetExtraData(AVPacket* /* aPacket */) {
  // Reject a missing buffer as well as a zero or negative size, so neither
  // the allocation nor the copy below can be handed invalid input.
  if (!mCodecContext->extradata || mCodecContext->extradata_size <= 0) {
    return Err(NS_ERROR_NOT_AVAILABLE);
  }
  // Create extra data -- they are on the context.
  auto extraData = MakeRefPtr<MediaByteBuffer>();
  // Check the buffer before it is first dereferenced.
  MOZ_ASSERT(extraData);
  extraData->SetLength(mCodecContext->extradata_size);
  PodCopy(extraData->Elements(), mCodecContext->extradata,
          mCodecContext->extradata_size);
  return extraData.forget();
}
|
|
|
|
} // namespace mozilla
|