#include "mixer_speaker.h" #ifdef USE_ESP32 #include "esphome/core/hal.h" #include "esphome/core/helpers.h" #include "esphome/core/log.h" #include #include namespace esphome { namespace mixer_speaker { static const UBaseType_t MIXER_TASK_PRIORITY = 10; static const uint32_t TRANSFER_BUFFER_DURATION_MS = 50; static const uint32_t TASK_DELAY_MS = 25; static const size_t TASK_STACK_SIZE = 4096; static const int16_t MAX_AUDIO_SAMPLE_VALUE = INT16_MAX; static const int16_t MIN_AUDIO_SAMPLE_VALUE = INT16_MIN; static const char *const TAG = "speaker_mixer"; // Gives the Q15 fixed point scaling factor to reduce by 0 dB, 1dB, ..., 50 dB // dB to PCM scaling factor formula: floating_point_scale_factor = 2^(-db/6.014) // float to Q15 fixed point formula: q15_scale_factor = floating_point_scale_factor * 2^(15) static const std::vector DECIBEL_REDUCTION_TABLE = { 32767, 29201, 26022, 23189, 20665, 18415, 16410, 14624, 13032, 11613, 10349, 9222, 8218, 7324, 6527, 5816, 5183, 4619, 4116, 3668, 3269, 2913, 2596, 2313, 2061, 1837, 1637, 1459, 1300, 1158, 1032, 920, 820, 731, 651, 580, 517, 461, 411, 366, 326, 291, 259, 231, 206, 183, 163, 146, 130, 116, 103}; enum MixerEventGroupBits : uint32_t { COMMAND_STOP = (1 << 0), // stops the mixer task STATE_STARTING = (1 << 10), STATE_RUNNING = (1 << 11), STATE_STOPPING = (1 << 12), STATE_STOPPED = (1 << 13), ERR_ESP_NO_MEM = (1 << 19), ALL_BITS = 0x00FFFFFF, // All valid FreeRTOS event group bits }; void SourceSpeaker::dump_config() { ESP_LOGCONFIG(TAG, "Mixer Source Speaker\n" " Buffer Duration: %" PRIu32 " ms", this->buffer_duration_ms_); if (this->timeout_ms_.has_value()) { ESP_LOGCONFIG(TAG, " Timeout: %" PRIu32 " ms", this->timeout_ms_.value()); } else { ESP_LOGCONFIG(TAG, " Timeout: never"); } } void SourceSpeaker::setup() { this->parent_->get_output_speaker()->add_audio_output_callback([this](uint32_t new_frames, int64_t write_timestamp) { // The SourceSpeaker may not have included any audio in the mixed output, so verify there were pending frames uint32_t speakers_playback_frames = std::min(new_frames, this->pending_playback_frames_); this->pending_playback_frames_ -= speakers_playback_frames; if (speakers_playback_frames > 0) { this->audio_output_callback_(speakers_playback_frames, write_timestamp); } }); } void SourceSpeaker::loop() { switch (this->state_) { case speaker::STATE_STARTING: { esp_err_t err = this->start_(); if (err == ESP_OK) { this->state_ = speaker::STATE_RUNNING; this->stop_gracefully_ = false; this->last_seen_data_ms_ = millis(); this->status_clear_error(); } else { switch (err) { case ESP_ERR_NO_MEM: this->status_set_error("Failed to start mixer: not enough memory"); break; case ESP_ERR_NOT_SUPPORTED: this->status_set_error("Failed to start mixer: unsupported bits per sample"); break; case ESP_ERR_INVALID_ARG: this->status_set_error("Failed to start mixer: audio stream isn't compatible with the other audio stream."); break; case ESP_ERR_INVALID_STATE: this->status_set_error("Failed to start mixer: mixer task failed to start"); break; default: this->status_set_error("Failed to start mixer"); break; } this->state_ = speaker::STATE_STOPPING; } break; } case speaker::STATE_RUNNING: if (!this->transfer_buffer_->has_buffered_data()) { if ((this->timeout_ms_.has_value() && ((millis() - this->last_seen_data_ms_) > this->timeout_ms_.value())) || this->stop_gracefully_) { this->state_ = speaker::STATE_STOPPING; } } break; case speaker::STATE_STOPPING: this->stop_(); this->stop_gracefully_ = false; this->state_ = speaker::STATE_STOPPED; break; case speaker::STATE_STOPPED: break; } } size_t SourceSpeaker::play(const uint8_t *data, size_t length, TickType_t ticks_to_wait) { if (this->is_stopped()) { this->start(); } size_t bytes_written = 0; if (this->ring_buffer_.use_count() == 1) { std::shared_ptr temp_ring_buffer = this->ring_buffer_.lock(); bytes_written = temp_ring_buffer->write_without_replacement(data, length, ticks_to_wait); if (bytes_written > 0) { this->last_seen_data_ms_ = millis(); } } return bytes_written; } void SourceSpeaker::start() { this->state_ = speaker::STATE_STARTING; } esp_err_t SourceSpeaker::start_() { const size_t ring_buffer_size = this->audio_stream_info_.ms_to_bytes(this->buffer_duration_ms_); if (this->transfer_buffer_.use_count() == 0) { this->transfer_buffer_ = audio::AudioSourceTransferBuffer::create(this->audio_stream_info_.ms_to_bytes(TRANSFER_BUFFER_DURATION_MS)); if (this->transfer_buffer_ == nullptr) { return ESP_ERR_NO_MEM; } std::shared_ptr temp_ring_buffer; if (!this->ring_buffer_.use_count()) { temp_ring_buffer = RingBuffer::create(ring_buffer_size); this->ring_buffer_ = temp_ring_buffer; } if (!this->ring_buffer_.use_count()) { return ESP_ERR_NO_MEM; } else { this->transfer_buffer_->set_source(temp_ring_buffer); } } this->pending_playback_frames_ = 0; // reset return this->parent_->start(this->audio_stream_info_); } void SourceSpeaker::stop() { if (this->state_ != speaker::STATE_STOPPED) { this->state_ = speaker::STATE_STOPPING; } } void SourceSpeaker::stop_() { this->transfer_buffer_.reset(); // deallocates the transfer buffer } void SourceSpeaker::finish() { this->stop_gracefully_ = true; } bool SourceSpeaker::has_buffered_data() const { return ((this->transfer_buffer_.use_count() > 0) && this->transfer_buffer_->has_buffered_data()); } void SourceSpeaker::set_mute_state(bool mute_state) { this->mute_state_ = mute_state; this->parent_->get_output_speaker()->set_mute_state(mute_state); } bool SourceSpeaker::get_mute_state() { return this->parent_->get_output_speaker()->get_mute_state(); } void SourceSpeaker::set_volume(float volume) { this->volume_ = volume; this->parent_->get_output_speaker()->set_volume(volume); } float SourceSpeaker::get_volume() { return this->parent_->get_output_speaker()->get_volume(); } size_t SourceSpeaker::process_data_from_source(TickType_t ticks_to_wait) { if (!this->transfer_buffer_.use_count()) { return 0; } // Store current offset, as these samples are already ducked const size_t current_length = this->transfer_buffer_->available(); size_t bytes_read = this->transfer_buffer_->transfer_data_from_source(ticks_to_wait); uint32_t samples_to_duck = this->audio_stream_info_.bytes_to_samples(bytes_read); if (samples_to_duck > 0) { int16_t *current_buffer = reinterpret_cast(this->transfer_buffer_->get_buffer_start() + current_length); duck_samples(current_buffer, samples_to_duck, &this->current_ducking_db_reduction_, &this->ducking_transition_samples_remaining_, this->samples_per_ducking_step_, this->db_change_per_ducking_step_); } return bytes_read; } void SourceSpeaker::apply_ducking(uint8_t decibel_reduction, uint32_t duration) { if (this->target_ducking_db_reduction_ != decibel_reduction) { this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_; this->target_ducking_db_reduction_ = decibel_reduction; uint8_t total_ducking_steps = 0; if (this->target_ducking_db_reduction_ > this->current_ducking_db_reduction_) { // The dB reduction level is increasing (which results in quieter audio) total_ducking_steps = this->target_ducking_db_reduction_ - this->current_ducking_db_reduction_ - 1; this->db_change_per_ducking_step_ = 1; } else { // The dB reduction level is decreasing (which results in louder audio) total_ducking_steps = this->current_ducking_db_reduction_ - this->target_ducking_db_reduction_ - 1; this->db_change_per_ducking_step_ = -1; } if ((duration > 0) && (total_ducking_steps > 0)) { this->ducking_transition_samples_remaining_ = this->audio_stream_info_.ms_to_samples(duration); this->samples_per_ducking_step_ = this->ducking_transition_samples_remaining_ / total_ducking_steps; this->ducking_transition_samples_remaining_ = this->samples_per_ducking_step_ * total_ducking_steps; // Adjust for integer division rounding this->current_ducking_db_reduction_ += this->db_change_per_ducking_step_; } else { this->ducking_transition_samples_remaining_ = 0; this->current_ducking_db_reduction_ = this->target_ducking_db_reduction_; } } } void SourceSpeaker::duck_samples(int16_t *input_buffer, uint32_t input_samples_to_duck, int8_t *current_ducking_db_reduction, uint32_t *ducking_transition_samples_remaining, uint32_t samples_per_ducking_step, int8_t db_change_per_ducking_step) { if (*ducking_transition_samples_remaining > 0) { // Ducking level is still transitioning // Takes the ceiling of input_samples_to_duck/samples_per_ducking_step uint32_t ducking_steps_in_batch = input_samples_to_duck / samples_per_ducking_step + (input_samples_to_duck % samples_per_ducking_step != 0); for (uint32_t i = 0; i < ducking_steps_in_batch; ++i) { uint32_t samples_left_in_step = *ducking_transition_samples_remaining % samples_per_ducking_step; if (samples_left_in_step == 0) { samples_left_in_step = samples_per_ducking_step; } uint32_t samples_to_duck = std::min(input_samples_to_duck, samples_left_in_step); samples_to_duck = std::min(samples_to_duck, *ducking_transition_samples_remaining); // Ensure we only point to valid index in the Q15 scaling factor table uint8_t safe_db_reduction_index = clamp(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1); int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index]; audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, samples_to_duck); if (samples_left_in_step - samples_to_duck == 0) { // After scaling the current samples, we are ready to transition to the next step *current_ducking_db_reduction += db_change_per_ducking_step; } input_buffer += samples_to_duck; *ducking_transition_samples_remaining -= samples_to_duck; input_samples_to_duck -= samples_to_duck; } } if ((*current_ducking_db_reduction > 0) && (input_samples_to_duck > 0)) { // Audio is ducked, but its not in the middle of a transition step uint8_t safe_db_reduction_index = clamp(*current_ducking_db_reduction, 0, DECIBEL_REDUCTION_TABLE.size() - 1); int16_t q15_scale_factor = DECIBEL_REDUCTION_TABLE[safe_db_reduction_index]; audio::scale_audio_samples(input_buffer, input_buffer, q15_scale_factor, input_samples_to_duck); } } void MixerSpeaker::dump_config() { ESP_LOGCONFIG(TAG, "Speaker Mixer:\n" " Number of output channels: %u", this->output_channels_); } void MixerSpeaker::setup() { this->event_group_ = xEventGroupCreate(); if (this->event_group_ == nullptr) { ESP_LOGE(TAG, "Failed to create event group"); this->mark_failed(); return; } } void MixerSpeaker::loop() { uint32_t event_group_bits = xEventGroupGetBits(this->event_group_); if (event_group_bits & MixerEventGroupBits::STATE_STARTING) { ESP_LOGD(TAG, "Starting speaker mixer"); xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STARTING); } if (event_group_bits & MixerEventGroupBits::ERR_ESP_NO_MEM) { this->status_set_error("Failed to allocate the mixer's internal buffer"); xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ERR_ESP_NO_MEM); } if (event_group_bits & MixerEventGroupBits::STATE_RUNNING) { ESP_LOGD(TAG, "Started speaker mixer"); this->status_clear_error(); xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_RUNNING); } if (event_group_bits & MixerEventGroupBits::STATE_STOPPING) { ESP_LOGD(TAG, "Stopping speaker mixer"); xEventGroupClearBits(this->event_group_, MixerEventGroupBits::STATE_STOPPING); } if (event_group_bits & MixerEventGroupBits::STATE_STOPPED) { if (this->delete_task_() == ESP_OK) { xEventGroupClearBits(this->event_group_, MixerEventGroupBits::ALL_BITS); } } if (this->task_handle_ != nullptr) { bool all_stopped = true; for (auto &speaker : this->source_speakers_) { all_stopped &= speaker->is_stopped(); } if (all_stopped) { this->stop(); } } } esp_err_t MixerSpeaker::start(audio::AudioStreamInfo &stream_info) { if (!this->audio_stream_info_.has_value()) { if (stream_info.get_bits_per_sample() != 16) { // Audio streams that don't have 16 bits per sample are not supported return ESP_ERR_NOT_SUPPORTED; } this->audio_stream_info_ = audio::AudioStreamInfo(stream_info.get_bits_per_sample(), this->output_channels_, stream_info.get_sample_rate()); this->output_speaker_->set_audio_stream_info(this->audio_stream_info_.value()); } else { if (!this->queue_mode_ && (stream_info.get_sample_rate() != this->audio_stream_info_.value().get_sample_rate())) { // The two audio streams must have the same sample rate to mix properly if not in queue mode return ESP_ERR_INVALID_ARG; } } return this->start_task_(); } esp_err_t MixerSpeaker::start_task_() { if (this->task_stack_buffer_ == nullptr) { if (this->task_stack_in_psram_) { RAMAllocator stack_allocator(RAMAllocator::ALLOC_EXTERNAL); this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE); } else { RAMAllocator stack_allocator(RAMAllocator::ALLOC_INTERNAL); this->task_stack_buffer_ = stack_allocator.allocate(TASK_STACK_SIZE); } } if (this->task_stack_buffer_ == nullptr) { return ESP_ERR_NO_MEM; } if (this->task_handle_ == nullptr) { this->task_handle_ = xTaskCreateStatic(audio_mixer_task, "mixer", TASK_STACK_SIZE, (void *) this, MIXER_TASK_PRIORITY, this->task_stack_buffer_, &this->task_stack_); } if (this->task_handle_ == nullptr) { return ESP_ERR_INVALID_STATE; } return ESP_OK; } esp_err_t MixerSpeaker::delete_task_() { if (!this->task_created_) { this->task_handle_ = nullptr; if (this->task_stack_buffer_ != nullptr) { if (this->task_stack_in_psram_) { RAMAllocator stack_allocator(RAMAllocator::ALLOC_EXTERNAL); stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE); } else { RAMAllocator stack_allocator(RAMAllocator::ALLOC_INTERNAL); stack_allocator.deallocate(this->task_stack_buffer_, TASK_STACK_SIZE); } this->task_stack_buffer_ = nullptr; } return ESP_OK; } return ESP_ERR_INVALID_STATE; } void MixerSpeaker::stop() { xEventGroupSetBits(this->event_group_, MixerEventGroupBits::COMMAND_STOP); } void MixerSpeaker::copy_frames(const int16_t *input_buffer, audio::AudioStreamInfo input_stream_info, int16_t *output_buffer, audio::AudioStreamInfo output_stream_info, uint32_t frames_to_transfer) { uint8_t input_channels = input_stream_info.get_channels(); uint8_t output_channels = output_stream_info.get_channels(); const uint8_t max_input_channel_index = input_channels - 1; if (input_channels == output_channels) { size_t bytes_to_copy = input_stream_info.frames_to_bytes(frames_to_transfer); memcpy(output_buffer, input_buffer, bytes_to_copy); return; } for (uint32_t frame_index = 0; frame_index < frames_to_transfer; ++frame_index) { for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) { uint8_t input_channel_index = std::min(output_channel_index, max_input_channel_index); output_buffer[output_channels * frame_index + output_channel_index] = input_buffer[input_channels * frame_index + input_channel_index]; } } } void MixerSpeaker::mix_audio_samples(const int16_t *primary_buffer, audio::AudioStreamInfo primary_stream_info, const int16_t *secondary_buffer, audio::AudioStreamInfo secondary_stream_info, int16_t *output_buffer, audio::AudioStreamInfo output_stream_info, uint32_t frames_to_mix) { const uint8_t primary_channels = primary_stream_info.get_channels(); const uint8_t secondary_channels = secondary_stream_info.get_channels(); const uint8_t output_channels = output_stream_info.get_channels(); const uint8_t max_primary_channel_index = primary_channels - 1; const uint8_t max_secondary_channel_index = secondary_channels - 1; for (uint32_t frames_index = 0; frames_index < frames_to_mix; ++frames_index) { for (uint8_t output_channel_index = 0; output_channel_index < output_channels; ++output_channel_index) { const uint32_t secondary_channel_index = std::min(output_channel_index, max_secondary_channel_index); const int32_t secondary_sample = secondary_buffer[frames_index * secondary_channels + secondary_channel_index]; const uint32_t primary_channel_index = std::min(output_channel_index, max_primary_channel_index); const int32_t primary_sample = static_cast(primary_buffer[frames_index * primary_channels + primary_channel_index]); const int32_t added_sample = secondary_sample + primary_sample; output_buffer[frames_index * output_channels + output_channel_index] = static_cast(clamp(added_sample, MIN_AUDIO_SAMPLE_VALUE, MAX_AUDIO_SAMPLE_VALUE)); } } } void MixerSpeaker::audio_mixer_task(void *params) { MixerSpeaker *this_mixer = (MixerSpeaker *) params; xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STARTING); this_mixer->task_created_ = true; std::unique_ptr output_transfer_buffer = audio::AudioSinkTransferBuffer::create( this_mixer->audio_stream_info_.value().ms_to_bytes(TRANSFER_BUFFER_DURATION_MS)); if (output_transfer_buffer == nullptr) { xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPED | MixerEventGroupBits::ERR_ESP_NO_MEM); this_mixer->task_created_ = false; vTaskDelete(nullptr); } output_transfer_buffer->set_sink(this_mixer->output_speaker_); xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_RUNNING); bool sent_finished = false; while (true) { uint32_t event_group_bits = xEventGroupGetBits(this_mixer->event_group_); if (event_group_bits & MixerEventGroupBits::COMMAND_STOP) { break; } // Never shift the data in the output transfer buffer to avoid unnecessary, slow data moves output_transfer_buffer->transfer_data_to_sink(pdMS_TO_TICKS(TASK_DELAY_MS), false); const uint32_t output_frames_free = this_mixer->audio_stream_info_.value().bytes_to_frames(output_transfer_buffer->free()); std::vector speakers_with_data; std::vector> transfer_buffers_with_data; for (auto &speaker : this_mixer->source_speakers_) { if (speaker->get_transfer_buffer().use_count() > 0) { std::shared_ptr transfer_buffer = speaker->get_transfer_buffer().lock(); speaker->process_data_from_source(0); // Transfers and ducks audio from source ring buffers if ((transfer_buffer->available() > 0) && !speaker->get_pause_state()) { // Store the locked transfer buffers in their own vector to avoid releasing ownership until after the loop transfer_buffers_with_data.push_back(transfer_buffer); speakers_with_data.push_back(speaker); } } } if (transfer_buffers_with_data.empty()) { // No audio available for transferring, block task temporarily delay(TASK_DELAY_MS); continue; } uint32_t frames_to_mix = output_frames_free; if ((transfer_buffers_with_data.size() == 1) || this_mixer->queue_mode_) { // Only one speaker has audio data, just copy samples over audio::AudioStreamInfo active_stream_info = speakers_with_data[0]->get_audio_stream_info(); if (active_stream_info.get_sample_rate() == this_mixer->output_speaker_->get_audio_stream_info().get_sample_rate()) { // Speaker's sample rate matches the output speaker's, copy directly const uint32_t frames_available_in_buffer = active_stream_info.bytes_to_frames(transfer_buffers_with_data[0]->available()); frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer); copy_frames(reinterpret_cast(transfer_buffers_with_data[0]->get_buffer_start()), active_stream_info, reinterpret_cast(output_transfer_buffer->get_buffer_end()), this_mixer->audio_stream_info_.value(), frames_to_mix); // Update source speaker buffer length transfer_buffers_with_data[0]->decrease_buffer_length(active_stream_info.frames_to_bytes(frames_to_mix)); speakers_with_data[0]->pending_playback_frames_ += frames_to_mix; // Update output transfer buffer length output_transfer_buffer->increase_buffer_length( this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix)); } else { // Speaker's stream info doesn't match the output speaker's, so it's a new source speaker if (!this_mixer->output_speaker_->is_stopped()) { if (!sent_finished) { this_mixer->output_speaker_->finish(); sent_finished = true; // Avoid repeatedly sending the finish command } } else { // Speaker has finished writing the current audio, update the stream information and restart the speaker this_mixer->audio_stream_info_ = audio::AudioStreamInfo(active_stream_info.get_bits_per_sample(), this_mixer->output_channels_, active_stream_info.get_sample_rate()); this_mixer->output_speaker_->set_audio_stream_info(this_mixer->audio_stream_info_.value()); this_mixer->output_speaker_->start(); sent_finished = false; } } } else { // Determine how many frames to mix for (int i = 0; i < transfer_buffers_with_data.size(); ++i) { const uint32_t frames_available_in_buffer = speakers_with_data[i]->get_audio_stream_info().bytes_to_frames(transfer_buffers_with_data[i]->available()); frames_to_mix = std::min(frames_to_mix, frames_available_in_buffer); } int16_t *primary_buffer = reinterpret_cast(transfer_buffers_with_data[0]->get_buffer_start()); audio::AudioStreamInfo primary_stream_info = speakers_with_data[0]->get_audio_stream_info(); // Mix two streams together for (int i = 1; i < transfer_buffers_with_data.size(); ++i) { mix_audio_samples(primary_buffer, primary_stream_info, reinterpret_cast(transfer_buffers_with_data[i]->get_buffer_start()), speakers_with_data[i]->get_audio_stream_info(), reinterpret_cast(output_transfer_buffer->get_buffer_end()), this_mixer->audio_stream_info_.value(), frames_to_mix); if (i != transfer_buffers_with_data.size() - 1) { // Need to mix more streams together, point primary buffer and stream info to the already mixed output primary_buffer = reinterpret_cast(output_transfer_buffer->get_buffer_end()); primary_stream_info = this_mixer->audio_stream_info_.value(); } } // Update source transfer buffer lengths and add new audio durations to the source speaker pending playbacks for (int i = 0; i < transfer_buffers_with_data.size(); ++i) { transfer_buffers_with_data[i]->decrease_buffer_length( speakers_with_data[i]->get_audio_stream_info().frames_to_bytes(frames_to_mix)); speakers_with_data[i]->pending_playback_frames_ += frames_to_mix; } // Update output transfer buffer length output_transfer_buffer->increase_buffer_length( this_mixer->audio_stream_info_.value().frames_to_bytes(frames_to_mix)); } } xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPING); output_transfer_buffer.reset(); xEventGroupSetBits(this_mixer->event_group_, MixerEventGroupBits::STATE_STOPPED); this_mixer->task_created_ = false; vTaskDelete(nullptr); } } // namespace mixer_speaker } // namespace esphome #endif