#pragma once #include "esphome/core/defines.h" #include #include namespace esphome { namespace audio { class AudioStreamInfo { /* Class to respresent important parameters of the audio stream that also provides helper function to convert between * various audio related units. * * - An audio sample represents a unit of audio for one channel. * - A frame represents a unit of audio with a sample for every channel. * * In gneneral, converting between bytes, samples, and frames shouldn't result in rounding errors so long as frames * are used as the main unit when transferring audio data. Durations may result in rounding for certain sample rates; * e.g., 44.1 KHz. The ``frames_to_milliseconds_with_remainder`` function should be used for accuracy, as it takes * into account the remainder rather than just ignoring any rounding. */ public: AudioStreamInfo() : AudioStreamInfo(16, 1, 16000){}; // Default values represent ESPHome's audio components historical values AudioStreamInfo(uint8_t bits_per_sample, uint8_t channels, uint32_t sample_rate); uint8_t get_bits_per_sample() const { return this->bits_per_sample_; } uint8_t get_channels() const { return this->channels_; } uint32_t get_sample_rate() const { return this->sample_rate_; } /// @brief Convert bytes to duration in milliseconds. /// @param bytes Number of bytes to convert /// @return Duration in milliseconds that will store `bytes` bytes of audio. May round down for certain sample rates /// or values of `bytes`. uint32_t bytes_to_ms(size_t bytes) const { return bytes * 1000 / (this->sample_rate_ * this->bytes_per_sample_ * this->channels_); } /// @brief Convert bytes to frames. /// @param bytes Number of bytes to convert /// @return Audio frames that will store `bytes` bytes. uint32_t bytes_to_frames(size_t bytes) const { return (bytes / (this->bytes_per_sample_ * this->channels_)); } /// @brief Convert bytes to samples. 
/// @param bytes Number of bytes to convert /// @return Audio samples that will store `bytes` bytes. uint32_t bytes_to_samples(size_t bytes) const { return (bytes / this->bytes_per_sample_); } /// @brief Converts frames to bytes. /// @param frames Number of frames to convert. /// @return Number of bytes that will store `frames` frames of audio. size_t frames_to_bytes(uint32_t frames) const { return frames * this->bytes_per_sample_ * this->channels_; } /// @brief Converts samples to bytes. /// @param samples Number of samples to convert. /// @return Number of bytes that will store `samples` samples of audio. size_t samples_to_bytes(uint32_t samples) const { return samples * this->bytes_per_sample_; } /// @brief Converts duration to frames. /// @param ms Duration in milliseconds /// @return Audio frames that will store `ms` milliseconds of audio. May round down for certain sample rates. uint32_t ms_to_frames(uint32_t ms) const { return (ms * this->sample_rate_) / 1000; } /// @brief Converts duration to samples. /// @param ms Duration in milliseconds /// @return Audio samples that will store `ms` milliseconds of audio. May round down for certain sample rates. uint32_t ms_to_samples(uint32_t ms) const { return (ms * this->channels_ * this->sample_rate_) / 1000; } /// @brief Converts duration to bytes. May round down for certain sample rates. /// @param ms Duration in milliseconds /// @return Bytes that will store `ms` milliseconds of audio. May round down for certain sample rates. size_t ms_to_bytes(uint32_t ms) const { return (ms * this->bytes_per_sample_ * this->channels_ * this->sample_rate_) / 1000; } /// @brief Computes the duration, in microseconds, the given amount of frames represents. /// @param frames Number of audio frames /// @return Duration in microseconds `frames` respresents. May be slightly inaccurate due to integer divison rounding /// for certain sample rates. 
uint32_t frames_to_microseconds(uint32_t frames) const; /// @brief Computes the duration, in milliseconds, the given amount of frames represents. Avoids /// accumulating rounding errors by updating `frames` with the remainder after converting. /// @param frames Pointer to uint32_t with the number of audio frames. Replaced with the remainder. /// @return Duration in milliseconds `frames` represents. Always less than or equal to the actual value due to /// rounding. uint32_t frames_to_milliseconds_with_remainder(uint32_t *frames) const; // Class comparison operators bool operator==(const AudioStreamInfo &rhs) const; bool operator!=(const AudioStreamInfo &rhs) const { return !operator==(rhs); } protected: uint8_t bits_per_sample_; uint8_t channels_; uint32_t sample_rate_; // The greatest common divisor between 1000 ms = 1 second and the sample rate. Used to avoid accumulating error when // converting from frames to duration. Computed at construction. uint32_t ms_sample_rate_gcd_; // Conversion factor derived from the number of bits per sample. Assumes audio data is aligned to the byte. Computed // at construction. size_t bytes_per_sample_; }; enum class AudioFileType : uint8_t { NONE = 0, #ifdef USE_AUDIO_FLAC_SUPPORT FLAC, #endif #ifdef USE_AUDIO_MP3_SUPPORT MP3, #endif WAV, }; struct AudioFile { const uint8_t *data; size_t length; AudioFileType file_type; }; /// @brief Helper function to convert file type to a const char string /// @param file_type /// @return const char pointer to the readable file type const char *audio_file_type_to_string(AudioFileType file_type); /// @brief Scales Q15 fixed point audio samples. Scales in place if audio_samples == output_buffer. 
/// @param audio_samples PCM int16 audio samples
/// @param output_buffer Buffer to store the scaled samples
/// @param scale_factor Q15 fixed point scaling factor
/// @param samples_to_scale Number of samples to scale
void scale_audio_samples(const int16_t *audio_samples, int16_t *output_buffer, int16_t scale_factor,
                         size_t samples_to_scale);

/// @brief Unpacks a quantized audio sample into a Q31 fixed-point number.
///
/// The sample is read least significant byte first and left-justified: its last byte lands in bits 31..24 of the
/// result, and any unused low-order bits are zero.
/// @param data Pointer to uint8_t array containing the audio sample; `bytes_per_sample` bytes are read
/// @param bytes_per_sample The number of bytes per sample (1 to 4)
/// @return Q31 sample, or 0 if `bytes_per_sample` is outside 1..4 (in which case `data` is not read)
inline int32_t unpack_audio_sample_to_q31(const uint8_t *data, size_t bytes_per_sample) {
  // Accumulate in an unsigned word so no bit is ever shifted into the sign position of a promoted `int`.
  uint32_t bits = 0;
  switch (bytes_per_sample) {
    case 1:
      bits = static_cast<uint32_t>(data[0]) << 24;
      break;
    case 2:
      bits = (static_cast<uint32_t>(data[0]) << 16) | (static_cast<uint32_t>(data[1]) << 24);
      break;
    case 3:
      bits = (static_cast<uint32_t>(data[0]) << 8) | (static_cast<uint32_t>(data[1]) << 16) |
             (static_cast<uint32_t>(data[2]) << 24);
      break;
    case 4:
      bits = static_cast<uint32_t>(data[0]) | (static_cast<uint32_t>(data[1]) << 8) |
             (static_cast<uint32_t>(data[2]) << 16) | (static_cast<uint32_t>(data[3]) << 24);
      break;
    default:
      break;  // Unsupported width: leave the result at zero.
  }
  return static_cast<int32_t>(bits);
}

/// @brief Packs a Q31 fixed-point number as an audio sample with the specified number of bytes per sample.
/// Packs the most significant bits - no dithering is applied.
/// @param sample Q31 fixed-point number to pack
/// @param data Pointer to data array to store the packed sample; `bytes_per_sample` bytes are written, least
///             significant byte first
/// @param bytes_per_sample The audio data's bytes per sample (1 to 4). Any other value writes nothing.
inline void pack_q31_as_audio_sample(int32_t sample, uint8_t *data, size_t bytes_per_sample) {
  // Extract bytes from an unsigned copy so the shifts never act on a negative signed value.
  const uint32_t bits = static_cast<uint32_t>(sample);
  switch (bytes_per_sample) {
    case 1:
      data[0] = static_cast<uint8_t>(bits >> 24);
      break;
    case 2:
      data[0] = static_cast<uint8_t>(bits >> 16);
      data[1] = static_cast<uint8_t>(bits >> 24);
      break;
    case 3:
      data[0] = static_cast<uint8_t>(bits >> 8);
      data[1] = static_cast<uint8_t>(bits >> 16);
      data[2] = static_cast<uint8_t>(bits >> 24);
      break;
    case 4:
      data[0] = static_cast<uint8_t>(bits);
      data[1] = static_cast<uint8_t>(bits >> 8);
      data[2] = static_cast<uint8_t>(bits >> 16);
      data[3] = static_cast<uint8_t>(bits >> 24);
      break;
    default:
      break;  // Unsupported width: leave the output untouched.
  }
}

}  // namespace audio
}  // namespace esphome