Add audio capture with ffmpeg

Capture audio from arbitrary audio streams and resample to a specified
sample rate. Use the same interface as when capturing from a mic.
pull/1549/head
Radoslav Gerganov 2023-11-24 13:36:30 +02:00
parent 146169ec38
commit 7596ee8ae2
4 changed files with 409 additions and 6 deletions

View File

@ -345,9 +345,11 @@ clean:
#
CC_SDL=`sdl2-config --cflags --libs`
CC_FFMPEG=`pkg-config --cflags --libs libavformat libavcodec libswresample libavutil`
SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp
SRC_COMMON_FFMPEG = examples/common-ffmpeg.cpp
main: examples/main/main.cpp $(SRC_COMMON) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o main $(LDFLAGS)
@ -362,8 +364,8 @@ quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_FFMPEG) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_FFMPEG) $(WHISPER_OBJ) -o stream $(CC_FFMPEG) $(LDFLAGS)
command: examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp examples/grammar-parser.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

View File

@ -0,0 +1,329 @@
#include "common-ffmpeg.h"
audio_capture::audio_capture(int len_ms)
: m_len_ms(len_ms), m_running(false), m_initiated(false)
{
}
audio_capture::~audio_capture()
{
pause();
av_frame_free(&frame);
avcodec_close(soundCodecContext);
avcodec_free_context(&soundCodecContext);
av_packet_free(&packet);
avformat_close_input(&ctx);
if (dst_data) {
av_freep(&dst_data[0]);
}
av_freep(&dst_data);
swr_close(swr_ctx);
swr_free(&swr_ctx);
}
bool audio_capture::init(const char * url, int stream_id, int sample_rate)
{
av_log_set_level(AV_LOG_INFO);
int ret = 0;
ctx = NULL;
if ((ret = avformat_open_input(&ctx, url, NULL, NULL)) < 0) {
fprintf(stderr, "Cannot open input: %s\n", url);
return false;
}
if ((ret = avformat_find_stream_info(ctx, NULL)) < 0) {
fprintf(stderr, "Cannot find stream information\n");
return false;
}
int streamCount = ctx->nb_streams;
if (stream_id >= streamCount) {
fprintf(stderr, "Audio stream index out of range\n");
return false;
}
if (stream_id < 0) {
// use the first audio stream
for (int i = 0; i < streamCount; ++i) {
AVStream *stream = ctx->streams[i];
AVCodecParameters *codecpar = stream->codecpar;
AVMediaType codecType = codecpar->codec_type;
av_dump_format(ctx, i, url, 0);
if (codecType == AVMEDIA_TYPE_AUDIO) {
stream_id = i;
break;
}
}
}
if (stream_id < 0) {
fprintf(stderr, "No audio stream found\n");
return false;
}
AVStream *stream = ctx->streams[stream_id];
AVCodecParameters *codecpar = stream->codecpar;
AVCodecID codecId = codecpar->codec_id;
AVMediaType codecType = codecpar->codec_type;
if (codecType != AVMEDIA_TYPE_AUDIO) {
fprintf(stderr, "Stream %d is not an audio stream\n", stream_id);
return false;
}
m_stream_id = stream_id;
int src_rate = codecpar->sample_rate;
const AVCodec* soundCodec = avcodec_find_decoder(codecId);
if (!soundCodec) {
fprintf(stderr, "Cannot find codec\n");
return false;
}
soundCodecContext = avcodec_alloc_context3(soundCodec);
if (!soundCodecContext) {
fprintf(stderr, "Cannot allocate codec context\n");
return false;
}
if (avcodec_parameters_to_context(soundCodecContext, codecpar) < 0) {
fprintf(stderr, "Cannot initialize codec context\n");
return false;
}
if (avcodec_open2(soundCodecContext, soundCodec, NULL) < 0) {
fprintf(stderr, "Cannot open codec\n");
return false;
}
int64_t src_ch_layout = codecpar->channel_layout;
// resample to mono, float format and the specified sample rate
int dst_ch_layout = AV_CH_LAYOUT_MONO;
m_sample_rate = sample_rate;
m_audio.resize((m_sample_rate*m_len_ms)/1000);
// create resampler context
swr_ctx = swr_alloc();
if (!swr_ctx) {
fprintf(stderr, "Could not allocate resampler context\n");
return false;
}
// set resample options
av_opt_set_int(swr_ctx, "in_channel_layout", src_ch_layout, 0);
av_opt_set_int(swr_ctx, "in_sample_rate", src_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "in_sample_fmt", (AVSampleFormat)codecpar->format, 0);
av_opt_set_int(swr_ctx, "out_channel_layout", dst_ch_layout, 0);
av_opt_set_int(swr_ctx, "out_sample_rate", m_sample_rate, 0);
av_opt_set_sample_fmt(swr_ctx, "out_sample_fmt", AV_SAMPLE_FMT_FLT, 0);
// initialize the resampling context
if ((ret = swr_init(swr_ctx)) < 0) {
fprintf(stderr, "Failed to initialize the resampling context\n");
return false;
}
max_dst_nb_samples = 1024;
dst_linesize = 0;
dst_data = NULL;
ret = av_samples_alloc_array_and_samples(&dst_data, &dst_linesize, 1, max_dst_nb_samples, AV_SAMPLE_FMT_FLT, 0);
if (ret < 0) {
fprintf(stderr, "Could not allocate destination samples\n");
return false;
}
frame = av_frame_alloc();
if (!frame) {
fprintf(stderr, "Cannot allocate frame\n");
return false;
}
packet = av_packet_alloc();
if (!packet) {
fprintf(stderr, "Cannot allocate packet\n");
return false;
}
m_initiated = true;
return true;
}
bool audio_capture::decode_packet()
{
if (!m_initiated) {
return false;
}
int ret = 0;
if ((ret = av_read_frame(ctx, packet)) < 0) {
return false;
}
if (packet->stream_index != m_stream_id) {
av_packet_unref(packet);
return true;
}
ret = avcodec_send_packet(soundCodecContext, packet);
if (ret < 0) {
char errbuf[AV_ERROR_MAX_STRING_SIZE] = { 0 };
av_strerror(ret, errbuf, AV_ERROR_MAX_STRING_SIZE);
fprintf(stderr, "Error while sending a packet to the decoder: %s\n", errbuf);
return false;
}
while (ret >= 0) {
ret = avcodec_receive_frame(soundCodecContext, frame);
if (ret != 0) {
break;
}
int src_nb_samples = frame->nb_samples;
int dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx, m_sample_rate) + src_nb_samples, m_sample_rate, m_sample_rate, AV_ROUND_UP);
if (dst_nb_samples > max_dst_nb_samples) {
av_freep(&dst_data[0]);
ret = av_samples_alloc(dst_data, &dst_linesize, 1, dst_nb_samples, AV_SAMPLE_FMT_FLT, 1);
if (ret < 0) {
break;
}
max_dst_nb_samples = dst_nb_samples;
}
// convert frame data to destination format
ret = swr_convert(swr_ctx, dst_data, dst_nb_samples, (const uint8_t **)frame->data, src_nb_samples);
if (ret < 0) {
fprintf(stderr, "Error while converting\n");
return false;
}
int dst_bufsize = av_samples_get_buffer_size(&dst_linesize, 1, ret, AV_SAMPLE_FMT_FLT, 1);
if (dst_bufsize < 0) {
fprintf(stderr, "Could not get sample buffer size\n");
return false;
}
callback(dst_data[0], dst_bufsize);
av_frame_unref(frame);
}
av_packet_unref(packet);
return true;
}
void audio_capture::callback(uint8_t * stream, int len) {
if (!m_running) {
return;
}
size_t n_samples = len / sizeof(float);
if (n_samples > m_audio.size()) {
n_samples = m_audio.size();
stream += (len - (n_samples * sizeof(float)));
}
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_audio_pos + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - m_audio_pos;
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = m_audio.size();
} else {
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
}
}
}
bool audio_capture::clear() {
if (!m_initiated) {
fprintf(stderr, "%s: not initiated!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return false;
}
{
std::lock_guard<std::mutex> lock(m_mutex);
m_audio_pos = 0;
m_audio_len = 0;
}
return true;
}
void audio_capture::get(int ms, std::vector<float> & result) {
if (!m_initiated) {
fprintf(stderr, "%s: not initiated!\n", __func__);
return;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return;
}
result.clear();
{
std::lock_guard<std::mutex> lock(m_mutex);
if (ms <= 0) {
ms = m_len_ms;
}
size_t n_samples = (m_sample_rate * ms) / 1000;
if (n_samples > m_audio_len) {
n_samples = m_audio_len;
}
result.resize(n_samples);
int s0 = m_audio_pos - n_samples;
if (s0 < 0) {
s0 += m_audio.size();
}
if (s0 + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - s0;
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
} else {
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
}
}
}
static void audio_decoder(audio_capture * capture, std::atomic_bool & running)
{
while (running) {
if (!capture->decode_packet()) {
break;
}
}
}
bool audio_capture::resume()
{
if (!m_initiated) {
fprintf(stderr, "%s: not initiated!\n", __func__);
return false;
}
if (m_running) {
return true;
}
decode_thread = std::thread(audio_decoder, this, std::ref(m_running));
m_running = true;
return true;
}
bool audio_capture::pause()
{
if (!m_initiated) {
fprintf(stderr, "%s: not initiated!\n", __func__);
return false;
}
if (!m_running) {
return true;
}
m_running = false;
decode_thread.join();
return true;
}

View File

@ -0,0 +1,69 @@
#pragma once
extern "C" {
#include <libavutil/opt.h>
#include <libavutil/channel_layout.h>
#include <libavformat/avformat.h>
#include <libavcodec/avcodec.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
#include <atomic>
#include <cstdint>
#include <vector>
#include <mutex>
#include <thread>
//
// FFmpeg Audio capture
//
class audio_capture {
public:
audio_capture(int len_ms);
~audio_capture();
// open the url and use the audio stream with the specified id
// if stream_id < 0, use the first audio stream
// resample audio to the specified sample_rate
bool init(const char * url, int stream_id, int sample_rate);
// start decoding and resampling the audio stream in a separate thread
// keep last len_ms seconds of audio in a circular buffer
bool resume();
bool pause();
bool clear();
// decode and resample a single packet
bool decode_packet();
// callback to be called by the audio decoder thread
void callback(uint8_t * stream, int len);
// get audio data from the circular buffer
void get(int ms, std::vector<float> & audio);
private:
int m_len_ms = 0;
std::atomic_bool m_running;
bool m_initiated;
std::mutex m_mutex;
std::vector<float> m_audio;
size_t m_audio_pos = 0;
size_t m_audio_len = 0;
int m_stream_id = -1;
int m_sample_rate = 0;
int max_dst_nb_samples = 1024;
int dst_linesize = 0;
uint8_t **dst_data = NULL;
AVFormatContext *ctx;
AVCodecContext* soundCodecContext;
AVPacket *packet;
AVFrame* frame;
struct SwrContext *swr_ctx;
std::thread decode_thread;
};

View File

@ -2,7 +2,7 @@
//
// A very quick-n-dirty implementation serving mainly as a proof of concept.
//
#include "common-sdl.h"
#include "common-ffmpeg.h"
#include "common.h"
#include "whisper.h"
@ -51,6 +51,7 @@ struct whisper_params {
bool save_audio = false; // save audio to wav file
bool use_gpu = true;
std::string url = "http://localhost:5000/";
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_out;
@ -71,6 +72,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-u" || arg == "--url") { params.url = argv[++i]; }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
@ -108,6 +110,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -u URL, --url URL [%-7s] capture device ID\n", params.url.c_str());
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
@ -151,8 +154,8 @@ int main(int argc, char ** argv) {
// init audio
audio_async audio(params.length_ms);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
audio_capture audio(params.length_ms);
if (!audio.init(params.url.c_str(), -1, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
@ -243,7 +246,7 @@ int main(int argc, char ** argv) {
wavWriter.write(pcmf32_new.data(), pcmf32_new.size());
}
// handle Ctrl + C
is_running = sdl_poll_events();
//is_running = sdl_poll_events();
if (!is_running) {
break;