1、编译 libbaresip：https://github.com/baresip/baresip
Windows 下安装 CMake GUI 和 OpenSSL 即可，
用 CMake GUI 生成工程后编译即可。
创建项目并导入 lib，初始化可以参考 baresip wiki 中 “Using baresip as a library” 的说明：
https://github.com/baresip/baresip/wiki/Using-baresip-as-a-library
#pragma comment(lib, "Ws2_32.lib")
#pragma comment(lib, "winmm.lib")
#pragma comment(lib, "re-static.lib")
#pragma comment(lib, "libbaresip.lib")
#pragma comment(lib, "Qwave.lib")
#pragma comment(lib, "DbgHelp.lib")
#pragma comment(lib, "iphlpapi.lib")
#pragma comment(lib, "libssl64MDd.lib")
#pragma comment(lib, "libcrypto64MDd.lib")
#pragma comment(lib, "bcg729.lib")
#pragma comment(lib, "blibwebrtc.lib")
2、实现本地的 VAD + ASR：VAD 选用 sherpa-onnx（见 “VAD — sherpa 1.3 documentation”），并把它的 ASR 初始化和 baresip 的初始化放在一起完成。
#pragma comment(lib, "sherpa-onnx-c-api.lib")
// --- sherpa-onnx initialization: offline ASR (SenseVoice) + offline TTS (VITS) ---
// NOTE(review): `recognizer`, `tts` and the `out:` label are defined outside this
// snippet (presumably at file scope) — confirm in the full file.
int err = 0; //struct tmr tmr_quit;
const char* language = "auto";
const char* provider = "cpu";
// NOTE(review): absolute Windows paths are hard-coded; make them configurable
// for deployment.
const char* model_filename =
"G:/ai/sherpa-onnx/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
const char* tokens_filename =
"G:/ai/sherpa-onnx/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
// 1 = enable inverse text normalization (spoken numbers -> digits, etc.)
int32_t use_inverse_text_normalization = 1;
// SenseVoice model config (zero-initialized before filling, per C API convention)
SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
memset(&sense_voice_config, 0, sizeof(sense_voice_config));
sense_voice_config.model = model_filename;
sense_voice_config.language = language;
sense_voice_config.use_itn = use_inverse_text_normalization;
// Offline model config
SherpaOnnxOfflineModelConfig offline_model_config;
memset(&offline_model_config, 0, sizeof(offline_model_config));
offline_model_config.debug = 0;
offline_model_config.num_threads = 1;
offline_model_config.provider = provider;
offline_model_config.tokens = tokens_filename;
offline_model_config.sense_voice = sense_voice_config;
// Recognizer config
SherpaOnnxOfflineRecognizerConfig recognizer_config;
memset(&recognizer_config, 0, sizeof(recognizer_config));
recognizer_config.decoding_method = "greedy_search";
recognizer_config.model_config = offline_model_config;
recognizer = SherpaOnnxCreateOfflineRecognizer(&recognizer_config);
if (recognizer == NULL) {
fprintf(stderr, "Please check your recognizer config!\n");
goto out;
}
printf("asr model load ok\n");
// TTS config: Chinese VITS model (vits-zh-hf-fanchen-C)
SherpaOnnxOfflineTtsConfig config;
memset(&config, 0, sizeof(config));
config.model.vits.model = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx";
config.model.vits.lexicon = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/lexicon.txt";
config.model.vits.tokens = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/tokens.txt";
config.model.vits.dict_dir = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/dict";
config.model.num_threads = 1;
config.model.debug = false;
// comma-separated rule FSTs for phone/date/number text normalization
config.rule_fsts = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/phone.fst,G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/date.fst,G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/number.fst";
tts = SherpaOnnxCreateOfflineTts(&config);
if (tts == NULL) {
fprintf(stderr, "Please check your tts config!\n");
goto out;
}
printf("tts model load ok\n");
// initialize libre (baresip's underlying library) after the models are loaded
err = libre_init();
3、将 aufile 模块代码复制过来，把 WASAPI 采集代码放入该初始化中并创建一个线程，实现本地的 VAD + 语音识别：
// WASAPI capture loop: read capture packets, accumulate 640 samples (40 ms at
// 16 kHz), feed the silero VAD, then run offline ASR on each finished speech
// segment. Runs while st->run is set.
// NOTE(review): the enclosing function and the loop's closing brace are outside
// this snippet.
while (re_atomic_rlx(&st->run) ) {
// wait until the playback aubuf has drained before capturing more
if (aubuf_cur_size(st->aubuf) != 0) { continue; }
hr = IAudioCaptureClient_GetNextPacketSize(service,
&packet_sz);
CHECK_HR(hr, "wasapi/src: GetNextPacketSize failed");
if (packet_sz == 0) {
sys_msleep(5);
continue;
}
//num_frames varies between calls; inspect the actual output to confirm
hr = IAudioCaptureClient_GetBuffer(service, (BYTE**)&af.sampv,
&num_frames, &flags, NULL,
NULL);
CHECK_HR(hr, "wasapi/src: GetBuffer failed");
af.timestamp = tmr_jiffies_usec();
af.sampc = num_frames * format->nChannels;
//at 16 kHz the packet size stabilizes at 160 frames (10 ms)
// NOTE(review): packets != 160 frames are released and silently dropped, so
// audio captured before the size stabilizes is lost — confirm acceptable.
// Also: on the ==160 path no ReleaseBuffer is visible in this snippet;
// verify the packet is released later or packets will leak.
if (num_frames != 160) { hr = IAudioCaptureClient_ReleaseBuffer(service, num_frames); continue; }
int16_t* int16_buffer = reinterpret_cast<int16_t*>(af.sampv); // C++ cast: this file builds as C++
size_t available_space = 640 - buffer_size;
size_t samples_to_copy = (num_frames < available_space) ? num_frames : available_space;
memcpy(buffer + buffer_size, int16_buffer, samples_to_copy * sizeof(int16_t));
buffer_size += samples_to_copy;
if (buffer_size == 640) {
// 640 samples = one silero-vad window; convert int16 -> float for the VAD
ConvertToFloatSamples(buffer, float_samples, 640, 1);
SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, float_samples, vadConfig.silero_vad.window_size);
//if (SherpaOnnxVoiceActivityDetectorDetected(vad)) {
//printf("speech detected\n");
//}
// drain every completed speech segment and run offline recognition on it
while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
const SherpaOnnxSpeechSegment* segment = SherpaOnnxVoiceActivityDetectorFront(vad);
const SherpaOnnxOfflineStream* stream = SherpaOnnxCreateOfflineStream(mythread->recognizer);
SherpaOnnxAcceptWaveformOffline(stream, 16000, segment->samples, segment->n);
SherpaOnnxDecodeOfflineStream(mythread->recognizer, stream);
const SherpaOnnxOfflineRecognizerResult* result = SherpaOnnxGetOfflineStreamResult(stream);
if(result&& result->text && strlen(result->text) > 0){
float start = segment->start / 16000.0f;
float duration = segment->n / 16000.0f;
float stop = start + duration;
printf("%.3f -- %.3f: %s\n", start, stop, result->text);
// NOTE(review): result->text is destroyed a few lines below — if
// st->asrtext is a raw char* this pointer dangles; it must be a
// copying type (e.g. std::string). Confirm its declaration.
st->asrtext = result->text ;
}
SherpaOnnxDestroyOfflineRecognizerResult(result);
SherpaOnnxDestroyOfflineStream(stream);
SherpaOnnxDestroySpeechSegment(segment);
SherpaOnnxVoiceActivityDetectorPop(vad);
}
buffer_size = 0;
}
4、这里拿到 ASR 识别出的文本后翻译成英文，再用 sherpa-onnx 的 TTS 合成语音，另开一个线程处理。
参考 baresip 的 aufile 模块，将 TTS 的音频推入发送 buf（模块以改写后的名字 aupcm 注册，见下文注册步骤）：
// Push generated TTS audio into baresip's send aubuf as S16LE mono frames.
if(audio){
std::vector<int16_t> pcm_samples;
// convert sherpa-onnx float samples to 16-bit PCM
convert_float_to_pcm(audio->samples, audio->n, pcm_samples);
// NOTE(review): `vec` is not defined in this snippet — this probably
// intended to print the TTS input text; confirm and fix.
printf("tts send %s\n", vec.data());
//SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, "C:/Users/lilin/Desktop/test.wav");
struct auframe af;
// frame carries format metadata only (sampv NULL / sampc 0); the sample
// data travels in the mbuf below
auframe_init(&af, AUFMT_S16LE, NULL, 0, audio->sample_rate , 1);
struct mbuf* mb = NULL;
// NOTE(review): mem_deref(NULL) is a no-op — this line looks like a
// leftover and can be removed.
mem_deref(mb);
int lend = pcm_samples.size() * sizeof(int16_t);
mb = mbuf_alloc(lend);
if (!mb){
st->asrtext = "";
continue;
}
memcpy(mb->buf, pcm_samples.data(), lend);
mb->end = lend;
uint16_t* sampv = (uint16_t*)mb->buf;
for (size_t i = 0; i < mb->end / 2; i++) {
sampv[i] = sys_ltohs(sampv[i]); // little-endian to host byte order
}
aubuf_append_auframe(st->aubuf, mb, &af);
mem_deref(mb);
}
// release the TTS result once its samples have been copied out
SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
5、最后在拨打的 URL 中加入自己的模块参数：;audio_source=aupcm,16000
aupcm 是注入时注册的模块名：ausrc_register(&ausrc, baresip_ausrcl(), "aupcm", aufile_src_alloc);
拨打顺利的话，
对方听到的就是翻译后的 TTS 语音。
6、实现对方 PCM（8 kHz）的翻译：把 sndfile 模块的代码拿过来即可，
在 decode(struct aufilt_dec_st* st, struct auframe* af) 里可以拿到数据，
af->sampv 就是音频数据，放入 ASR 识别即可。
如果模块或流程、呼叫系统觉得麻烦可以到