baresip + sherpa-onnx: a custom audio source in a SIP client, local and remote ASR, and Chinese-to-English translation

1. Build libbaresip: https://github.com/baresip

On Windows, installing the CMake GUI and a Windows OpenSSL build is enough.

Generate the build files with the CMake GUI.

Create a project and link the libraries; the baresip wiki covers using baresip as a library and its initialization:

https://github.com/baresip/baresip/wiki/Using-baresip-as-a-library
 
#pragma comment(lib, "Ws2_32.lib") 
#pragma comment(lib, "winmm.lib") 
#pragma comment(lib, "re-static.lib") 
#pragma comment(lib, "libbaresip.lib") 
#pragma comment(lib, "Qwave.lib") 
#pragma comment(lib, "DbgHelp.lib")
#pragma comment(lib, "iphlpapi.lib") 
#pragma comment(lib, "libssl64MDd.lib") 
#pragma comment(lib, "libcrypto64MDd.lib") 

#pragma comment(lib, "bcg729.lib") 
#pragma comment(lib, "blibwebrtc.lib") 

2. Implement local VAD plus ASR. Use the VAD from the sherpa-onnx documentation ("VAD — sherpa 1.3 documentation") together with its ASR, and initialize both alongside baresip.

#pragma comment(lib, "sherpa-onnx-c-api.lib") 


    int err = 0; //struct tmr tmr_quit;
    const char* language = "auto";
    const char* provider = "cpu";
    const char* model_filename =
        "G:/ai/sherpa-onnx/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx";
    const char* tokens_filename =
        "G:/ai/sherpa-onnx/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt";
    int32_t use_inverse_text_normalization = 1;
    SherpaOnnxOfflineSenseVoiceModelConfig sense_voice_config;
    memset(&sense_voice_config, 0, sizeof(sense_voice_config));
    sense_voice_config.model = model_filename;
    sense_voice_config.language = language;
    sense_voice_config.use_itn = use_inverse_text_normalization;

    // Offline model config
    SherpaOnnxOfflineModelConfig offline_model_config;
    memset(&offline_model_config, 0, sizeof(offline_model_config));
    offline_model_config.debug = 0;
    offline_model_config.num_threads = 1;
    offline_model_config.provider = provider;
    offline_model_config.tokens = tokens_filename;
    offline_model_config.sense_voice = sense_voice_config;

    // Recognizer config
    SherpaOnnxOfflineRecognizerConfig recognizer_config;
    memset(&recognizer_config, 0, sizeof(recognizer_config));
    recognizer_config.decoding_method = "greedy_search";
    recognizer_config.model_config = offline_model_config;

    recognizer = SherpaOnnxCreateOfflineRecognizer(&recognizer_config);  // global, shared with the capture thread
     
    if (recognizer == NULL) {
        fprintf(stderr, "Please check your recognizer config!\n"); 
        goto out;
    }

    printf("asr model load ok\n");
    SherpaOnnxOfflineTtsConfig config;
    memset(&config, 0, sizeof(config));
    config.model.vits.model = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx";
    config.model.vits.lexicon = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/lexicon.txt";
    config.model.vits.tokens = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/tokens.txt";
    config.model.vits.dict_dir = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/dict";
    config.model.num_threads = 1;
    config.model.debug = false;
    config.rule_fsts = "G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/phone.fst,G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/date.fst,G:/ai/sherpa-onnx/vits-zh-hf-fanchen-C/number.fst";

    tts = SherpaOnnxCreateOfflineTts(&config); 
    if (tts == NULL) {
        fprintf(stderr, "Please check your tts config!\n");
        goto out;
    }
    printf("tts model load ok\n");

    err = libre_init();
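
The silero VAD used by the capture thread in step 3 is created in the same place. A minimal sketch (the model path and thresholds are assumptions; tune them to taste):

    SherpaOnnxVadModelConfig vadConfig;
    memset(&vadConfig, 0, sizeof(vadConfig));
    vadConfig.silero_vad.model = "G:/ai/sherpa-onnx/silero_vad.onnx";  // assumed path
    vadConfig.silero_vad.threshold = 0.5f;
    vadConfig.silero_vad.min_silence_duration = 0.25f;
    vadConfig.silero_vad.min_speech_duration = 0.25f;
    vadConfig.silero_vad.window_size = 512;   // silero window at 16 kHz
    vadConfig.sample_rate = 16000;
    vadConfig.num_threads = 1;

    vad = SherpaOnnxCreateVoiceActivityDetector(&vadConfig, 30.0f /* buffer, seconds */);
    if (vad == NULL) {
        fprintf(stderr, "Please check your vad config!\n");
        goto out;
    }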

3. Copy the aufile module code over, drop the wasapi capture code into its initialization, and create a thread that runs local VAD + speech recognition.

while (re_atomic_rlx(&st->run)) {

        // skip capture while the send buffer still holds queued audio
        if (aubuf_cur_size(st->aubuf) != 0) { continue; }

        hr = IAudioCaptureClient_GetNextPacketSize(service,
            &packet_sz);
        CHECK_HR(hr, "wasapi/src: GetNextPacketSize failed");

        if (packet_sz == 0) {
            sys_msleep(5);
            continue;
        }
        //num_frames varies per packet; check the actual output
        hr = IAudioCaptureClient_GetBuffer(service, (BYTE**)&af.sampv,
            &num_frames, &flags, NULL,
            NULL);
        CHECK_HR(hr, "wasapi/src: GetBuffer failed");

        af.timestamp = tmr_jiffies_usec();
        af.sampc = num_frames * format->nChannels;

        // at 16 kHz the packet size settles at 160 frames (10 ms)
        if (num_frames != 160) {
            hr = IAudioCaptureClient_ReleaseBuffer(service, num_frames);
            continue;
        }
        int16_t* int16_buffer = reinterpret_cast<int16_t*>(af.sampv);
        size_t available_space = 640 - buffer_size;
        size_t samples_to_copy = (num_frames < available_space) ? num_frames : available_space;
        memcpy(buffer + buffer_size, int16_buffer, samples_to_copy * sizeof(int16_t));
        buffer_size += samples_to_copy;

        // release the WASAPI packet as soon as its samples have been copied out
        hr = IAudioCaptureClient_ReleaseBuffer(service, num_frames);
        CHECK_HR(hr, "wasapi/src: ReleaseBuffer failed");

        if (buffer_size == 640) {
            ConvertToFloatSamples(buffer, float_samples, 640, 1);
            // feed the 640 samples just converted; the VAD buffers internally
            // until it has a full silero window
            SherpaOnnxVoiceActivityDetectorAcceptWaveform(vad, float_samples, 640);
            //if (SherpaOnnxVoiceActivityDetectorDetected(vad)) {
            //    printf("speech detected\n");
            //}
            while (!SherpaOnnxVoiceActivityDetectorEmpty(vad)) {
                const SherpaOnnxSpeechSegment* segment = SherpaOnnxVoiceActivityDetectorFront(vad);
                const SherpaOnnxOfflineStream* stream = SherpaOnnxCreateOfflineStream(mythread->recognizer);

                SherpaOnnxAcceptWaveformOffline(stream, 16000, segment->samples, segment->n);

                SherpaOnnxDecodeOfflineStream(mythread->recognizer, stream);

                const SherpaOnnxOfflineRecognizerResult* result = SherpaOnnxGetOfflineStreamResult(stream);
                if (result && result->text && strlen(result->text) > 0) {
                    float start = segment->start / 16000.0f;
                    float duration = segment->n / 16000.0f;
                    float stop = start + duration;
                    printf("%.3f -- %.3f: %s\n", start, stop, result->text);
                    // NOTE: result->text is freed just below; if asrtext is a
                    // raw char* (not std::string), copy the string here instead
                    st->asrtext = result->text;
                }
                SherpaOnnxDestroyOfflineRecognizerResult(result);
                SherpaOnnxDestroyOfflineStream(stream);

                SherpaOnnxDestroySpeechSegment(segment);
                SherpaOnnxVoiceActivityDetectorPop(vad);
            }
            buffer_size = 0;
        }
}
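
ConvertToFloatSamples above is a small helper that is not part of either library; a plausible definition (mono, int16 → float in [-1, 1]):

    static void ConvertToFloatSamples(const int16_t* in, float* out,
                                      size_t n, int channels)
    {
        (void)channels;  // capture is mono here
        for (size_t i = 0; i < n; ++i)
            out[i] = in[i] / 32768.0f;
    }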

4. Take the recognized text, translate it into English, and synthesize it back to speech, again with sherpa-onnx TTS; handle this in a separate thread.

Following baresip's aufile module, push the TTS audio into the send buffer of the custom source (registered under the name aupcm, an adaptation of aufile; see step 5).
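
The `audio` object in the snippet below comes from SherpaOnnxOfflineTtsGenerate. A hedged sketch of the worker loop up to that point (translate_to_english is a hypothetical placeholder for whatever MT engine or API you use):

    // In the TTS worker thread: pick up new ASR text, translate, synthesize.
    const char* text = st->asrtext;   // set by the ASR thread
    if (text && *text) {
        std::string english = translate_to_english(text);  // hypothetical MT call
        const SherpaOnnxGeneratedAudio* audio =
            SherpaOnnxOfflineTtsGenerate(tts, english.c_str(),
                                         0 /* speaker id */, 1.0f /* speed */);
        // ...the push-to-aubuf code below then runs with this `audio`...
    }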

if (audio) {
            std::vector<int16_t> pcm_samples;
            convert_float_to_pcm(audio->samples, audio->n, pcm_samples);
            printf("tts send %d samples\n", audio->n);
            //SherpaOnnxWriteWave(audio->samples, audio->n, audio->sample_rate, "C:/Users/lilin/Desktop/test.wav");

            struct auframe af;
            auframe_init(&af, AUFMT_S16LE, NULL, 0, audio->sample_rate , 1);
            struct mbuf* mb = NULL;
            size_t lend = pcm_samples.size() * sizeof(int16_t);
            mb = mbuf_alloc(lend);
            if (!mb){ 
                st->asrtext = "";
                continue;
            }
            memcpy(mb->buf, pcm_samples.data(), lend);

            mb->end = lend;

            uint16_t* sampv = (uint16_t*)mb->buf;

            for (size_t i = 0; i < mb->end / 2; i++) {
                sampv[i] = sys_ltohs(sampv[i]); // little-endian to host byte order
            }

            aubuf_append_auframe(st->aubuf, mb, &af);
            mem_deref(mb);
        }
        SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio);
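
convert_float_to_pcm is another helper that neither library provides; a plausible definition with clipping:

    static void convert_float_to_pcm(const float* in, int32_t n,
                                     std::vector<int16_t>& out)
    {
        out.resize(n);
        for (int32_t i = 0; i < n; ++i) {
            float s = in[i];
            if (s >  1.0f) s =  1.0f;   // clip to avoid wrap-around
            if (s < -1.0f) s = -1.0f;
            out[i] = (int16_t)(s * 32767.0f);
        }
    }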

5. Finally, add the custom source to the registration URL: ;audio_source=aupcm,16000

aupcm is the registered name: ausrc_register(&ausrc, baresip_ausrcl(), "aupcm", aufile_src_alloc);
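
In the accounts file that looks roughly like this (address and password are placeholders):

    <sip:1001@192.168.1.10>;auth_pass=secret;audio_source=aupcm,16000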

If the call sets up correctly, the remote party hears the translated TTS audio.

6. Implement translation of the remote party's PCM (8 kHz here); copying the code from the sndfile module is enough.

The decoded data arrives in decode(struct aufilt_dec_st* st, struct auframe* af): af->sampv holds the samples; just feed them into the ASR.
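
A minimal sketch of that decode handler, modeled on the sndfile module (push_to_asr_queue is a hypothetical hand-off to the ASR thread; sherpa-onnx offline streams resample internally, so the 8 kHz frames can be passed with their true sample rate):

    static int decode(struct aufilt_dec_st* st, struct auframe* af)
    {
        if (!st || !af || af->fmt != AUFMT_S16LE)
            return 0;

        const int16_t* pcm = (const int16_t*)af->sampv;

        // convert to float in [-1, 1] for the VAD/ASR
        std::vector<float> f(af->sampc);
        for (size_t i = 0; i < af->sampc; ++i)
            f[i] = pcm[i] / 32768.0f;

        push_to_asr_queue(f.data(), f.size(), af->srate);  // hypothetical queue helper
        return 0;
    }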

If building the modules, the workflow, or the whole calling system yourself is too much trouble, see

https://item.taobao.com/item.htm?id=653611115230
