现在 TTS(文字生成声音)已经很成熟也很方便,比如 r.maifeipin.com 中的 RSS 语音播报功能,就是调用了免费的 Azure 接口。那么把音视频转文字的 STT 有哪些好的方案呢?Google 的 AI Studio 可以,而且支持多人、多场景、多角色模拟自动切换,非常惊艳,但只能试用或者升级收费。有没有免费好用的呢?当然就是 OpenAI 家的 Whisper。为了测试 Whisper,特意研究了一下直播源的采集和 PyTorch 环境部署,记录如下:

image-1748058921598

采集 直播流数据 在线电台

  • 使用的在线电台 https://www.bbc.co.uk/sounds/player/bbc_world_service

  • 在 WebView2 的 DevToolsProtocolEventReceived 事件中获取请求,发现主要有 mpd 和 m4s 两种请求,通过 AI 得知这是 DASH 直播流。先分析头文件(MPD 清单):

    https://a.files.bbci.co.uk/ms6/live/3441A116-B12E-4D2F-ACA8-C1984642FA4B/audio/simulcast/dash/nonuk/pc_hd_abr_v2/cfs/bbc_world_service.mpd

    <MPD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:mpeg:dash:schema:mpd:2011" xmlns:dvb="urn:dvb:dash:dash-extensions:2014-1"
          xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
          type="dynamic" availabilityStartTime="1969-12-31T23:59:44Z"
          minimumUpdatePeriod="PT6H" timeShiftBufferDepth="PT6H" maxSegmentDuration="PT7S" minBufferTime="PT3.200S"
          profiles="urn:dvb:dash:profile:dvb-dash:2014,urn:dvb:dash:profile:dvb-dash:isoff-ext-live:2014"
          publishTime="2025-02-04T16:18:01">
    
            <UTCTiming schemeIdUri="urn:mpeg:dash:utc:http-iso:2014" value="https://time.akamai.com/?iso" />
    
            <BaseURL dvb:priority="1" dvb:weight="1" serviceLocation="cfs">https://as-dash-ww.live.cf.md.bbci.co.uk/pool_87948813/live/ww/bbc_world_service/bbc_world_service.isml/dash/</BaseURL>
    
            <Period id="1" start="PT0S">
            <AdaptationSet group="1" contentType="audio" lang="en" minBandwidth="48000" maxBandwidth="96000"
                           segmentAlignment="true" audioSamplingRate="48000" mimeType="audio/mp4" codecs="mp4a.40.5" startWithSAP="1">
                <AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/>
                <Role schemeIdUri="urn:mpeg:dash:role:2011" value="main"/>
                <SegmentTemplate timescale="48000" initialization="bbc_world_service-$RepresentationID$.dash"
                               media="bbc_world_service-$RepresentationID$-$Number$.m4s" startNumber="1" duration="307200"/>
                <Representation id="audio=48000" bandwidth="48000"/>
                <Representation id="audio=96000" bandwidth="96000"/>
            </AdaptationSet>
    
          </Period>
        </MPD>
    
    
  • Deepseek给的分析+ 个人整理后,大约有这些概念:

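    Representation 码率(bandwidth)有 48000 和 96000 两档(audioSamplingRate 采样率为 48000)
    timeShiftBufferDepth 可回看的总时长为 6 小时(minimumUpdatePeriod 也是 6 小时,但它表示 MPD 的刷新间隔)
    分片时长 = SegmentTemplate 的 duration / timescale = 307200 / 48000 = 6.4 秒(不超过 maxSegmentDuration 的 7 秒)
    totalSegments 总片数 = 6 小时 / 6.4 秒 = 3375 个
    currentSegmentNumber = (当前时间 − availabilityStartTime) / segmentDuration
    currentSegmentNumber 就是当前分片 id,把它代入 media 模板中的 $Number$ 即得到当前分片 url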

  • c# 解析MPD

    /// <summary>
    /// Parses a DASH MPD manifest and extracts the values needed to download the
    /// 96000-bandwidth audio representation: base URL, representation id, the
    /// current live segment number, the number of segments in the time-shift
    /// window, and the initialization/media URL templates.
    /// </summary>
    /// <param name="mpdContent">Raw XML text of the MPD manifest.</param>
    /// <returns>
    /// A tuple of (baseUrl, representationId, currentSegmentNumber, totalSegments,
    /// initializationUrl, mediaTemplate, mediaPrefix). mediaPrefix is the part of
    /// the media template that precedes the first '$' placeholder.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown when BaseURL, the 96000-bandwidth Representation, timeShiftBufferDepth,
    /// or the SegmentTemplate initialization/media attributes are missing.
    /// </exception>
    private (string baseUrl, string representationId, long currentSegmentNumber, long totalSegments, string initializationUrl, string mediaTemplate, string mediaPrefix) GetMPDInfo(string mpdContent)
    {
        // Values used when the manifest does not carry a usable attribute. They are
        // the values this method previously hard-coded for the BBC World Service stream.
        const double FallbackSegmentSeconds = 6.4;
        const string FallbackAvailabilityStartTime = "1969-12-31T23:59:44Z";

        var doc = XDocument.Parse(mpdContent);
        XNamespace ns = "urn:mpeg:dash:schema:mpd:2011";

        // Base URL that every segment path is appended to.
        string baseUrl = doc.Descendants(ns + "BaseURL").FirstOrDefault()?.Value;
        if (string.IsNullOrEmpty(baseUrl))
        {
            throw new InvalidOperationException("BaseURL not found in MPD.");
        }

        // Pick the representation whose bandwidth attribute is exactly "96000".
        var representation = doc.Descendants(ns + "Representation")
                                .FirstOrDefault(rep => rep.Attribute("bandwidth")?.Value == "96000");
        string representationId = representation?.Attribute("id")?.Value;
        if (string.IsNullOrEmpty(representationId))
        {
            throw new InvalidOperationException("Representation for audio=96000 not found in MPD.");
        }

        // Segment length in seconds is duration / timescale (both on SegmentTemplate).
        // The duration attribute alone is expressed in timescale units, not seconds.
        var segmentTemplate = doc.Descendants(ns + "SegmentTemplate").FirstOrDefault();
        double segmentSeconds = FallbackSegmentSeconds;
        var invariant = System.Globalization.CultureInfo.InvariantCulture;
        var numberStyle = System.Globalization.NumberStyles.Float;
        string durationText = segmentTemplate?.Attribute("duration")?.Value;
        string timescaleText = segmentTemplate?.Attribute("timescale")?.Value ?? "1"; // timescale defaults to 1 when omitted
        if (double.TryParse(durationText, numberStyle, invariant, out double durationUnits)
            && double.TryParse(timescaleText, numberStyle, invariant, out double timescale)
            && durationUnits > 0
            && timescale > 0)
        {
            segmentSeconds = durationUnits / timescale;
        }

        // timeShiftBufferDepth is the length of the window that can still be downloaded.
        var mpdElement = doc.Descendants(ns + "MPD").FirstOrDefault();
        var timeShiftBufferDepthElement = mpdElement?.Attribute("timeShiftBufferDepth");
        if (timeShiftBufferDepthElement == null)
        {
            throw new InvalidOperationException("TimeShiftBufferDepth not found in MPD.");
        }
        TimeSpan totalTime = XmlConvert.ToTimeSpan(timeShiftBufferDepthElement.Value);

        // Read availabilityStartTime from the manifest instead of assuming one stream's value.
        // NOTE(review): DateTime parsing of a "...Z" value yields a local-time DateTime;
        // CalculateCurrentSegmentNumber is not visible here - confirm it accounts for that.
        string availabilityStartText = mpdElement?.Attribute("availabilityStartTime")?.Value;
        if (!DateTime.TryParse(availabilityStartText, out DateTime availabilityStartTime))
        {
            availabilityStartTime = DateTime.Parse(FallbackAvailabilityStartTime);
        }

        long currentSegmentNumber = CalculateCurrentSegmentNumber(availabilityStartTime, segmentSeconds);

        // Number of segments that fit into the time-shift window.
        long totalSegments = (long)(totalTime.TotalSeconds / segmentSeconds);

        // URL templates for the initialization segment and for the media segments.
        string initializationUrl = segmentTemplate?.Attribute("initialization")?.Value;
        string mediaTemplate = segmentTemplate?.Attribute("media")?.Value;
        if (string.IsNullOrEmpty(initializationUrl) || string.IsNullOrEmpty(mediaTemplate))
        {
            throw new InvalidOperationException("Initialization file URL or media URL template not found in SegmentTemplate.");
        }

        // Text before the first '$' placeholder, e.g. "bbc_world_service-".
        string mediaPrefix = mediaTemplate.Split('$')[0];

        return (baseUrl, representationId, currentSegmentNumber, totalSegments, initializationUrl, mediaTemplate, mediaPrefix);
    }
    
  • 下载头文件为mp4

      // Step 1: download the initialization segment and save it as "<representationId>.mp4".
    string initSegmentName = initializationUrl.Replace("$RepresentationID$", representationId);
    string initSegmentUrl = string.Concat(baseUrl, initSegmentName);
    Console.WriteLine("Downloading initialization segment: " + initSegmentUrl);
    DownloadFile(initSegmentUrl, representationId + ".mp4");
    
  • 下载分片文件为m4s,这是自动下载。

    // Step back from the newest segment, one segment per iteration, for at most
    // totalSegments iterations; stop at the first download that reports a 404.
    for (long stepsBack = 0; stepsBack < mpdInfo.totalSegments; stepsBack++)
    {
        long segmentNumber = mpdInfo.currentSegmentNumber - stepsBack;

        // Fill in the two template placeholders, representation first, then number.
        string withRepresentation = string.Concat(baseUrl, mediaTemplate)
            .Replace("$RepresentationID$", representationId);
        string segmentUrl = withRepresentation.Replace("$Number$", segmentNumber.ToString());
        string localFileName = $"{representationId}-{segmentNumber}.m4s";

        Console.WriteLine($"Downloading segment {segmentNumber}: {segmentUrl}");
        try
        {
            DownloadFile(segmentUrl, localFileName);
        }
        catch (Exception ex) when (ex.Message.Contains("404"))
        {
            // An expired segment means every older one is gone too, so give up here.
            Console.WriteLine($"Segment {segmentNumber} not found (expired), skipping...");
            break;
        }
    }
    
  • 在 DevToolsProtocolEventReceived 中的请求是实时的,更精确。按大小或时间段下载,精确实现不同大小的合并文件。

        CoreWebView2DevToolsProtocolEventReceiver receiver = webView21.CoreWebView2.GetDevToolsProtocolEventReceiver("Network.requestWillBeSent");
    // Inspect every outgoing request of the embedded browser: an .mpd manifest
    // triggers a manifest download plus DownloadAndMergeSegments, and each .m4s
    // request is downloaded to a local file as soon as the player asks for it.
    receiver.DevToolsProtocolEventReceived += async (sender, e) =>
    {
        try
        {
            var eventData = JObject.Parse(e.ParameterObjectAsJson);
            string url = eventData["request"]?["url"]?.ToString();

            if (url == null) return;

            // Manifest request
            if (url.EndsWith(".mpd"))
            {
                Console.WriteLine("捕获到 MPD 文件: " + url);
                string mpdContent = await DownloadText(url);
                DownloadAndMergeSegments(mpdContent);
            }
            // Media segment request
            else if (url.Contains(".m4s"))
            {
                Console.WriteLine("捕获到 m4s 请求: " + url);
                Uri uri = new Uri(url);
                string fileName = Path.GetFileName(uri.AbsolutePath);

                // Keep the existing naming scheme (the text after the first '='),
                // but fall back to the full file name when there is no '=' so the
                // segment is still saved instead of failing on the index lookup.
                string[] nameParts = fileName.Split('=');
                string localName = nameParts.Length > 1 && !string.IsNullOrEmpty(nameParts[1])
                    ? nameParts[1]
                    : fileName;

                DownloadFile(url, localName);
            }
        }
        catch (Exception ex)
        {
            // Never let a failure escape the event handler, but make it visible
            // instead of dropping it silently.
            Console.WriteLine("处理网络请求事件失败: " + ex);
        }
    };
    
    
  • 下载后的 头文件和分片
    image-1748060834816

合并直播流

  • 有坑!!!
    把上面的头文件和分片合并,这个对于流媒体小白来说太坑了。所有 AI 都回答用 ffmpeg 或者 MP4Box 之类的工具,怎么尝试都不对;后来看了 N_m3u8DL 项目源码,才知道原来只需要简单地把头文件流和分片文件流按顺序拼接起来就行了。包括上面对 MPD 的分析,也都来自对 N_m3u8DL 项目的调试结果,并不是权威资料或 MPEG 公开标准,感兴趣可自行研究。WebView2 的 DevToolsProtocolEventReceived 只能下载 mpd 中的 dash 和 m4s。ffmpeg 不能直接用这些分片转出直播流。

  • C# 合并 DASH 头文件和分片的实现:
    
    /// <summary>
    /// Builds "merged_output.mp4" in the audio folder by writing the initialization
    /// file ("audio=96000.mp4") first and then appending every "*.m4s" segment in
    /// ascending segment-number order.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnMerge_Click(object sender, EventArgs e)
    {
        // Segment number = the digits after the last '-' in the file name
        // (e.g. "audio=96000-272192.m4s"). Names without a number sort last.
        long SegmentNumberOf(string path)
        {
            string name = Path.GetFileNameWithoutExtension(path);
            string tail = name.Substring(name.LastIndexOf('-') + 1);
            return long.TryParse(tail, out long number) ? number : long.MaxValue;
        }

        string initFile = Path.Combine(audioFolder, "audio=96000.mp4");
        if (!File.Exists(initFile))
        {
            // Without the header the merged file is unplayable; stop before
            // creating an empty output file.
            MessageBox.Show("初始化文件未找到!");
            return;
        }

        string[] files = Directory.GetFiles(audioFolder, "*.m4s");

        // Order numerically rather than as text, so that e.g. segment 99999
        // comes before segment 100000; ties fall back to an ordinal name compare.
        Array.Sort(files, (left, right) =>
        {
            int byNumber = SegmentNumberOf(left).CompareTo(SegmentNumberOf(right));
            return byNumber != 0 ? byNumber : string.CompareOrdinal(left, right);
        });

        string outputFilePath = Path.Combine(audioFolder, "merged_output.mp4");
        using (Stream fileOutputStream = File.Open(outputFilePath, FileMode.Create, FileAccess.Write))
        {
            // Header first ...
            using (var inputStream = File.OpenRead(initFile))
            {
                inputStream.CopyTo(fileOutputStream);
            }

            // ... then the raw bytes of each segment, back to back.
            foreach (var inputFilePath in files)
            {
                using (var inputStream = File.OpenRead(inputFilePath))
                {
                    inputStream.CopyTo(fileOutputStream);
                }
            }
        }
    }
    

使用whisper 生成字幕

  • 准备本地模型运行 (P15V笔记本电脑T600 4g 驱动cuda12.9) pytorch 环境:
    pip install git+https://github.com/openai/whisper.git
    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

      C:\Users\chenl>python
     Python 3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)] on win32
     Type "help", "copyright", "credits" or "license" for more information.
     >>> import torch
     >>> print(torch.cuda.is_available())
     True
     >>> print(torch.cuda.get_device_name(0))
     NVIDIA T600 Laptop GPU
     >>>
    
  • whisper_transcribe.py

    """Transcribe an audio file with OpenAI Whisper and print the transcript to stdout.

    Usage: python whisper_transcribe.py <audio_path>
    """
    import sys

    import whisper


    def transcribe_audio(audio_path, model_name="base"):
        """Return the transcript text of ``audio_path``.

        ``model_name`` is passed to ``whisper.load_model``. The default "base" is a
        small model; "tiny" is smaller still if the hardware is limited.
        """
        model = whisper.load_model(model_name)
        result = model.transcribe(audio_path)
        return result['text']


    if __name__ == "__main__":
        # The audio path is the only required argument.
        if len(sys.argv) < 2:
            print("Please provide an audio file path.")
            sys.exit(1)

        audio_path = sys.argv[1]
        transcript = transcribe_audio(audio_path)
        print(transcript)
    
    
  • 生成STT,用c# 调用python

    /// <summary>
    /// Runs whisper_transcribe.py (located next to the executing assembly) on
    /// "merged_output.mp4" in the audio folder, shows the transcript in a
    /// TranscriptionForm and saves it as "transcription.txt" in the same folder.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnSTT_Click(object sender, EventArgs e)
    {
        string outputFilePath = Path.Combine(audioFolder, "merged_output.mp4");
        if (!File.Exists(outputFilePath))
        {
            MessageBox.Show("音频文件未找到!");
            return;
        }

        // The Python script is expected next to the application binary.
        string sourceDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
        string whisperScriptPath = Path.Combine(sourceDirectory, "whisper_transcribe.py");

        // Resolved through PATH.
        string pythonExePath = "python";

        ProcessStartInfo startInfo = new ProcessStartInfo
        {
            FileName = pythonExePath,
            Arguments = $"\"{whisperScriptPath}\" \"{outputFilePath}\"",
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            // Decode both pipes as UTF-8 so non-ASCII transcript text survives.
            StandardOutputEncoding = System.Text.Encoding.UTF8,
            StandardErrorEncoding = System.Text.Encoding.UTF8,
            UseShellExecute = false,
            CreateNoWindow = true
        };
        // Make Python write UTF-8 to the redirected pipes to match the decoders above.
        startInfo.EnvironmentVariables["PYTHONIOENCODING"] = "utf-8";

        try
        {
            using (Process process = new Process { StartInfo = startInfo })
            {
                // Drain stderr asynchronously while stdout is read synchronously;
                // reading only one redirected pipe can block the child once the
                // other pipe's buffer fills up.
                var errorText = new System.Text.StringBuilder();
                process.ErrorDataReceived += (s, args) =>
                {
                    if (args.Data != null)
                    {
                        errorText.AppendLine(args.Data);
                    }
                };

                process.Start();
                process.BeginErrorReadLine();

                // Everything the script prints to stdout is the transcript.
                string output = process.StandardOutput.ReadToEnd();
                process.WaitForExit();

                // A non-zero exit code means the script failed; show why instead
                // of presenting an empty transcript.
                if (process.ExitCode != 0)
                {
                    MessageBox.Show($"错误: Python 退出码 {process.ExitCode}\n{errorText}");
                    return;
                }

                TranscriptionForm.ShowTranscription(output);

                // Also keep a copy of the transcript on disk.
                string outputTextFilePath = Path.Combine(audioFolder, "transcription.txt");
                File.WriteAllText(outputTextFilePath, output);
            }
        }
        catch (Exception ex)
        {
            MessageBox.Show($"错误: {ex.Message}");
        }
    }
    

    image-1748085088874