用WebView 采集直播流，whisper 生成字幕。

现在 TTS 文字生成声音很成熟也很方便，比如 r.maifeipin.com 中的 RSS 语音播报功能，就是调用免费的 Azure 接口。那么把音视频转文字的 STT 有哪些好的方案呢？Google 的 AI Studio 可以，而且支持多人多场景多角色模拟自动切换，非常惊艳，但这个只能试用或者升级收费。有没有免费好用的呢？当然就是 OpenAI 家的 whisper。为了测试 whisper，特意研究了一下直播源的采集和 pytorch 环境部署，记录如下：

采集直播流数据在线电台

使用的在线电台 https://www.bbc.co.uk/sounds/player/bbc_world_service

在 WebView2 的 DevToolsProtocolEventReceived 事件中获取请求，发现主要有 mpd 和 m4s 两种请求，通过 AI 知道这是 DASH 直播流，先分析头文件（MPD 清单）：

https://a.files.bbci.co.uk/ms6/live/3441A116-B12E-4D2F-ACA8-C1984642FA4B/audio/simulcast/dash/nonuk/pc_hd_abr_v2/cfs/bbc_world_service.mpd

<MPD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:mpeg:dash:schema:mpd:2011" xmlns:dvb="urn:dvb:dash:dash-extensions:2014-1"
      xsi:schemaLocation="urn:mpeg:dash:schema:mpd:2011 http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-DASH_schema_files/DASH-MPD.xsd"
      type="dynamic" availabilityStartTime="1969-12-31T23:59:44Z"
      minimumUpdatePeriod="PT6H" timeShiftBufferDepth="PT6H" maxSegmentDuration="PT7S" minBufferTime="PT3.200S"
      profiles="urn:dvb:dash:profile:dvb-dash:2014,urn:dvb:dash:profile:dvb-dash:isoff-ext-live:2014"
      publishTime="2025-02-04T16:18:01">

        <UTCTiming schemeIdUri="urn:mpeg:dash:utc:http-iso:2014" value="https://time.akamai.com/?iso" />

        <BaseURL dvb:priority="1" dvb:weight="1" serviceLocation="cfs">https://as-dash-ww.live.cf.md.bbci.co.uk/pool_87948813/live/ww/bbc_world_service/bbc_world_service.isml/dash/</BaseURL>

        <Period id="1" start="PT0S">
        <AdaptationSet group="1" contentType="audio" lang="en" minBandwidth="48000" maxBandwidth="96000"
                       segmentAlignment="true" audioSamplingRate="48000" mimeType="audio/mp4" codecs="mp4a.40.5" startWithSAP="1">
            <AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/>
            <Role schemeIdUri="urn:mpeg:dash:role:2011" value="main"/>
            <SegmentTemplate timescale="48000" initialization="bbc_world_service-$RepresentationID$.dash"
                           media="bbc_world_service-$RepresentationID$-$Number$.m4s" startNumber="1" duration="307200"/>
            <Representation id="audio=48000" bandwidth="48000"/>
            <Representation id="audio=96000" bandwidth="96000"/>
        </AdaptationSet>

      </Period>
    </MPD>

Deepseek给的分析+ 个人整理后，大约有这些概念：

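Representation 码率（bandwidth）有 48000 和 96000 两档（音频采样率 audioSamplingRate 为 48000）
timeShiftBufferDepth 可回看的时移窗口为 6 小时（minimumUpdatePeriod 同样是 6 小时）
分片时长 = SegmentTemplate 的 duration / timescale = 307200 / 48000 = 6.4 秒（不超过 maxSegmentDuration 的 7 秒）
totalSegments 总片 3375 个（6 小时 ÷ 6.4 秒）
currentSegmentNumber ≈ (当前时间 - availabilityStartTime) / segmentDuration
currentSegmentNumber 就是当前分片 id，代入 media 模板中的 $Number$ 即可得到当前分片 url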

c# 解析MPD

/// <summary>
/// Parses a DASH MPD manifest and extracts what is needed to download the
/// bandwidth-96000 audio representation: base URL, representation id, the
/// current segment number, the number of segments inside the time-shift
/// window, and the initialization/media URL templates.
/// </summary>
/// <param name="mpdContent">Raw XML text of the MPD manifest.</param>
/// <returns>
/// A tuple of (baseUrl, representationId, currentSegmentNumber, totalSegments,
/// initializationUrl, mediaTemplate, mediaPrefix). <c>mediaPrefix</c> is the
/// part of the media template before its first '$'.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when BaseURL, the bandwidth-96000 Representation, the
/// timeShiftBufferDepth attribute, or the SegmentTemplate
/// initialization/media attributes are missing.
/// </exception>
private (string baseUrl, string representationId, long currentSegmentNumber, long totalSegments, string initializationUrl, string mediaTemplate, string mediaPrefix) GetMPDInfo(string mpdContent)
{
    // Values previously hard-coded for the BBC World Service manifest; now used
    // only when the manifest does not supply usable attributes.
    const double FallbackSegmentSeconds = 6.4;
    const string FallbackAvailabilityStartTime = "1969-12-31T23:59:44Z";

    // MPD attributes are machine-readable, so parse them culture-independently.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    var doc = XDocument.Parse(mpdContent);
    XNamespace ns = "urn:mpeg:dash:schema:mpd:2011";

    // Base URL that segment file names are appended to.
    string baseUrl = doc.Descendants(ns + "BaseURL").FirstOrDefault()?.Value;
    if (string.IsNullOrEmpty(baseUrl))
    {
        throw new InvalidOperationException("BaseURL not found in MPD.");
    }

    // Pick the representation whose bandwidth attribute is exactly "96000".
    var representation = doc.Descendants(ns + "Representation")
                            .FirstOrDefault(rep => rep.Attribute("bandwidth")?.Value == "96000");
    string representationId = representation?.Attribute("id")?.Value;
    if (string.IsNullOrEmpty(representationId))
    {
        throw new InvalidOperationException("Representation for audio=96000 not found in MPD.");
    }

    // Segment duration in seconds = duration / timescale. The duration attribute
    // is expressed in timescale units (307200 / 48000 = 6.4 s in the sample
    // manifest), so it must not be read as seconds directly.
    var segmentTemplate = doc.Descendants(ns + "SegmentTemplate").FirstOrDefault();
    double segmentSeconds = FallbackSegmentSeconds;
    if (double.TryParse(segmentTemplate?.Attribute("duration")?.Value,
                        System.Globalization.NumberStyles.Float, invariant, out double durationUnits)
        && durationUnits > 0)
    {
        // A missing or invalid timescale is treated as 1 unit per second.
        if (!double.TryParse(segmentTemplate?.Attribute("timescale")?.Value,
                             System.Globalization.NumberStyles.Float, invariant, out double timescale)
            || timescale <= 0)
        {
            timescale = 1;
        }
        segmentSeconds = durationUnits / timescale;
    }
    TimeSpan segmentDuration = TimeSpan.FromSeconds(segmentSeconds);

    // Time-shift window (total time that can still be requested).
    var mpdElement = doc.Descendants(ns + "MPD").FirstOrDefault();
    var timeShiftBufferDepthAttribute = mpdElement?.Attribute("timeShiftBufferDepth");
    if (timeShiftBufferDepthAttribute == null)
    {
        throw new InvalidOperationException("TimeShiftBufferDepth not found in MPD.");
    }
    TimeSpan totalTime = XmlConvert.ToTimeSpan(timeShiftBufferDepthAttribute.Value);

    // Read availabilityStartTime from the manifest rather than assuming a fixed value.
    // NOTE(review): parsed with default DateTimeStyles, exactly as the old hard-coded
    // literal was, so the DateTime handed to CalculateCurrentSegmentNumber keeps the
    // same Kind as before - confirm that helper's expectations.
    string availabilityStartValue = mpdElement?.Attribute("availabilityStartTime")?.Value;
    if (string.IsNullOrEmpty(availabilityStartValue))
    {
        availabilityStartValue = FallbackAvailabilityStartTime;
    }
    DateTime availabilityStartTime = DateTime.Parse(availabilityStartValue, invariant);

    // Current segment number, derived from the availability start time and segment length.
    long currentSegmentNumber = CalculateCurrentSegmentNumber(availabilityStartTime, segmentDuration.TotalSeconds);

    // Number of segments that fit into the time-shift window.
    long totalSegments = (long)(totalTime.TotalSeconds / segmentDuration.TotalSeconds);

    // URL templates for the initialization segment and the media segments.
    string initializationUrl = segmentTemplate?.Attribute("initialization")?.Value;
    string mediaTemplate = segmentTemplate?.Attribute("media")?.Value;
    if (string.IsNullOrEmpty(initializationUrl) || string.IsNullOrEmpty(mediaTemplate))
    {
        throw new InvalidOperationException("Initialization file URL or media URL template not found in SegmentTemplate.");
    }

    // Text before the first '$' placeholder, e.g. "bbc_world_service-".
    string mediaPrefix = mediaTemplate.Split('$')[0];

    return (baseUrl, representationId, currentSegmentNumber, totalSegments, initializationUrl, mediaTemplate, mediaPrefix);
}

下载头文件为mp4

  // Step 1: download the initialization segment. Its name is the initialization
  // template with $RepresentationID$ substituted, appended to the base URL.
  string initSegmentUrl = baseUrl + initializationUrl.Replace("$RepresentationID$", representationId);
  Console.WriteLine("Downloading initialization segment: " + initSegmentUrl);
  DownloadFile(initSegmentUrl, representationId + ".mp4");

下载分片文件为m4s，这是自动下载。

// Only $Number$ changes per segment, so substitute the representation id once up front.
string segmentUrlPattern = (baseUrl + mediaTemplate).Replace("$RepresentationID$", representationId);

// Walk backwards from the newest segment, at most totalSegments steps.
for (long stepsBack = 0; stepsBack < mpdInfo.totalSegments; stepsBack++)
{
    long segmentNumber = mpdInfo.currentSegmentNumber - stepsBack;
    string segmentUrl = segmentUrlPattern.Replace("$Number$", segmentNumber.ToString());
    string localFileName = $"{representationId}-{segmentNumber}.m4s";

    Console.WriteLine($"Downloading segment {segmentNumber}: {segmentUrl}");
    try
    {
        DownloadFile(segmentUrl, localFileName);
    }
    catch (Exception ex) when (ex.Message.Contains("404"))
    {
        // A 404 is taken to mean the segment has expired, so older ones are not tried.
        Console.WriteLine($"Segment {segmentNumber} not found (expired), skipping...");
        break;
    }
}

在 DevToolsProtocolEventReceived 中的请求是实时的，更精确。按大小或时间段下载，精确实现不同大小的合并文件。

    // Subscribe to the DevTools "Network.requestWillBeSent" event so that every
    // request issued by the page can be inspected as it happens.
    CoreWebView2DevToolsProtocolEventReceiver receiver = webView21.CoreWebView2.GetDevToolsProtocolEventReceiver("Network.requestWillBeSent");
    receiver.DevToolsProtocolEventReceived += async (sender, e) =>
    {
        try
        {
            var eventData = JObject.Parse(e.ParameterObjectAsJson);
            string url = eventData["request"]?["url"]?.ToString();

            if (string.IsNullOrEmpty(url)) return;

            // MPD manifest: download its text, then download and merge its segments.
            if (url.EndsWith(".mpd", StringComparison.Ordinal))
            {
                Console.WriteLine("捕获到 MPD 文件: " + url);
                string mpdContent = await DownloadText(url);
                DownloadAndMergeSegments(mpdContent);
            }
            // Individual .m4s media segment requested by the player.
            else if (url.Contains(".m4s"))
            {
                Console.WriteLine("捕获到 m4s 请求: " + url);
                string fileName = Path.GetFileName(new Uri(url).AbsolutePath);

                // Local name is the part after the first '=' (e.g. "96000-123.m4s").
                // Fall back to the whole file name when there is no '=' so the
                // segment is still saved instead of failing on a missing index.
                string[] nameParts = fileName.Split('=');
                string localName = nameParts.Length > 1 && nameParts[1].Length > 0
                    ? nameParts[1]
                    : fileName;

                DownloadFile(url, localName);
            }
        }
        catch (Exception ex)
        {
            // This is an async event handler, so an escaping exception would be
            // unobservable; report it instead of swallowing it silently.
            Console.WriteLine("处理网络请求事件失败: " + ex);
        }
    };

下载后的头文件和分片

合并直播流

有坑！！！
把上面的头文件和分片合并，这个对于流媒体小白来说太坑了，所有 AI 都回答用 ffmpeg 或者 MP4Box 之类工具，怎么尝试都不对，后来看了 N_m3u8DL 项目源码，才知道原来只需要简单地用头文件流合并分片文件流就行了，包括上面的 MPD 的分析都来自对 N_m3u8DL 项目的调试结果，并不是权威或 MPEG 公开标准，感兴趣可自行研究。WebView2 的 DevToolsProtocolEventReceived 只能下载 mpd 中的 dash 和 m4s。ffmpeg 不能直接用分片转直播流的。

C# 合并 DASH 头文件和分片的实现：

/// <summary>
/// Builds "merged_output.mp4" in <c>audioFolder</c> by writing the
/// initialization file ("audio=96000.mp4") first and then appending every
/// "*.m4s" segment in ascending segment-number order.
/// </summary>
private void btnMerge_Click(object sender, EventArgs e)
{
    string initFile = Path.Combine(audioFolder, "audio=96000.mp4");
    if (!File.Exists(initFile))
    {
        // Check before creating the output so a missing header does not leave
        // an empty merged file behind.
        MessageBox.Show("初始化文件未找到！");
        return;
    }

    string[] files = Directory.GetFiles(audioFolder, "*.m4s");
    // Order by the numeric segment number; plain string sorting would put
    // "...-100.m4s" before "...-99.m4s".
    Array.Sort(files, CompareBySegmentNumber);

    string outputFilePath = Path.Combine(audioFolder, "merged_output.mp4");
    using (Stream fileOutputStream = File.Open(outputFilePath, FileMode.Create, FileAccess.Write))
    {
        // The initialization segment must come first.
        using (var inputStream = File.OpenRead(initFile))
        {
            inputStream.CopyTo(fileOutputStream);
        }

        foreach (var inputFilePath in files)
        {
            using (var inputStream = File.OpenRead(inputFilePath))
            {
                inputStream.CopyTo(fileOutputStream);
            }
        }
    }
}

// Orders two segment paths by trailing segment number, then ordinally by path.
private static int CompareBySegmentNumber(string left, string right)
{
    int byNumber = GetTrailingSegmentNumber(left).CompareTo(GetTrailingSegmentNumber(right));
    return byNumber != 0 ? byNumber : string.CompareOrdinal(left, right);
}

// Returns the run of ASCII digits that ends the file name (extension removed),
// or long.MaxValue when there is none, so unnumbered files sort last.
private static long GetTrailingSegmentNumber(string path)
{
    string name = Path.GetFileNameWithoutExtension(path);
    int start = name.Length;
    while (start > 0 && name[start - 1] >= '0' && name[start - 1] <= '9')
    {
        start--;
    }

    long number;
    return long.TryParse(name.Substring(start),
                         System.Globalization.NumberStyles.None,
                         System.Globalization.CultureInfo.InvariantCulture,
                         out number)
        ? number
        : long.MaxValue;
}

使用whisper 生成字幕

准备本地模型运行 (P15V笔记本电脑T600 4g 驱动cuda12.9) pytorch 环境：
pip install git+https://github.com/openai/whisper.git
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

  C:\Users\chenl>python
 Python 3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)] on win32
 Type "help", "copyright", "credits" or "license" for more information.
 >>> import torch
 >>> print(torch.cuda.is_available())
 True
 >>> print(torch.cuda.get_device_name(0))
 NVIDIA T600 Laptop GPU
 >>>

whisper_transcribe.py

"""Transcribe one audio file with Whisper and print the text to stdout.

Usage: python whisper_transcribe.py <audio file path>
"""
import sys

import whisper


def transcribe_audio(audio_path):
    """Load the Whisper "base" model, transcribe audio_path, return the text."""
    # "base" is one of the small models; "tiny" is smaller still and may suit
    # weaker hardware.
    model = whisper.load_model("base")
    result = model.transcribe(audio_path)
    return result['text']


if __name__ == "__main__":
    # The audio path is the first command-line argument; exit with status 1 if absent.
    if len(sys.argv) < 2:
        print("Please provide an audio file path.")
        sys.exit(1)

    audio_path = sys.argv[1]
    transcript = transcribe_audio(audio_path)
    print(transcript)

生成STT，用c# 调用python

/// <summary>
/// Runs whisper_transcribe.py (located next to the executing assembly) on
/// "merged_output.mp4" in <c>audioFolder</c>, shows the transcript in a
/// TranscriptionForm and saves it to "transcription.txt".
/// </summary>
private void btnSTT_Click(object sender, EventArgs e)
{
    string outputFilePath = Path.Combine(audioFolder, "merged_output.mp4");
    // Nothing to transcribe if the merged audio file is missing.
    if (!File.Exists(outputFilePath))
    {
        MessageBox.Show("音频文件未找到！");
        return;
    }

    // The Python script is expected next to the running assembly.
    string sourceDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
    string whisperScriptPath = Path.Combine(sourceDirectory, "whisper_transcribe.py");

    // Relies on "python" being resolvable through the PATH environment variable.
    string pythonExePath = "python";

    ProcessStartInfo startInfo = new ProcessStartInfo
    {
        FileName = pythonExePath,
        Arguments = $"\"{whisperScriptPath}\" \"{outputFilePath}\"",  // script path, then audio path
        RedirectStandardOutput = true,
        RedirectStandardError = true,   // capture Python errors instead of losing them
        UseShellExecute = false,
        CreateNoWindow = true
    };

    try
    {
        // Dispose the Process so its OS handles are released.
        using (Process process = new Process { StartInfo = startInfo })
        {
            // Collect stderr asynchronously: reading stdout and stderr one after
            // the other synchronously can deadlock once a pipe buffer fills up.
            var errorText = new System.Text.StringBuilder();
            process.ErrorDataReceived += (s, args) =>
            {
                if (args.Data != null)
                {
                    errorText.AppendLine(args.Data);
                }
            };

            process.Start();
            process.BeginErrorReadLine();

            // stdout of the script is the transcript.
            string output = process.StandardOutput.ReadToEnd();
            process.WaitForExit();

            // A non-zero exit code means the script failed; show its stderr
            // rather than presenting an empty or partial transcript.
            if (process.ExitCode != 0)
            {
                MessageBox.Show($"错误: Python 退出码 {process.ExitCode}\n{errorText}");
                return;
            }

            // Show the transcript in its own form.
            TranscriptionForm.ShowTranscription(output);

            // Also persist the transcript next to the audio.
            string outputTextFilePath = Path.Combine(audioFolder, "transcription.txt");
            File.WriteAllText(outputTextFilePath, output);
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show($"错误: {ex.Message}");
    }
}

个人资料