🛠️ 安装
pip install whisperplus git+https://github.com/huggingface/transformers
pip install flash-attn --no-build-isolation
🤗 模型中心
你可以在HuggingFace模型中心找到这些模型
🎙️ 使用方法
要使用whisperplus库,请按照以下步骤进行不同的任务:
🎵 YouTube链接转音频
# Example: download a YouTube video's audio track as mp3, then transcribe it
# with a quantized distil-Whisper model.
from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3
from transformers import BitsAndBytesConfig, HqqConfig
import torch

url = "https://www.youtube.com/watch?v=di3rHkEZuUw"
audio_path = download_youtube_to_mp3(url, output_dir="downloads", filename="test")

# 4-bit HQQ weight-quantization settings (this is the config actually passed
# to the pipeline below).
hqq_config = HqqConfig(
    nbits=4,
    group_size=64,
    quant_zero=False,
    quant_scale=False,
    axis=0,
    offload_meta=False,
)  # axis=0 is used by default

# Alternative 4-bit quantization via bitsandbytes. NOTE(review): defined but
# not passed to the pipeline below — shown only as an option; swap it in for
# `quant_config` to use it instead of HQQ.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

pipeline = SpeechToTextPipeline(
    model_id="distil-whisper/distil-large-v3",
    quant_config=hqq_config,
    flash_attention_2=True,
)

# Chunked long-form transcription; `transcript` is reused by the
# summarization examples further down this document.
transcript = pipeline(
    audio_path=audio_path,
    chunk_length_s=30,
    stride_length_s=5,
    max_new_tokens=128,
    batch_size=100,
    language="english",
    return_timestamps=False,
)

print(transcript)
🍎 Apple MLX
# Transcribe a YouTube video's audio on Apple silicon via MLX Whisper.
from whisperplus.pipelines import mlx_whisper
from whisperplus import download_youtube_to_mp3

video_url = "https://www.youtube.com/watch?v=1__CAdTJ5JU"
mp3_path = download_youtube_to_mp3(video_url)

# `transcribe` returns a dict; the recognized text lives under "text".
result = mlx_whisper.transcribe(
    mp3_path,
    path_or_hf_repo="mlx-community/whisper-large-v3-mlx",
)
text = result["text"]
print(text)
🍏 Lightning Mlx Whisper
# Transcribe with the Lightning Whisper MLX backend (Apple silicon).
from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX
from whisperplus import download_youtube_to_mp3

source_url = "https://www.youtube.com/watch?v=1__CAdTJ5JU"
mp3_path = download_youtube_to_mp3(source_url)

mlx_model = LightningWhisperMLX(model="distil-large-v3", batch_size=12, quant=None)
# Keep only the recognized text from the result dict.
output = mlx_model.transcribe(audio_path=mp3_path)["text"]
📰 文本摘要
# Summarize a transcript with a BART CNN summarization model.
from whisperplus.pipelines.summarization import TextSummarizationPipeline

bart_summarizer = TextSummarizationPipeline(model_id="facebook/bart-large-cnn")
# `transcript` is produced by the speech-to-text example earlier in this document.
result = bart_summarizer.summarize(transcript)
print(result[0]["summary_text"])
📰 长文本支持摘要
# Summarize long transcripts that exceed the model's context window.
from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline

long_summarizer = LongTextSummarizationPipeline(model_id="facebook/bart-large-cnn")
# `transcript` is produced by the speech-to-text example earlier in this document.
summary_text = long_summarizer.summarize(transcript)
print(summary_text)
💬 说话人分离
你必须确认以下两个模型的许可权限。
- https://huggingface.co/pyannote/speaker-diarization-3.1
- https://huggingface.co/pyannote/segmentation-3.0
pip install -r requirements/speaker_diarization.txt
pip install -U "huggingface_hub[cli]"
huggingface-cli login
# Speaker diarization: transcribe audio and attribute each segment to a speaker.
from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline
from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue

mp3_path = download_youtube_to_mp3("https://www.youtube.com/watch?v=mRB14sFHw2E")

run_device = "cuda"  # or "cpu" / "mps"
diarizer = ASRDiarizationPipeline.from_pretrained(
    asr_model="openai/whisper-large-v3",
    diarizer_model="pyannote/speaker-diarization-3.1",
    use_auth_token=False,
    chunk_length_s=30,
    device=run_device,
)

segments = diarizer(mp3_path, num_speakers=2, min_speaker=1, max_speaker=2)
# Render the per-speaker segments as a readable dialogue transcript.
dialogue = format_speech_to_dialogue(segments)
print(dialogue)
⭐ RAG - 与视频对话(LanceDB)
pip install sentence-transformers ctransformers langchain
# RAG chat over a saved transcript using a local GGUF LLM (LanceDB-backed).
from whisperplus.pipelines.chatbot import ChatWithVideo

chat = ChatWithVideo(
    # Fixed typo: was "trascript.txt" — the transcript file to index for retrieval.
    input_file="transcript.txt",
    llm_model_name="TheBloke/Mistral-7B-v0.1-GGUF",
    llm_model_file="mistral-7b-v0.1.Q4_K_M.gguf",
    llm_model_type="mistral",
    embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
)
query = "这个视频讲的是什么?"
response = chat.run_query(query)
print(response)
🌠 RAG - 与视频对话(AutoLLM)
pip install "autollm>=0.1.9"
# RAG chat over a video using AutoLLM (OpenAI or HuggingFace backend).
from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo

# service_context_params
# NOTE: the two prompts below are runtime strings fed to the LLM; they are
# intentionally left in Chinese to match the rest of this document's examples.
system_prompt = """
你是一个友好的AI助手,帮助用户根据你能访问的文档找到与他们问题最相关和准确的答案。
回答问题时,主要依赖文档中的信息。
"""
query_wrapper_prompt = """
以下是文档信息。
---------------------
{context_str}
---------------------
使用文档信息并主要依赖它来回答查询。
查询:{query_str}
回答:
"""

chat = AutoLLMChatWithVideo(
    input_file="input_dir",  # path to the mp3 file
    openai_key="YOUR_OPENAI_KEY",  # optional
    huggingface_key="YOUR_HUGGINGFACE_KEY",  # optional
    llm_model="gpt-3.5-turbo",
    # NOTE(review): max tokens and temperature are passed as strings here —
    # presumably coerced downstream; confirm against AutoLLMChatWithVideo.
    llm_max_tokens="256",
    llm_temperature="0.1",
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    embed_model="huggingface/BAAI/bge-large-zh",  # "text-embedding-ada-002"
)
query = "这个视频讲的是什么?"
response = chat.run_query(query)
print(response)
🎙️ 文字转语音
# Synthesize speech from text with Suno Bark.
from whisperplus.pipelines.text2speech import TextToSpeechPipeline

bark = TextToSpeechPipeline(model_id="suno/bark")
audio = bark(text="你好,世界", voice_preset="v2/en_speaker_6")
🎥 自动字幕
pip install moviepy
apt install imagemagick libmagick++-dev
# Edit the ImageMagick policy in place. The original form
# `cat file | sed ... > file` truncates the file before it is read,
# destroying policy.xml; `sed -i` rewrites it safely.
sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml
# Auto-caption a video: transcribe with Whisper and burn subtitles into the output.
from whisperplus.pipelines.whisper_autocaption import WhisperAutoCaptionPipeline
from whisperplus import download_youtube_to_mp4

clip_path = download_youtube_to_mp4(
    "https://www.youtube.com/watch?v=di3rHkEZuUw",
    output_dir="downloads",
    filename="test",
)  # optional

captioner = WhisperAutoCaptionPipeline(model_id="openai/whisper-large-v3")
captioner(video_path=clip_path, output_path="output.mp4", language="chinese")
😍 贡献
pip install pre-commit
pre-commit install
pre-commit run --all-files
📜 许可证
本项目根据Apache License 2.0的条款进行许可。
🤗 引用
@misc{radford2022whisper,
doi = {10.48550/ARXIV.2212.04356},
url = {https://arxiv.org/abs/2212.04356},
author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
title = {Robust Speech Recognition via Large-Scale Weak Supervision},
publisher = {arXiv},
year = {2022},
copyright = {arXiv.org perpetual, non-exclusive license}
}