karesaeedff committed
Commit 465ffc9 · verified · 1 Parent(s): fca03af

Update app.py

Files changed (1)
  1. app.py +92 -93
app.py CHANGED
@@ -1,93 +1,92 @@
- import gradio as gr
- import librosa
- import numpy as np
- import torch
- from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
- from tqdm import tqdm
- import tempfile
- import json
- import soundfile as sf
-
- # ==== Parameters ====
- SAMPLE_RATE = 16000
- WINDOW = 5
- STEP = 2
- MUSIC_THRESHOLD = 0.4
- VOICE_THRESHOLD = 0.3
- MIN_SING_DURATION = 8
-
- # ==== Manually load the feature extractor ====
- music_model_id = "AI-Music-Detection/ai_music_detection_large_60s"
- music_feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
- music_model = AutoModelForAudioClassification.from_pretrained(music_model_id)
- music_pipe = pipeline(
-     task="audio-classification",
-     model=music_model,
-     feature_extractor=music_feature_extractor
- )
-
- # Voice detection (to check for vocal presence)
- voice_pipe = pipeline(
-     "audio-classification",
-     model="superb/hubert-large-superb-sid"
- )
-
- def detect_singing(audio_path):
-     wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
-     duration = len(wav) / SAMPLE_RATE
-     results = []
-
-     for start in np.arange(0, duration - WINDOW, STEP):
-         end = start + WINDOW
-         snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
-
-         # Music detection
-         music_pred = music_pipe({"raw": snippet, "sampling_rate": SAMPLE_RATE})
-         music_score = max([p['score'] for p in music_pred if 'music' in p['label'].lower()] or [0])
-
-         # Voice detection
-         voice_pred = voice_pipe({"raw": snippet, "sampling_rate": SAMPLE_RATE})
-         voice_score = max([p['score'] for p in voice_pred if 'speech' in p['label'].lower()] or [0])
-
-         if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
-             results.append((float(start), float(end)))
-
-     # Merge consecutive windows
-     merged = []
-     for seg in results:
-         if not merged or seg[0] > merged[-1][1]:
-             merged.append(list(seg))
-         else:
-             merged[-1][1] = seg[1]
-     merged = [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
-     return merged
-
-
- def analyze_audio(file):
-     if file is None:
-         return "Please upload an audio file", None
-
-     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-         data, sr = librosa.load(file.name, sr=SAMPLE_RATE)
-         sf.write(tmp.name, data, sr)
-         segments = detect_singing(tmp.name)
-
-     if not segments:
-         return "No clear singing segments detected", json.dumps([], indent=2)
-
-     json_output = json.dumps(
-         [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
-         indent=2
-     )
-     return f"Detected {len(segments)} singing segment(s)", json_output
-
-
- with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
-     gr.Markdown("# 🎤 Automatic singing segment detection\nUpload an audio file; the detected singing time ranges are returned as JSON.")
-     audio_in = gr.Audio(type="filepath", label="Upload audio file (WAV)")
-     btn = gr.Button("Start analysis")
-     status = gr.Textbox(label="Analysis status", interactive=False)
-     json_out = gr.Code(label="Singing segment timestamps (JSON)", language="json")
-     btn.click(fn=analyze_audio, inputs=[audio_in], outputs=[status, json_out])
-
- demo.launch()
+ import gradio as gr
+ import librosa
+ import numpy as np
+ import torch
+ from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
+ from tqdm import tqdm
+ import tempfile
+ import json
+ import soundfile as sf
+
+ # ==== Parameters ====
+ SAMPLE_RATE = 16000
+ WINDOW = 5               # analysis window length, seconds
+ STEP = 2                 # hop between windows, seconds
+ MUSIC_THRESHOLD = 0.4
+ VOICE_THRESHOLD = 0.3
+ MIN_SING_DURATION = 8    # discard merged segments shorter than this, seconds
+
+ # ==== Model loading ====
+ music_model_id = "AI-Music-Detection/ai_music_detection_large_60s"
+ music_feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
+ music_model = AutoModelForAudioClassification.from_pretrained(music_model_id)
+ music_pipe = pipeline(
+     task="audio-classification",
+     model=music_model,
+     feature_extractor=music_feature_extractor
+ )
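+ # Voice detection (to gauge vocal presence; note that this checkpoint is a
+ # speaker-identification model, so its labels are speaker IDs)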
+ voice_pipe = pipeline(
+     "audio-classification",
+     model="superb/hubert-large-superb-sid"
+ )
+
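+ # Slide a WINDOW-second window over the audio in STEP-second hops and keep
+ # windows where both the music and the voice scores clear their thresholds.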
+ def detect_singing(audio_path):
+     wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
+     duration = len(wav) / SAMPLE_RATE
+     results = []
+
+     for start in np.arange(0, duration - WINDOW, STEP):
+         end = start + WINDOW
+         snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
+
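+         # Music detection: top score among labels that mention "music"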
+         music_pred = music_pipe({"raw": snippet, "sampling_rate": SAMPLE_RATE})
+         music_score = max([p['score'] for p in music_pred if 'music' in p['label'].lower()] or [0])
+
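+         # Voice detection: top score among labels that mention "speech"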
+         voice_pred = voice_pipe({"raw": snippet, "sampling_rate": SAMPLE_RATE})
+         voice_score = max([p['score'] for p in voice_pred if 'speech' in p['label'].lower()] or [0])
+
+         if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
+             results.append((float(start), float(end)))
+
+     # Merge overlapping windows, then drop segments shorter than MIN_SING_DURATION
+     merged = []
+     for seg in results:
+         if not merged or seg[0] > merged[-1][1]:
+             merged.append(list(seg))
+         else:
+             merged[-1][1] = seg[1]
+     merged = [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
+     return merged
+
+
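+ # Gradio handler: load the upload as 16 kHz mono, write a temporary WAV,
+ # run detection, and return a status line plus the segments as JSON.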
+ def analyze_audio(file):
+     if file is None:
+         return "Please upload an audio file", None
+
+     audio_path = file  # type="filepath" returns the path as a string
+
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+         data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
+         sf.write(tmp.name, data, sr)
+         segments = detect_singing(tmp.name)
+
+     if not segments:
+         return "No clear singing segments detected", json.dumps([], indent=2)
+
+     json_output = json.dumps(
+         [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
+         indent=2
+     )
+     return f"Detected {len(segments)} singing segment(s)", json_output
+
+
+ # ==== Gradio UI ====
+ with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
+     gr.Markdown("# 🎤 Automatic singing segment detection\nUpload an audio file (extracted from the video beforehand); the detected singing time ranges are returned as JSON.")
+     audio_in = gr.Audio(type="filepath", label="Upload audio file (WAV)")
+     btn = gr.Button("Start analysis")
+     status = gr.Textbox(label="Analysis status", interactive=False)
+     json_out = gr.Code(label="Singing segment timestamps (JSON)", language="json")
+     btn.click(fn=analyze_audio, inputs=[audio_in], outputs=[status, json_out])
+
+ demo.launch()