Spaces: Running on CPU Upgrade

Commit · a5d8e64
1 Parent(s): 122c63e
add new tools

Browse files
- mcp_server.py +322 -6
- requirements.txt +1 -0
- tools/audio_cleaning.py +258 -0
- tools/audio_cutting.py +0 -1
- tools/audio_insertion.py +374 -0
- tools/combine_tracks.py +4 -1
- tools/voice_replacement.py +145 -0
mcp_server.py CHANGED

@@ -25,6 +25,12 @@ from tools.music_understanding import (
    suggest_cutting_points,
    analyze_genre_and_style,
)
+from tools.audio_cleaning import remove_noise
+from tools.audio_insertion import (
+    insert_section,
+    replace_section,
+)
+from tools.voice_replacement import replace_voice_wrapper


def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:

@@ -112,7 +118,7 @@ def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:

def extract_selected_stems_wrapper(
    audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
-) -> Tuple[str|None, str|None, str|None, str|None]:
+) -> Tuple[str | None, str | None, str | None, str | None]:
    """
    Extract selected stems from an audio file based on user choices.

@@ -157,7 +163,7 @@ def extract_selected_stems_wrapper(
    if not stems_to_extract:
        raise ValueError("At least one stem must be selected for extraction")

-    results= extract_selected_stems(audio_path, stems_to_extract)
+    results = extract_selected_stems(audio_path, stems_to_extract)

    vocals = results.get("vocals")
    drums = results.get("drums")

@@ -268,7 +274,7 @@ def mute_time_windows_wrapper(

def extract_segments_wrapper(
    audio_path: str, segments_str: str, format_val: str, join: bool
-) -> Tuple[str, str|None, str|None, str|None]:
+) -> Tuple[str, str | None, str | None, str | None]:
    """
    Extract multiple segments (up to 4 segments) from an audio file and optionally join them.

@@ -682,7 +688,9 @@ def shift_to_key_wrapper(
# MCP Tool Wrappers with Documentation for MCP Server


-def separate_audio_mcp(
+def separate_audio_mcp(
+    audio_path: str, output_format: str = "wav"
+) -> Tuple[str, str, str, str]:
    """
    Separate audio into vocals, drums, bass, and other stems using Demucs neural network.

@@ -715,7 +723,12 @@ def separate_audio_mcp(audio_path: str, output_format: str = "wav") -> Tuple[str
        )
        return vocals, drums, bass, other
    except Exception as e:
-        return
+        return (
+            f"Error separating audio: {str(e)}",
+            f"Error: {str(e)}",
+            f"Error: {str(e)}",
+            f"Error: {str(e)}",
+        )


def combine_tracks_mcp(

@@ -777,7 +790,9 @@ def combine_tracks_mcp(
        return f"Error combining tracks: {str(e)}"


-def pitch_shift_with_semitones_mcp(
+def pitch_shift_with_semitones_mcp(
+    audio_path: str, semitones: int, output_format: str = "wav"
+) -> str:
    """
    Shift the pitch of an audio file by a specified number of semitones.

@@ -855,6 +870,7 @@ def align_songs_by_bpm_mcp(
        )
        # Apply target BPM by stretching both tracks
        from tools.time_strech import stretch_to_bpm
+
        aligned1 = stretch_to_bpm(result1, target_bpm, None, output_format)
        aligned2 = stretch_to_bpm(result2, target_bpm, None, output_format)
        return aligned1, aligned2

@@ -1136,6 +1152,209 @@ def analyze_genre_and_style_mcp(audio_path: str) -> str:
        return f"Error analyzing genre and style: {str(e)}"


+def remove_noise_mcp(
+    audio_path: str,
+    noise_type: str = "general",
+    sensitivity: float = 0.5,
+    output_format: str = "wav",
+) -> str:
+    """
+    Remove noise from audio using adaptive filtering and spectral subtraction.
+
+    This MCP wrapper provides noise removal capabilities for various types of
+    unwanted audio artifacts including hiss, hum, rumble, and general background noise.
+
+    Args:
+        audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
+        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5)
+        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the cleaned audio file
+
+    Examples:
+        >>> remove_noise_mcp("noisy_recording.wav", "hiss", 0.7, "wav")
+        # Returns path to cleaned audio with reduced hiss
+
+        >>> remove_noise_mcp("podcast.mp3", "background", 0.3, "mp3")
+        # Returns path to cleaned audio with reduced background noise
+
+    Note:
+        - Higher sensitivity values remove more noise but may affect audio quality
+        - Different noise types use specialized algorithms for optimal results
+        - Processing time varies with audio length and noise complexity
+    """
+    try:
+        result = remove_noise(
+            audio_path=audio_path,
+            noise_type=noise_type,
+            sensitivity=sensitivity,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error removing noise: {str(e)}"
+
+
+def insert_section_mcp(
+    audio_path: str,
+    section_path: str,
+    insert_time: float,
+    crossfade_duration: float = 0.1,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert a section from one audio track into another at a precise time position.
+
+    This MCP wrapper allows inserting audio content (like an intro, advertisement,
+    or sound effect) into an existing track at any position with smooth
+    crossfading to avoid audible clicks or abrupt transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
+        insert_time: Position to insert the section (in seconds from start of main audio)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section inserted
+
+    Examples:
+        >>> insert_section_mcp("main_track.wav", "intro.wav", 5.0, 0.2, "wav")
+        # Returns path to audio with intro inserted at 5 seconds
+
+        >>> insert_section_mcp("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "mp3")
+        # Returns path to audio with ad inserted at 3 minutes
+
+    Note:
+        - Insert position is measured from the start of the main audio
+        - Crossfade prevents clicks and creates smooth transitions
+        - If insert_time + section duration exceeds main audio duration, section is truncated
+    """
+    try:
+        result = insert_section(
+            audio_path=audio_path,
+            section_path=section_path,
+            insert_time=insert_time,
+            crossfade_duration=crossfade_duration,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error inserting audio section: {str(e)}"
+
+
+def replace_section_mcp(
+    audio_path: str,
+    start_time: float,
+    end_time: float,
+    replacement_path: str,
+    crossfade_duration: float = 0.1,
+    output_format: str = "wav",
+) -> str:
+    """
+    Replace a section of an audio track with another audio segment.
+
+    This MCP wrapper removes a specified time range from the main audio and
+    replaces it with new content, using crossfades for smooth transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        start_time: Start time of section to replace (in seconds)
+        end_time: End time of section to replace (in seconds)
+        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section replaced
+
+    Examples:
+        >>> replace_section_mcp("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "wav")
+        # Returns path to audio with 60-90s section replaced
+
+        >>> replace_section_mcp("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "mp3")
+        # Returns path to audio with 2-minute section replaced
+
+    Note:
+        - Start time must be less than end time
+        - Crossfade prevents clicks at replacement boundaries
+        - Replacement section is trimmed if longer than specified duration
+    """
+    try:
+        result = replace_section(
+            audio_path=audio_path,
+            start_time=start_time,
+            end_time=end_time,
+            replacement_path=replacement_path,
+            crossfade_duration=crossfade_duration,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error replacing audio section: {str(e)}"
+
+
+def replace_voice_mcp(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Replace voice in source audio with voice from target audio using Seed-VC.
+
+    This MCP wrapper uses the Seed-VC Gradio space to perform voice conversion,
+    replacing the voice characteristics in the source audio with those from
+    the target audio while preserving the linguistic content and timing.
+
+    Args:
+        source_audio_path: Path to the source audio file (voice to be replaced)
+        target_audio_path: Path to the target audio file (voice to use)
+        diffusion_steps: Number of diffusion steps for inference (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
+        f0_condition: Whether to use F0 conditioning (default: False)
+        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to the generated voice-replaced audio file
+
+    Examples:
+        >>> replace_voice_mcp("source.wav", "target_voice.wav")
+        # Returns path to voice-replaced audio file
+
+        >>> replace_voice_mcp("speech.mp3", "singer.wav", diffusion_steps=15, pitch_shift=2)
+        # Returns path to voice-replaced audio with custom settings
+
+    Note:
+        - Uses Seed-VC model for high-quality voice conversion
+        - Preserves linguistic content and timing from source audio
+        - Applies voice characteristics from target audio
+        - Processing time depends on diffusion steps and audio length
+    """
+    return replace_voice_wrapper(
+        source_audio_path=source_audio_path,
+        target_audio_path=target_audio_path,
+        diffusion_steps=diffusion_steps,
+        length_adjust=length_adjust,
+        inference_cfg_rate=inference_cfg_rate,
+        f0_condition=f0_condition,
+        auto_f0_adjust=auto_f0_adjust,
+        pitch_shift=pitch_shift,
+    )
+
+
def create_interface() -> gr.TabbedInterface:
    """
    Create and configure the complete Gradio interface with all audio processing tools.

@@ -1668,6 +1887,95 @@ def create_interface() -> gr.TabbedInterface:
        flagging_mode="never",
    )

+    # Tab 20: Audio Cleaning
+    cleaning_interface = gr.Interface(
+        fn=remove_noise_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Dropdown(
+                choices=["general", "hiss", "hum", "rumble", "background"],
+                value="general",
+                label="Noise Type",
+            ),
+            gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Sensitivity"
+            ),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Cleaned Audio", type="filepath"),
+        title="Audio Noise Removal",
+        description="Remove various types of noise from audio using adaptive filtering and spectral subtraction.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 21: Insert Section
+    insert_interface = gr.Interface(
+        fn=insert_section_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
+            gr.Audio(type="filepath", label="Section to Insert", sources=["upload"]),
+            gr.Number(value=5.0, label="Insert Time (seconds)"),
+            gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Audio with Insertion", type="filepath"),
+        title="Insert Audio Section",
+        description="Insert a section from one audio track into another at a precise time position.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 22: Replace Section
+    replace_interface = gr.Interface(
+        fn=replace_section_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
+            gr.Number(value=60.0, label="Start Time (seconds)"),
+            gr.Number(value=90.0, label="End Time (seconds)"),
+            gr.Audio(type="filepath", label="Replacement Section", sources=["upload"]),
+            gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Audio with Replacement", type="filepath"),
+        title="Replace Audio Section",
+        description="Replace a section of an audio track with another audio segment.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 23: Voice Replacement
+    voice_replacement_interface = gr.Interface(
+        fn=replace_voice_mcp,
+        inputs=[
+            gr.Audio(
+                type="filepath",
+                label="Source Audio (voice to be replaced)",
+                sources=["upload"],
+            ),
+            gr.Audio(
+                type="filepath", label="Target Audio (voice to use)", sources=["upload"]
+            ),
+            gr.Number(value=10, label="Diffusion Steps", minimum=1, maximum=50),
+            gr.Number(value=1.0, label="Length Adjust", minimum=0.1, maximum=3.0),
+            gr.Number(value=0.7, label="Inference CFG Rate", minimum=0.0, maximum=1.0),
+            gr.Checkbox(value=False, label="F0 Condition"),
+            gr.Checkbox(value=True, label="Auto F0 Adjust"),
+            gr.Number(
+                value=0, label="Pitch Shift (semitones)", minimum=-12, maximum=12
+            ),
+        ],
+        outputs=gr.Audio(label="Voice-Replaced Audio", type="filepath"),
+        title="Voice Replacement with Seed-VC",
+        description="Replace voice in source audio with voice from target audio using Seed-VC AI model.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
    return gr.TabbedInterface(
        [
            stem_interface,

@@ -1694,6 +2002,10 @@ def create_interface() -> gr.TabbedInterface:
            structure_interface,
            cutting_points_interface,
            genre_interface,
+            cleaning_interface,
+            insert_interface,
+            replace_interface,
+            voice_replacement_interface,
        ],
        [
            "Stem Separation",

@@ -1720,6 +2032,10 @@ def create_interface() -> gr.TabbedInterface:
            "Song Structure",
            "Cutting Points",
            "Genre Analysis",
+            "Audio Cleaning",
+            "Insert Section",
+            "Replace Section",
+            "Voice Replacement",
        ],
    )

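The new wrappers are exposed both as Gradio tabs and as MCP tools. As a rough illustration of how a remote client might call one of them once the Space is running, here is a minimal sketch using gradio_client; the Space id and the auto-generated endpoint name are assumptions for illustration, not part of this commit.

# Minimal sketch of calling the new noise-removal wrapper from a client.
# Assumptions: the Space is reachable at SPACE_ID and Gradio exposes the
# wrapper under an endpoint named after the function ("/remove_noise_mcp");
# neither value is stated in this commit.
from gradio_client import Client, handle_file

SPACE_ID = "user/audio-tools"  # hypothetical Space id

client = Client(SPACE_ID)
cleaned_path = client.predict(
    handle_file("noisy_recording.wav"),  # audio_path
    "hiss",                              # noise_type
    0.7,                                 # sensitivity
    "wav",                               # output_format
    api_name="/remove_noise_mcp",        # assumed endpoint name
)
print(cleaned_path)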
requirements.txt CHANGED

@@ -1,5 +1,6 @@
librosa>=0.10.0
numpy>=1.24.0
+scipy>=1.10.0
torch~=2.8.0
torchaudio~=2.8.0
torchcodec~=0.8.0
tools/audio_cleaning.py ADDED

@@ -0,0 +1,258 @@
+import os
+import tempfile
+from typing import Optional
+
+import librosa
+import numpy as np
+import soundfile as sf
+from scipy.signal import butter, lfilter, filtfilt
+
+
+def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
+    """Load audio file with standard settings."""
+    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    return y, int(sr)
+
+
+def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
+    """
+    Analyze audio to detect noise characteristics.
+
+    Args:
+        audio: Audio data as numpy array
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Dictionary with noise profile information
+    """
+    # Compute spectral features for noise detection
+    stft = librosa.stft(audio, n_fft=2048, hop_length=512)
+    magnitude = np.abs(stft)
+
+    # Identify noise floor (quiet parts)
+    noise_floor = np.percentile(magnitude, 10)
+
+    # Detect steady noise (consistent low-frequency content)
+    freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=2048)
+    low_freq_mask = freqs < 200  # Below 200 Hz
+    steady_noise = np.mean(magnitude[:, low_freq_mask], axis=1)
+
+    # Detect hiss (high frequency noise)
+    high_freq_mask = freqs > 4000  # Above 4 kHz
+    hiss_level = np.mean(magnitude[:, high_freq_mask], axis=1)
+
+    # Compute overall noise characteristics
+    signal_power = np.mean(magnitude**2, axis=1)
+    noise_power = np.mean(magnitude**2, axis=1) - signal_power
+    snr_estimate = 10 * np.log10(signal_power / (noise_power + 1e-10))
+
+    return {
+        "noise_floor": float(noise_floor),
+        "steady_noise": float(steady_noise),
+        "hiss_level": float(hiss_level),
+        "snr_estimate": float(snr_estimate),
+        "has_significant_noise": bool(
+            steady_noise > noise_floor * 2 or hiss_level > noise_floor * 1.5
+        ),
+    }
+
+
+def spectral_subtraction(
+    audio: np.ndarray, noise_profile: dict, sample_rate: int
+) -> np.ndarray:
+    """
+    Apply spectral subtraction to remove identified noise.
+
+    Args:
+        audio: Input audio data
+        noise_profile: Noise profile from detect_noise_profile()
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Cleaned audio data
+    """
+    # Compute STFT of audio
+    stft = librosa.stft(audio, n_fft=2048, hop_length=512)
+    magnitude = np.abs(stft)
+    phase = np.angle(stft)
+
+    # Create noise gate based on noise floor
+    noise_gate = np.minimum(magnitude / (noise_profile["noise_floor"] + 1e-10), 1.0)
+
+    # Apply gentle noise reduction
+    reduction_factor = 0.3 if noise_profile["has_significant_noise"] else 0.15
+    cleaned_magnitude = magnitude * (1 - noise_gate * reduction_factor)
+
+    # Reconstruct audio
+    cleaned_stft = cleaned_magnitude * np.exp(1j * phase)
+    cleaned_audio = librosa.istft(cleaned_stft, hop_length=512)
+
+    return cleaned_audio
+
+
+def adaptive_filter(
+    audio: np.ndarray, sample_rate: int, noise_type: str = "general"
+) -> np.ndarray:
+    """
+    Apply adaptive filtering based on noise type.
+
+    Args:
+        audio: Input audio data
+        sample_rate: Sample rate of audio
+        noise_type: Type of noise to address ('general', 'hiss', 'hum', 'background')
+
+    Returns:
+        Filtered audio data
+    """
+    if noise_type == "hiss":
+        # High-pass filter to reduce hiss (above 4kHz)
+        cutoff = 4000
+        b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+        filtered_audio = lfilter(b, a, audio)
+
+    elif noise_type == "hum":
+        # Notch filter for common hum frequencies (50/60 Hz and harmonics)
+        # Apply multiple notch filters
+        filtered_audio = audio.copy()
+        hum_freqs = [50, 60, 100, 120, 180, 240]  # Common power line harmonics
+
+        for freq in hum_freqs:
+            if freq < sample_rate / 2:
+                # Create notch filter
+                b, a = butter(
+                    2,
+                    [freq * 0.9, freq * 1.1],
+                    fs=sample_rate,
+                    btype="bandstop",
+                    output="ba",
+                )
+                filtered_audio = lfilter(b, a, filtered_audio)
+
+    elif noise_type == "background":
+        # Spectral subtraction for background noise
+        noise_profile = detect_noise_profile(audio, sample_rate)
+        filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)
+
+    else:
+        # General broadband noise reduction
+        # Apply gentle low-pass filter
+        cutoff = int(min(8000, sample_rate // 2.5))
+        b, a = butter(4, cutoff, fs=sample_rate, btype="low", output="ba")
+        filtered_audio = lfilter(b, a, audio)
+
+    return filtered_audio
+
+
+def remove_noise(
+    audio_path: str,
+    noise_type: str = "general",
+    sensitivity: float = 0.5,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Remove noise from audio using adaptive filtering and spectral subtraction.
+
+    This function analyzes the audio to detect noise characteristics and applies
+    appropriate noise reduction techniques based on the noise type and sensitivity
+    settings. It supports various noise types including hiss, hum, rumble, and
+    general background noise.
+
+    Args:
+        audio_path: Path to the audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
+            - 'general': Broadband noise reduction
+            - 'hiss': High-frequency noise removal
+            - 'hum': Power line hum removal (50/60 Hz)
+            - 'rumble': Low-frequency rumble removal
+            - 'background': General background noise
+        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5)
+            Higher values remove more noise but may affect audio quality
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the cleaned audio file
+
+    Examples:
+        >>> remove_noise("noisy_recording.wav", "hiss", 0.7, "output", "wav")
+        # Returns 'path/to/noisy_recording_hiss_removed.wav' with reduced hiss
+
+        >>> remove_noise("podcast.mp3", "background", 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_background_removed.mp3' with reduced background noise
+
+    Note:
+        - Higher sensitivity values remove more noise but may affect audio quality
+        - Different noise types use specialized algorithms for optimal results
+        - Processing time varies with audio length and noise complexity
+        - Preserves original audio quality and sample rate
+        - Works with mono or stereo audio files
+    """
+    try:
+        # Load audio
+        audio, sample_rate = _load_audio(audio_path, mono=False)
+
+        # Apply noise reduction based on type and sensitivity
+        if noise_type == "hiss":
+            # High-pass filter for hiss removal
+            cutoff = 4000 - sensitivity * 2000  # 2000-4000 Hz range
+            b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            filtered_audio = filtfilt(b, a, audio)
+
+        elif noise_type == "hum":
+            # Multiple notch filters for harmonics
+            filtered_audio = audio.copy()
+            fundamental_freqs = [50, 60, 100]  # Common power line fundamentals
+
+            for fundamental in fundamental_freqs:
+                if fundamental < sample_rate // 2:
+                    # Filter fundamental and first few harmonics
+                    for harmonic in range(1, 6):
+                        freq = fundamental * harmonic
+                        if freq < sample_rate // 2:
+                            b, a = butter(
+                                2,
+                                [freq * 0.95, freq * 1.05],
+                                fs=sample_rate,
+                                btype="bandstop",
+                                output="ba",
+                            )
+                            filtered_audio = filtfilt(b, a, filtered_audio)
+
+        elif noise_type == "rumble":
+            # High-pass filter for rumble removal
+            cutoff = 20 + sensitivity * 80  # 20-100 Hz range
+            b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            filtered_audio = filtfilt(b, a, audio)
+
+        else:  # background or general
+            # General noise reduction
+            noise_profile = detect_noise_profile(audio, sample_rate)
+            filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)
+            # Apply based on sensitivity
+            strength = 0.2 + sensitivity * 0.6
+            filtered_audio = (1 - strength) * filtered_audio + strength * audio
+
+        # Normalize output
+        max_val = np.max(np.abs(filtered_audio))
+        if max_val > 0:
+            filtered_audio = filtered_audio / max_val * 0.95
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_noise_removed")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        input_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{input_filename}_{noise_type}_removed.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save processed audio
+        sf.write(output_file, filtered_audio.T, sample_rate)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error removing noise: {str(e)}")
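For the 'hum' branch, remove_noise cascades narrow second-order Butterworth band-stop filters around each power-line harmonic. A minimal, self-contained sketch of that idea on a synthetic signal (the 440 Hz tone and 50 Hz hum below are invented for illustration, not taken from this commit):

# Standalone sketch of the hum-removal idea: cascade narrow band-stop
# Butterworth filters at power-line harmonics. Synthetic input is used so
# the snippet runs without an audio file.
import numpy as np
from scipy.signal import butter, filtfilt

sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
signal = np.sin(2 * np.pi * 440 * t) + 0.3 * np.sin(2 * np.pi * 50 * t)  # tone + 50 Hz hum

cleaned = signal.copy()
for harmonic in range(1, 6):
    freq = 50 * harmonic
    if freq < sr / 2:
        b, a = butter(2, [freq * 0.95, freq * 1.05], fs=sr, btype="bandstop")
        cleaned = filtfilt(b, a, cleaned)

# With a 1-second signal, FFT bin 50 corresponds to 50 Hz.
print("residual 50 Hz magnitude:", abs(np.fft.rfft(cleaned)[50]))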
tools/audio_cutting.py CHANGED

@@ -43,7 +43,6 @@ def cut_audio(
    # Get audio duration
    duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr

-
    if start_time >= end_time:
        raise ValueError(
            f"Start time ({start_time}s) must be less than end time ({end_time}s)"
tools/audio_insertion.py ADDED

@@ -0,0 +1,374 @@
+import os
+import tempfile
+from typing import Optional
+
+import librosa
+import numpy as np
+import soundfile as sf
+
+
+def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
+    """Load audio file with standard settings."""
+    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    return y, int(sr)
+
+
+def detect_crossfade_point(
+    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
+) -> tuple[float, float]:
+    """
+    Calculate optimal crossfade points for seamless insertion.
+
+    Args:
+        insert_position: Where to insert the section (in seconds)
+        audio_duration: Total duration of the target audio (in seconds)
+        crossfade_duration: Length of crossfade (in seconds)
+
+    Returns:
+        Tuple of (start_time, end_time) for crossfade region
+    """
+    # Calculate crossfade boundaries
+    fade_start = max(0, insert_position - crossfade_duration / 2)
+    fade_end = min(audio_duration, insert_position + crossfade_duration / 2)
+
+    return fade_start, fade_end
+
+
+def apply_crossfade(
+    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
+) -> np.ndarray:
+    """
+    Apply crossfade between section and target audio.
+
+    Args:
+        section: Audio section to insert
+        target: Target audio to insert into
+        crossfade_duration: Length of crossfade in seconds
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Target audio with section inserted
+    """
+    # Calculate crossfade samples
+    fade_samples = int(crossfade_duration * sample_rate)
+
+    # Create crossfade envelope
+    fade_in = np.linspace(0, 1, fade_samples)
+    fade_out = np.linspace(1, 0, fade_samples)
+    crossfade = fade_in * fade_out
+
+    # Apply crossfade to section end
+    section_end = section[-fade_samples:] if len(section) > fade_samples else section
+    section_end[:fade_samples] *= crossfade
+
+    # Insert section into target
+    insert_sample = int(len(target) * 0.5)  # Insert at middle
+    result = np.insert(target, insert_sample, section_end, axis=0)
+
+    return result
+
+
+def insert_section(
+    audio_path: str,
+    section_path: str,
+    insert_time: float,
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert a section from one audio track into another at a precise time position.
+
+    This function allows you to insert audio content (like an intro, advertisement,
+    or sound effect) into an existing track at any position with smooth
+    crossfading to avoid audible clicks or abrupt transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
+        insert_time: Position to insert the section (in seconds from start of main audio)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+            Longer crossfades create smoother transitions but reduce clarity
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section inserted
+
+    Examples:
+        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
+        # Returns 'path/to/main_with_intro.wav' with intro inserted at 5 seconds
+
+        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
+        # Returns 'path/to/podcast_with_ad.mp3' with ad inserted at 3 minutes
+
+    Note:
+        - Insert position is measured from the start of the main audio
+        - Crossfade prevents clicks and creates smooth transitions
+        - If insert_time + section duration exceeds main audio duration, section is truncated
+        - Works with mono or stereo audio files
+        - Preserves original audio quality and sample rate
+        - Processing time depends on audio length and crossfade duration
+    """
+    try:
+        # Load both audio files
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        section_audio, section_sr = _load_audio(section_path, mono=False)
+
+        # Resample if needed
+        if main_sr != section_sr:
+            section_audio = librosa.resample(
+                section_audio, orig_sr=section_sr, target_sr=main_sr
+            )
+
+        # Calculate timing
+        main_duration = len(main_audio) / main_sr
+
+        # Validate insert position
+        if insert_time < 0:
+            raise ValueError("Insert time must be positive")
+        if insert_time > main_duration:
+            raise ValueError(
+                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
+            )
+
+        # Calculate crossfade points
+        fade_start, fade_end = detect_crossfade_point(
+            insert_time, main_duration, crossfade_duration
+        )
+
+        # Extract main audio segments
+        main_before = main_audio[: int(fade_start * main_sr)]
+        main_after = main_audio[int(fade_end * main_sr) :]
+
+        # Apply crossfade and insert section
+        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
+
+        # Combine all parts
+        final_audio = np.concatenate([main_before, result])
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_inserted")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_with_insertion.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, final_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error inserting audio section: {str(e)}")
+
+
+def insert_multiple_sections(
+    audio_path: str,
+    sections: list[tuple[str, float, float]],
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert multiple sections into an audio track at specified positions.
+
+    This function allows inserting multiple audio sections (like multiple ads,
+    sound effects, or musical segments) into a main track with smooth
+    transitions between each insertion.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        sections: List of (section_path, insert_time) tuples
+            section_path: Path to audio section to insert
+            insert_time: Position to insert section (in seconds)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with all sections inserted
+
+    Examples:
+        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
+        # Returns 'path/to/track_with_insertions.wav' with intro at start, ads at 30s and 3min
+
+        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_with_sponsor.wav' with sponsor segment at 1 minute
+
+    Note:
+        - Sections are inserted in chronological order
+        - Each section gets crossfade at both start and end
+        - If sections overlap, later sections take precedence
+        - Total processing time increases with number of sections
+        - Works best with non-overlapping insertion times
+    """
+    try:
+        # Load main audio
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        main_duration = len(main_audio) / main_sr
+        current_audio = main_audio.copy()
+
+        # Sort sections by insert time
+        sorted_sections = sorted(sections, key=lambda x: x[1])
+
+        # Insert each section
+        for section_path, insert_time, _ in sorted_sections:
+            # Load section
+            section_audio, section_sr = _load_audio(section_path, mono=False)
+
+            # Resample if needed
+            if section_sr != main_sr:
+                section_audio = librosa.resample(
+                    section_audio, orig_sr=section_sr, target_sr=main_sr
+                )
+
+            # Calculate crossfade points
+            fade_start, fade_end = detect_crossfade_point(
+                insert_time, main_duration, crossfade_duration
+            )
+
+            # Extract current audio segments
+            current_before = current_audio[: int(fade_start * main_sr)]
+            current_after = current_audio[int(fade_end * main_sr) :]
+
+            # Apply crossfade and insert section
+            section_with_fade = apply_crossfade(
+                section_audio, current_after, crossfade_duration, main_sr
+            )
+
+            # Update current audio
+            current_audio = np.concatenate([current_before, section_with_fade])
+
+            # Update duration for next insertion
+            main_duration = len(current_audio) / main_sr
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, current_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error inserting multiple sections: {str(e)}")
+
+
+def replace_section(
+    audio_path: str,
+    start_time: float,
+    end_time: float,
+    replacement_path: str,
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Replace a section of an audio track with another audio segment.
+
+    This function removes a specified time range from the main audio and
+    replaces it with new content, using crossfades for smooth transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        start_time: Start time of section to replace (in seconds)
+        end_time: End time of section to replace (in seconds)
+        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section replaced
+
+    Examples:
+        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
+        # Returns 'path/to/song_replaced.wav' with 60-90s section replaced
+
+        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_replaced.mp3' with 2-minute section replaced
+
+    Note:
+        - Start time must be less than end time
+        - Crossfade prevents clicks at replacement boundaries
+        - Replacement section is trimmed if longer than specified duration
+        - Preserves original audio quality and sample rate
+        - Useful for fixing mistakes, updating content, or adding corrections
+    """
+    try:
+        # Load both audio files
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)
+
+        # Validate timing
+        if start_time >= end_time:
+            raise ValueError("Start time must be less than end time")
+
+        # Convert times to samples
+        start_sample = int(start_time * main_sr)
+        end_sample = int(end_time * main_sr)
+
+        # Extract main audio parts
+        main_before = main_audio[:start_sample]
+        main_after = main_audio[end_sample:]
+
+        # Resample replacement if needed
+        if replacement_sr != main_sr:
+            replacement_audio = librosa.resample(
+                replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
+            )
+
+        # Trim replacement to specified duration
+        replacement_duration = end_time - start_time
+        replacement_samples = int(replacement_duration * main_sr)
+        trimmed_replacement = (
+            replacement_audio[:replacement_samples]
+            if len(replacement_audio) > replacement_samples
+            else replacement_audio
+        )
+
+        # Apply crossfades
+        fade_samples = int(crossfade_duration * main_sr)
+
+        # Fade in replacement
+        fade_in = np.linspace(0, 1, fade_samples)
+        trimmed_replacement[:fade_samples] *= fade_in
+
+        # Fade out at end of replacement
+        fade_out = np.linspace(1, 0, fade_samples)
+        trimmed_replacement[-fade_samples:] *= fade_out
+
+        # Combine all parts
+        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_replaced")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_replaced.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, final_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error replacing audio section: {str(e)}")
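A hypothetical usage sketch of the two insertion helpers added above; the file names are placeholders, and outputs land in temp directories because output_path is left at its default of None.

# Drop a jingle into a podcast 30 s in, then swap the 60-90 s range
# for a corrected take (hypothetical inputs).
from tools.audio_insertion import insert_section, replace_section

with_jingle = insert_section(
    audio_path="podcast.wav",
    section_path="jingle.wav",
    insert_time=30.0,
    crossfade_duration=0.2,
    output_format="wav",
)

corrected = replace_section(
    audio_path=with_jingle,
    start_time=60.0,
    end_time=90.0,
    replacement_path="corrected_take.wav",
    crossfade_duration=0.2,
    output_format="wav",
)
print(corrected)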
tools/combine_tracks.py CHANGED

@@ -305,7 +305,10 @@ def create_medley(

    if output_path is None:
        tmp_dir = tempfile.mkdtemp(prefix="mcp-medley-")
-        output =
+        output = (
+            Path(tmp_dir)
+            / f"{vocals.name}_{instrumental.name}_medley.{medley_extension}"
+        )
    else:
        output = Path(output_path).expanduser().resolve()
    output.parent.mkdir(parents=True, exist_ok=True)
tools/voice_replacement.py ADDED

@@ -0,0 +1,145 @@
+import os
+from datetime import datetime
+from pathlib import Path
+
+from gradio_client import Client, handle_file
+
+from tools.audio_info import validate_audio_path
+
+
+def replace_voice(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Replace voice in source audio with voice from target audio using Seed-VC.
+
+    This function uses the Seed-VC Gradio space to perform voice conversion,
+    replacing the voice characteristics in the source audio with those from
+    the target audio while preserving the linguistic content and timing.
+
+    Args:
+        source_audio_path: Path to the source audio file (voice to be replaced)
+        target_audio_path: Path to the target audio file (voice to use)
+        diffusion_steps: Number of diffusion steps for inference (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
+        f0_condition: Whether to use F0 conditioning (default: False)
+        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to the generated voice-replaced audio file
+
+    Raises:
+        FileNotFoundError: If source or target audio files don't exist
+        ValueError: If parameters are invalid
+        RuntimeError: If voice replacement fails
+    """
+    try:
+        # Validate input paths
+        source_abs_path = validate_audio_path(source_audio_path)
+        target_abs_path = validate_audio_path(target_audio_path)
+
+        # Validate parameters
+        if diffusion_steps < 1 or diffusion_steps > 50:
+            raise ValueError("diffusion_steps must be between 1 and 50")
+        if length_adjust <= 0:
+            raise ValueError("length_adjust must be positive")
+        if not 0 <= inference_cfg_rate <= 1:
+            raise ValueError("inference_cfg_rate must be between 0 and 1")
+        if pitch_shift < -12 or pitch_shift > 12:
+            raise ValueError("pitch_shift must be between -12 and 12 semitones")
+
+        # Initialize Seed-VC client
+        client = Client("Plachta/Seed-VC")
+
+        # Perform voice replacement
+        result = client.predict(
+            source_audio_path=handle_file(source_abs_path),
+            target_audio_path=handle_file(target_abs_path),
+            diffusion_steps=diffusion_steps,
+            length_adjust=length_adjust,
+            inference_cfg_rate=inference_cfg_rate,
+            f0_condition=f0_condition,
+            auto_f0_adjust=auto_f0_adjust,
+            pitch_shift=pitch_shift,
+            api_name="/predict_1",
+        )
+
+        # Create output directory
+        output_dir = Path("output")
+        output_dir.mkdir(exist_ok=True)
+
+        # Generate output filename with timestamp
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        source_name = Path(source_abs_path).stem
+        target_name = Path(target_abs_path).stem
+        output_filename = (
+            f"{source_name}_voice_replaced_by_{target_name}_{timestamp}.wav"
+        )
+        output_path = output_dir / output_filename
+
+        # Save the result
+        if isinstance(result, str) and os.path.exists(result):
+            # If result is a file path, copy it to output location
+            import shutil
+
+            shutil.copy2(result, output_path)
+        else:
+            # If result is audio data, save it using soundfile
+            import soundfile as sf
+
+            sf.write(str(output_path), result, 22050)
+
+        return str(output_path)
+
+    except Exception as e:
+        raise RuntimeError(f"Voice replacement failed: {str(e)}")
+
+
+def replace_voice_wrapper(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Wrapper function for voice replacement with error handling for MCP integration.
+
+    Args:
+        source_audio_path: Path to the source audio file
+        target_audio_path: Path to the target audio file
+        diffusion_steps: Number of diffusion steps (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: CFG rate (default: 0.7)
+        f0_condition: Use F0 conditioning (default: False)
+        auto_f0_adjust: Auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to generated audio file or error message
+    """
+    try:
+        return replace_voice(
+            source_audio_path=source_audio_path,
+            target_audio_path=target_audio_path,
+            diffusion_steps=diffusion_steps,
+            length_adjust=length_adjust,
+            inference_cfg_rate=inference_cfg_rate,
+            f0_condition=f0_condition,
+            auto_f0_adjust=auto_f0_adjust,
+            pitch_shift=pitch_shift,
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
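A hypothetical call to the MCP-facing wrapper; the audio paths are placeholders. Unlike replace_voice, the wrapper returns an "Error: ..." string instead of raising, so callers should check the result before treating it as a file path.

# Sketch of using the wrapper directly (placeholder inputs).
from tools.voice_replacement import replace_voice_wrapper

result = replace_voice_wrapper(
    source_audio_path="speech.wav",
    target_audio_path="reference_voice.wav",
    diffusion_steps=15,
    pitch_shift=2,
)
if result.startswith("Error:"):
    print(result)
else:
    print(f"Voice-converted file written to {result}")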