frascuchon HF Staff committed on
Commit
a5d8e64
·
1 Parent(s): 122c63e

add new tools

Browse files
mcp_server.py CHANGED
@@ -25,6 +25,12 @@ from tools.music_understanding import (
25
  suggest_cutting_points,
26
  analyze_genre_and_style,
27
  )
 
 
 
 
 
 
28
 
29
 
30
  def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
@@ -112,7 +118,7 @@ def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:
112
 
113
  def extract_selected_stems_wrapper(
114
  audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
115
- ) -> Tuple[str|None, str|None, str|None, str|None]:
116
  """
117
  Extract selected stems from an audio file based on user choices.
118
 
@@ -157,7 +163,7 @@ def extract_selected_stems_wrapper(
157
  if not stems_to_extract:
158
  raise ValueError("At least one stem must be selected for extraction")
159
 
160
- results= extract_selected_stems(audio_path, stems_to_extract)
161
 
162
  vocals = results.get("vocals")
163
  drums = results.get("drums")
@@ -268,7 +274,7 @@ def mute_time_windows_wrapper(
268
 
269
  def extract_segments_wrapper(
270
  audio_path: str, segments_str: str, format_val: str, join: bool
271
- ) -> Tuple[str, str|None, str|None, str|None]:
272
  """
273
  Extract multiple segments (up to 4 segments) from an audio file and optionally join them.
274
 
@@ -682,7 +688,9 @@ def shift_to_key_wrapper(
682
  # MCP Tool Wrappers with Documentation for MCP Server
683
 
684
 
685
- def separate_audio_mcp(audio_path: str, output_format: str = "wav") -> Tuple[str, str, str, str]:
 
 
686
  """
687
  Separate audio into vocals, drums, bass, and other stems using Demucs neural network.
688
 
@@ -715,7 +723,12 @@ def separate_audio_mcp(audio_path: str, output_format: str = "wav") -> Tuple[str
715
  )
716
  return vocals, drums, bass, other
717
  except Exception as e:
718
- return f"Error separating audio: {str(e)}", f"Error: {str(e)}", f"Error: {str(e)}", f"Error: {str(e)}"
 
 
 
 
 
719
 
720
 
721
  def combine_tracks_mcp(
@@ -777,7 +790,9 @@ def combine_tracks_mcp(
777
  return f"Error combining tracks: {str(e)}"
778
 
779
 
780
- def pitch_shift_with_semitones_mcp(audio_path: str, semitones: int, output_format: str = "wav") -> str:
 
 
781
  """
782
  Shift the pitch of an audio file by a specified number of semitones.
783
 
@@ -855,6 +870,7 @@ def align_songs_by_bpm_mcp(
855
  )
856
  # Apply target BPM by stretching both tracks
857
  from tools.time_strech import stretch_to_bpm
 
858
  aligned1 = stretch_to_bpm(result1, target_bpm, None, output_format)
859
  aligned2 = stretch_to_bpm(result2, target_bpm, None, output_format)
860
  return aligned1, aligned2
@@ -1136,6 +1152,209 @@ def analyze_genre_and_style_mcp(audio_path: str) -> str:
1136
  return f"Error analyzing genre and style: {str(e)}"
1137
 
1138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1139
  def create_interface() -> gr.TabbedInterface:
1140
  """
1141
  Create and configure the complete Gradio interface with all audio processing tools.
@@ -1668,6 +1887,95 @@ def create_interface() -> gr.TabbedInterface:
1668
  flagging_mode="never",
1669
  )
1670
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1671
  return gr.TabbedInterface(
1672
  [
1673
  stem_interface,
@@ -1694,6 +2002,10 @@ def create_interface() -> gr.TabbedInterface:
1694
  structure_interface,
1695
  cutting_points_interface,
1696
  genre_interface,
 
 
 
 
1697
  ],
1698
  [
1699
  "Stem Separation",
@@ -1720,6 +2032,10 @@ def create_interface() -> gr.TabbedInterface:
1720
  "Song Structure",
1721
  "Cutting Points",
1722
  "Genre Analysis",
 
 
 
 
1723
  ],
1724
  )
1725
 
 
25
  suggest_cutting_points,
26
  analyze_genre_and_style,
27
  )
28
+ from tools.audio_cleaning import remove_noise
29
+ from tools.audio_insertion import (
30
+ insert_section,
31
+ replace_section,
32
+ )
33
+ from tools.voice_replacement import replace_voice_wrapper
34
 
35
 
36
  def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
 
118
 
119
  def extract_selected_stems_wrapper(
120
  audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
121
+ ) -> Tuple[str | None, str | None, str | None, str | None]:
122
  """
123
  Extract selected stems from an audio file based on user choices.
124
 
 
163
  if not stems_to_extract:
164
  raise ValueError("At least one stem must be selected for extraction")
165
 
166
+ results = extract_selected_stems(audio_path, stems_to_extract)
167
 
168
  vocals = results.get("vocals")
169
  drums = results.get("drums")
 
274
 
275
  def extract_segments_wrapper(
276
  audio_path: str, segments_str: str, format_val: str, join: bool
277
+ ) -> Tuple[str, str | None, str | None, str | None]:
278
  """
279
  Extract multiple segments (up to 4 segments) from an audio file and optionally join them.
280
 
 
688
  # MCP Tool Wrappers with Documentation for MCP Server
689
 
690
 
691
+ def separate_audio_mcp(
692
+ audio_path: str, output_format: str = "wav"
693
+ ) -> Tuple[str, str, str, str]:
694
  """
695
  Separate audio into vocals, drums, bass, and other stems using Demucs neural network.
696
 
 
723
  )
724
  return vocals, drums, bass, other
725
  except Exception as e:
726
+ return (
727
+ f"Error separating audio: {str(e)}",
728
+ f"Error: {str(e)}",
729
+ f"Error: {str(e)}",
730
+ f"Error: {str(e)}",
731
+ )
732
 
733
 
734
  def combine_tracks_mcp(
 
790
  return f"Error combining tracks: {str(e)}"
791
 
792
 
793
+ def pitch_shift_with_semitones_mcp(
794
+ audio_path: str, semitones: int, output_format: str = "wav"
795
+ ) -> str:
796
  """
797
  Shift the pitch of an audio file by a specified number of semitones.
798
 
 
870
  )
871
  # Apply target BPM by stretching both tracks
872
  from tools.time_strech import stretch_to_bpm
873
+
874
  aligned1 = stretch_to_bpm(result1, target_bpm, None, output_format)
875
  aligned2 = stretch_to_bpm(result2, target_bpm, None, output_format)
876
  return aligned1, aligned2
 
1152
  return f"Error analyzing genre and style: {str(e)}"
1153
 
1154
 
1155
def remove_noise_mcp(
    audio_path: str,
    noise_type: str = "general",
    sensitivity: float = 0.5,
    output_format: str = "wav",
) -> str:
    """
    Clean an audio file by suppressing a chosen category of noise.

    MCP wrapper around tools.audio_cleaning.remove_noise, covering hiss,
    hum, rumble, and general background-noise removal.

    Args:
        audio_path: Input audio file path or URL (WAV, MP3, FLAC, M4A).
        noise_type: One of 'general', 'hiss', 'hum', 'rumble', 'background'.
        sensitivity: Noise reduction sensitivity, 0.0-1.0 (default: 0.5).
            Higher values remove more noise but may affect audio quality.
        output_format: 'wav' or 'mp3' for the cleaned file (default: 'wav').

    Returns:
        Path to the cleaned audio file, or an error message string if
        processing fails.

    Examples:
        >>> remove_noise_mcp("noisy_recording.wav", "hiss", 0.7, "wav")
        # Returns path to cleaned audio with reduced hiss

        >>> remove_noise_mcp("podcast.mp3", "background", 0.3, "mp3")
        # Returns path to cleaned audio with reduced background noise
    """
    try:
        # output_path=None lets the tool pick a temp directory.
        return remove_noise(
            audio_path=audio_path,
            noise_type=noise_type,
            sensitivity=sensitivity,
            output_path=None,
            output_format=output_format,
        )
    except Exception as e:
        return f"Error removing noise: {str(e)}"
1199
+
1200
+
1201
def insert_section_mcp(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Splice an audio section into a main track at a precise time position.

    MCP wrapper around tools.audio_insertion.insert_section: the section
    (an intro, advertisement, sound effect, ...) is placed at
    ``insert_time`` with crossfades at both joins to avoid clicks.

    Args:
        audio_path: Main audio file path or URL (WAV, MP3, FLAC, M4A).
        section_path: Audio section to insert (WAV, MP3, FLAC, M4A).
        insert_time: Insertion position in seconds from the start of the
            main audio.
        crossfade_duration: Crossfade length in seconds (default: 0.1).
        output_format: 'wav' or 'mp3' for the result (default: 'wav').

    Returns:
        Path to the audio file with the section inserted, or an error
        message string if processing fails.

    Examples:
        >>> insert_section_mcp("main_track.wav", "intro.wav", 5.0, 0.2, "wav")
        # Returns path to audio with intro inserted at 5 seconds

        >>> insert_section_mcp("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "mp3")
        # Returns path to audio with ad inserted at 3 minutes
    """
    try:
        # output_path=None lets the tool pick a temp directory.
        return insert_section(
            audio_path=audio_path,
            section_path=section_path,
            insert_time=insert_time,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as e:
        return f"Error inserting audio section: {str(e)}"
1249
+
1250
+
1251
def replace_section_mcp(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_format: str = "wav",
) -> str:
    """
    Swap a time range of an audio track for another audio segment.

    MCP wrapper around tools.audio_insertion.replace_section: the
    ``start_time``-``end_time`` range is removed from the main audio and
    the replacement is spliced in with crossfades at both boundaries.

    Args:
        audio_path: Main audio file path or URL (WAV, MP3, FLAC, M4A).
        start_time: Start of the section to replace, in seconds.
        end_time: End of the section to replace, in seconds.
        replacement_path: Replacement audio segment (WAV, MP3, FLAC, M4A).
        crossfade_duration: Crossfade length in seconds (default: 0.1).
        output_format: 'wav' or 'mp3' for the result (default: 'wav').

    Returns:
        Path to the audio file with the section replaced, or an error
        message string if processing fails.

    Examples:
        >>> replace_section_mcp("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "wav")
        # Returns path to audio with 60-90s section replaced

        >>> replace_section_mcp("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "mp3")
        # Returns path to audio with 2-minute section replaced
    """
    try:
        # output_path=None lets the tool pick a temp directory.
        return replace_section(
            audio_path=audio_path,
            start_time=start_time,
            end_time=end_time,
            replacement_path=replacement_path,
            crossfade_duration=crossfade_duration,
            output_path=None,
            output_format=output_format,
        )
    except Exception as e:
        return f"Error replacing audio section: {str(e)}"
1301
+
1302
+
1303
def replace_voice_mcp(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Replace voice in source audio with voice from target audio using Seed-VC.

    This MCP wrapper delegates to tools.voice_replacement.replace_voice_wrapper,
    replacing the voice characteristics in the source audio with those from
    the target audio while preserving the linguistic content and timing.

    Args:
        source_audio_path: Path to the source audio file (voice to be replaced)
        target_audio_path: Path to the target audio file (voice to use)
        diffusion_steps: Number of diffusion steps for inference (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
        f0_condition: Whether to use F0 conditioning (default: False)
        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to the generated voice-replaced audio file, or an error message
        string if processing fails (consistent with the other MCP wrappers).

    Examples:
        >>> replace_voice_mcp("source.wav", "target_voice.wav")
        # Returns path to voice-replaced audio file

        >>> replace_voice_mcp("speech.mp3", "singer.wav", diffusion_steps=15, pitch_shift=2)
        # Returns path to voice-replaced audio with custom settings

    Note:
        - Uses the Seed-VC model (remote Gradio space) for voice conversion
        - Processing time depends on diffusion steps and audio length
    """
    # Every other *_mcp tool in this module catches exceptions and returns
    # an error string instead of raising; do the same here so the Gradio/MCP
    # layer gets a value rather than an unhandled exception.
    try:
        return replace_voice_wrapper(
            source_audio_path=source_audio_path,
            target_audio_path=target_audio_path,
            diffusion_steps=diffusion_steps,
            length_adjust=length_adjust,
            inference_cfg_rate=inference_cfg_rate,
            f0_condition=f0_condition,
            auto_f0_adjust=auto_f0_adjust,
            pitch_shift=pitch_shift,
        )
    except Exception as e:
        return f"Error replacing voice: {str(e)}"
1356
+
1357
+
1358
  def create_interface() -> gr.TabbedInterface:
1359
  """
1360
  Create and configure the complete Gradio interface with all audio processing tools.
 
1887
  flagging_mode="never",
1888
  )
1889
 
1890
+ # Tab 20: Audio Cleaning
1891
+ cleaning_interface = gr.Interface(
1892
+ fn=remove_noise_mcp,
1893
+ inputs=[
1894
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
1895
+ gr.Dropdown(
1896
+ choices=["general", "hiss", "hum", "rumble", "background"],
1897
+ value="general",
1898
+ label="Noise Type",
1899
+ ),
1900
+ gr.Slider(
1901
+ minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Sensitivity"
1902
+ ),
1903
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
1904
+ ],
1905
+ outputs=gr.Audio(label="Cleaned Audio", type="filepath"),
1906
+ title="Audio Noise Removal",
1907
+ description="Remove various types of noise from audio using adaptive filtering and spectral subtraction.",
1908
+ examples=None,
1909
+ cache_examples=False,
1910
+ flagging_mode="never",
1911
+ )
1912
+
1913
+ # Tab 21: Insert Section
1914
+ insert_interface = gr.Interface(
1915
+ fn=insert_section_mcp,
1916
+ inputs=[
1917
+ gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
1918
+ gr.Audio(type="filepath", label="Section to Insert", sources=["upload"]),
1919
+ gr.Number(value=5.0, label="Insert Time (seconds)"),
1920
+ gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
1921
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
1922
+ ],
1923
+ outputs=gr.Audio(label="Audio with Insertion", type="filepath"),
1924
+ title="Insert Audio Section",
1925
+ description="Insert a section from one audio track into another at a precise time position.",
1926
+ examples=None,
1927
+ cache_examples=False,
1928
+ flagging_mode="never",
1929
+ )
1930
+
1931
+ # Tab 22: Replace Section
1932
+ replace_interface = gr.Interface(
1933
+ fn=replace_section_mcp,
1934
+ inputs=[
1935
+ gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
1936
+ gr.Number(value=60.0, label="Start Time (seconds)"),
1937
+ gr.Number(value=90.0, label="End Time (seconds)"),
1938
+ gr.Audio(type="filepath", label="Replacement Section", sources=["upload"]),
1939
+ gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
1940
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
1941
+ ],
1942
+ outputs=gr.Audio(label="Audio with Replacement", type="filepath"),
1943
+ title="Replace Audio Section",
1944
+ description="Replace a section of an audio track with another audio segment.",
1945
+ examples=None,
1946
+ cache_examples=False,
1947
+ flagging_mode="never",
1948
+ )
1949
+
1950
+ # Tab 23: Voice Replacement
1951
+ voice_replacement_interface = gr.Interface(
1952
+ fn=replace_voice_mcp,
1953
+ inputs=[
1954
+ gr.Audio(
1955
+ type="filepath",
1956
+ label="Source Audio (voice to be replaced)",
1957
+ sources=["upload"],
1958
+ ),
1959
+ gr.Audio(
1960
+ type="filepath", label="Target Audio (voice to use)", sources=["upload"]
1961
+ ),
1962
+ gr.Number(value=10, label="Diffusion Steps", minimum=1, maximum=50),
1963
+ gr.Number(value=1.0, label="Length Adjust", minimum=0.1, maximum=3.0),
1964
+ gr.Number(value=0.7, label="Inference CFG Rate", minimum=0.0, maximum=1.0),
1965
+ gr.Checkbox(value=False, label="F0 Condition"),
1966
+ gr.Checkbox(value=True, label="Auto F0 Adjust"),
1967
+ gr.Number(
1968
+ value=0, label="Pitch Shift (semitones)", minimum=-12, maximum=12
1969
+ ),
1970
+ ],
1971
+ outputs=gr.Audio(label="Voice-Replaced Audio", type="filepath"),
1972
+ title="Voice Replacement with Seed-VC",
1973
+ description="Replace voice in source audio with voice from target audio using Seed-VC AI model.",
1974
+ examples=None,
1975
+ cache_examples=False,
1976
+ flagging_mode="never",
1977
+ )
1978
+
1979
  return gr.TabbedInterface(
1980
  [
1981
  stem_interface,
 
2002
  structure_interface,
2003
  cutting_points_interface,
2004
  genre_interface,
2005
+ cleaning_interface,
2006
+ insert_interface,
2007
+ replace_interface,
2008
+ voice_replacement_interface,
2009
  ],
2010
  [
2011
  "Stem Separation",
 
2032
  "Song Structure",
2033
  "Cutting Points",
2034
  "Genre Analysis",
2035
+ "Audio Cleaning",
2036
+ "Insert Section",
2037
+ "Replace Section",
2038
+ "Voice Replacement",
2039
  ],
2040
  )
2041
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  librosa>=0.10.0
2
  numpy>=1.24.0
 
3
  torch~=2.8.0
4
  torchaudio~=2.8.0
5
  torchcodec~=0.8.0
 
1
  librosa>=0.10.0
2
  numpy>=1.24.0
3
+ scipy>=1.10.0
4
  torch~=2.8.0
5
  torchaudio~=2.8.0
6
  torchcodec~=0.8.0
tools/audio_cleaning.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from typing import Optional
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from scipy.signal import butter, lfilter, filtfilt
9
+
10
+
11
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Read *audio_path* at its native sample rate with high-quality resampling.

    Returns the sample array (channel layout per ``mono``) and the sample
    rate as a plain ``int``.
    """
    samples, rate = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    return samples, int(rate)
15
+
16
+
17
def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
    """
    Analyze audio to estimate its noise characteristics.

    Args:
        audio: Audio data as a numpy array. Mono 1-D, or multi-channel with
            channels on the first axis (channels are averaged for analysis).
        sample_rate: Sample rate of the audio in Hz.

    Returns:
        Dictionary with scalar entries 'noise_floor', 'steady_noise',
        'hiss_level', 'snr_estimate' (dB) and 'has_significant_noise' (bool).
    """
    # The profile is global, so analyze a mono mix of the channels.
    mono = audio if audio.ndim == 1 else np.mean(audio, axis=0)

    # Magnitude spectrogram, shape (freq_bins, frames).
    stft = librosa.stft(mono, n_fft=2048, hop_length=512)
    magnitude = np.abs(stft)

    # Noise floor estimated from the quietest 10% of all bins.
    noise_floor = float(np.percentile(magnitude, 10))

    # Frequencies index axis 0 of the spectrogram, so band masks must be
    # applied to the FIRST axis (the original masked the frame axis, which
    # raises a shape error and measured nothing).
    freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=2048)

    # Steady low-frequency content (rumble/hum) below 200 Hz.
    low_freq_mask = freqs < 200
    steady_noise = float(np.mean(magnitude[low_freq_mask, :]))

    # Hiss: broadband content above 4 kHz (may be empty at low sample rates).
    high_freq_mask = freqs > 4000
    hiss_level = (
        float(np.mean(magnitude[high_freq_mask, :])) if np.any(high_freq_mask) else 0.0
    )

    # Rough SNR: mean frame power versus the quietest frames. (The original
    # subtracted signal power from itself, so noise power was always zero.)
    frame_power = np.mean(magnitude**2, axis=0)
    signal_power = float(np.mean(frame_power))
    noise_power = float(np.percentile(frame_power, 10))
    snr_estimate = float(10 * np.log10(signal_power / (noise_power + 1e-10)))

    return {
        "noise_floor": noise_floor,
        "steady_noise": steady_noise,
        "hiss_level": hiss_level,
        "snr_estimate": snr_estimate,
        "has_significant_noise": bool(
            steady_noise > noise_floor * 2 or hiss_level > noise_floor * 1.5
        ),
    }
58
+
59
+
60
def spectral_subtraction(
    audio: np.ndarray, noise_profile: dict, sample_rate: int
) -> np.ndarray:
    """
    Apply a soft spectral gate that attenuates noise-floor-level content.

    Args:
        audio: Input audio data
        noise_profile: Noise profile from detect_noise_profile()
        sample_rate: Sample rate of audio (kept for API symmetry)

    Returns:
        Cleaned audio data with the same length as the input.
    """
    # STFT of the input audio.
    stft = librosa.stft(audio, n_fft=2048, hop_length=512)
    magnitude = np.abs(stft)
    phase = np.angle(stft)

    # Gate opens (-> 1) for bins at or below the noise floor and closes
    # (-> 0) for strong bins. The original gate was inverted: it attenuated
    # the loudest bins the most and left noise-floor bins untouched.
    noise_gate = 1.0 - np.minimum(
        magnitude / (noise_profile["noise_floor"] + 1e-10), 1.0
    )

    # Gentle reduction; stronger when the profile flagged significant noise.
    reduction_factor = 0.3 if noise_profile["has_significant_noise"] else 0.15
    cleaned_magnitude = magnitude * (1 - noise_gate * reduction_factor)

    # Reconstruct with the original phase. Pin the output length to the
    # input so callers can mix the result sample-for-sample with the
    # original signal (istft otherwise returns slightly fewer samples).
    cleaned_stft = cleaned_magnitude * np.exp(1j * phase)
    cleaned_audio = librosa.istft(
        cleaned_stft, hop_length=512, length=audio.shape[-1]
    )

    return cleaned_audio
91
+
92
+
93
def adaptive_filter(
    audio: np.ndarray, sample_rate: int, noise_type: str = "general"
) -> np.ndarray:
    """
    Apply adaptive filtering based on noise type.

    Args:
        audio: Input audio data
        sample_rate: Sample rate of audio
        noise_type: Type of noise to address ('general', 'hiss', 'hum', 'background')

    Returns:
        Filtered audio data
    """
    if noise_type == "hiss":
        # Hiss lives above ~4 kHz, so LOW-pass and keep the band below the
        # cutoff. (The original used a high-pass here, which removed the
        # signal and kept the hiss.) Clamp below Nyquist for low sample rates.
        cutoff = min(4000, int(sample_rate * 0.45))
        b, a = butter(4, cutoff, fs=sample_rate, btype="low", output="ba")
        filtered_audio = lfilter(b, a, audio)

    elif noise_type == "hum":
        # Narrow band-stop (+/-10%) around common power-line fundamentals
        # and harmonics.
        filtered_audio = audio.copy()
        hum_freqs = [50, 60, 100, 120, 180, 240]  # Common power line harmonics

        for freq in hum_freqs:
            # The UPPER band edge must stay below Nyquist for a valid filter.
            if freq * 1.1 < sample_rate / 2:
                b, a = butter(
                    2,
                    [freq * 0.9, freq * 1.1],
                    fs=sample_rate,
                    btype="bandstop",
                    output="ba",
                )
                filtered_audio = lfilter(b, a, filtered_audio)

    elif noise_type == "background":
        # Spectral subtraction driven by an estimated noise profile.
        noise_profile = detect_noise_profile(audio, sample_rate)
        filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)

    else:
        # General broadband noise reduction: gentle low-pass.
        cutoff = int(min(8000, sample_rate // 2.5))
        b, a = butter(4, cutoff, fs=sample_rate, btype="low", output="ba")
        filtered_audio = lfilter(b, a, audio)

    return filtered_audio
144
+
145
+
146
def remove_noise(
    audio_path: str,
    noise_type: str = "general",
    sensitivity: float = 0.5,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Remove noise from audio using adaptive filtering and spectral subtraction.

    Applies a noise-type-specific reduction technique, scaled by the
    sensitivity setting, then peak-normalizes and writes the result.

    Args:
        audio_path: Path to the audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
            - 'general': Broadband noise reduction
            - 'hiss': High-frequency noise removal
            - 'hum': Power line hum removal (50/60 Hz)
            - 'rumble': Low-frequency rumble removal
            - 'background': General background noise
        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5).
            Higher values remove more noise but may affect audio quality.
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the cleaned audio file

    Raises:
        RuntimeError: If loading, filtering, or writing fails (the original
            exception is chained as the cause).

    Examples:
        >>> remove_noise("noisy_recording.wav", "hiss", 0.7, "output", "wav")
        # Returns 'path/to/noisy_recording_hiss_removed.wav' with reduced hiss

        >>> remove_noise("podcast.mp3", "background", 0.3, "output", "mp3")
        # Returns 'path/to/podcast_background_removed.mp3' with reduced background noise

    Note:
        - Preserves the original sample rate; works with mono or stereo files
        - Output is peak-normalized to 0.95 full scale
    """
    try:
        # Load at the native sample rate, preserving the channel layout.
        audio, sample_rate = _load_audio(audio_path, mono=False)
        nyquist = sample_rate / 2

        if noise_type == "hiss":
            # Hiss is HIGH-frequency noise, so LOW-pass and keep the band
            # below the cutoff (the original high-pass removed the signal
            # and kept the hiss). Higher sensitivity lowers the cutoff,
            # removing more of the top end; clamp below Nyquist.
            cutoff = min(4000 - sensitivity * 2000, nyquist * 0.95)  # 2000-4000 Hz range
            b, a = butter(4, cutoff, fs=sample_rate, btype="low", output="ba")
            filtered_audio = filtfilt(b, a, audio)

        elif noise_type == "hum":
            # Narrow band-stop filters on power-line fundamentals and their
            # first few harmonics.
            filtered_audio = audio.copy()
            fundamental_freqs = [50, 60, 100]  # Common power line fundamentals

            for fundamental in fundamental_freqs:
                for harmonic in range(1, 6):
                    freq = fundamental * harmonic
                    # The UPPER band edge must stay below Nyquist.
                    if freq * 1.05 < nyquist:
                        b, a = butter(
                            2,
                            [freq * 0.95, freq * 1.05],
                            fs=sample_rate,
                            btype="bandstop",
                            output="ba",
                        )
                        filtered_audio = filtfilt(b, a, filtered_audio)

        elif noise_type == "rumble":
            # Rumble is low-frequency, so high-pass; higher sensitivity
            # raises the cutoff and removes more of the low end.
            cutoff = 20 + sensitivity * 80  # 20-100 Hz range
            b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
            filtered_audio = filtfilt(b, a, audio)

        else:  # background or general
            # Spectral-gate the audio, then blend with the original.
            # NOTE(review): detect_noise_profile/spectral_subtraction are
            # assumed to accept the channel layout _load_audio returns.
            noise_profile = detect_noise_profile(audio, sample_rate)
            filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)
            # Higher sensitivity keeps MORE of the cleaned signal. (The
            # original blend was inverted: raising sensitivity mixed more
            # of the noisy input back in.)
            strength = 0.2 + sensitivity * 0.6
            # istft output can be slightly shorter than the input; align the
            # lengths before mixing to avoid a broadcasting error.
            m = min(filtered_audio.shape[-1], audio.shape[-1])
            filtered_audio = (
                strength * filtered_audio[..., :m]
                + (1 - strength) * audio[..., :m]
            )

        # Peak-normalize to 0.95 full scale.
        max_val = np.max(np.abs(filtered_audio))
        if max_val > 0:
            filtered_audio = filtered_audio / max_val * 0.95

        # Resolve the output directory (temp dir when none is given).
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_noise_removed")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Name the output after the input plus the noise type.
        input_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{input_filename}_{noise_type}_removed.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        # soundfile expects (frames, channels), hence the transpose.
        sf.write(output_file, filtered_audio.T, sample_rate)

        return output_file

    except Exception as e:
        raise RuntimeError(f"Error removing noise: {str(e)}") from e
tools/audio_cutting.py CHANGED
@@ -43,7 +43,6 @@ def cut_audio(
43
  # Get audio duration
44
  duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
45
 
46
-
47
  if start_time >= end_time:
48
  raise ValueError(
49
  f"Start time ({start_time}s) must be less than end time ({end_time}s)"
 
43
  # Get audio duration
44
  duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
45
 
 
46
  if start_time >= end_time:
47
  raise ValueError(
48
  f"Start time ({start_time}s) must be less than end time ({end_time}s)"
tools/audio_insertion.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from typing import Optional
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+
9
+
10
def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
    """Load an audio file at its native sample rate.

    Args:
        audio_path: Path or URL of the audio file.
        mono: If True, downmix to a single channel.

    Returns:
        Tuple of (samples, sample_rate); samples are float arrays as
        produced by librosa (1-D for mono, (channels, n) otherwise).
    """
    samples, rate = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
    return samples, int(rate)
14
+
15
+
16
def detect_crossfade_point(
    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
) -> tuple[float, float]:
    """
    Compute the crossfade window centred on an insertion point.

    The window extends half the crossfade before and half after the
    insertion position, clamped to the valid [0, audio_duration] range.

    Args:
        insert_position: Where the section will be inserted (seconds)
        audio_duration: Total duration of the target audio (seconds)
        crossfade_duration: Length of the crossfade (seconds)

    Returns:
        Tuple of (start_time, end_time) for the crossfade region
    """
    half_fade = crossfade_duration / 2

    # Clamp both edges so the window never leaves the audio.
    window_start = max(0, insert_position - half_fade)
    window_end = min(audio_duration, insert_position + half_fade)

    return window_start, window_end
35
+
36
+
37
def apply_crossfade(
    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
) -> np.ndarray:
    """
    Join `section` to the front of `target` with a linear crossfade.

    The last `crossfade_duration` seconds of `section` fade out while the
    first `crossfade_duration` seconds of `target` fade in; the two fades
    overlap, so the result has
    ``section_length + target_length - fade_samples`` samples.

    Fixes over the previous version: the whole section is kept (before,
    only its fade tail was inserted and the rest silently dropped), the
    junction is where the caller spliced the audio (not a hard-coded
    midpoint), the envelope is a true complementary crossfade instead of a
    ``fade_in * fade_out`` bump, inputs are no longer mutated, and all
    slicing uses the last (time) axis so (channels, n) stereo works.

    Args:
        section: Audio to place first (1-D mono or (channels, n) layout)
        target: Audio to place after the section (same layout as `section`)
        crossfade_duration: Length of crossfade in seconds
        sample_rate: Sample rate of both signals

    Returns:
        New array with the section followed by the target and a smooth
        transition at the junction. Inputs are left unmodified.
    """
    # Clamp the fade so it never exceeds either signal.
    fade_samples = int(crossfade_duration * sample_rate)
    fade_samples = min(fade_samples, section.shape[-1], target.shape[-1])

    if fade_samples <= 0:
        # Nothing to fade: plain concatenation.
        return np.concatenate([section, target], axis=-1)

    fade_out = np.linspace(1.0, 0.0, fade_samples)
    fade_in = np.linspace(0.0, 1.0, fade_samples)

    # Overlap-add the fade region; the 1-D envelopes broadcast over the
    # last axis for both mono and multi-channel audio.
    overlap = section[..., -fade_samples:] * fade_out + target[..., :fade_samples] * fade_in

    head = section[..., : section.shape[-1] - fade_samples]
    tail = target[..., fade_samples:]

    return np.concatenate([head, overlap, tail], axis=-1)
69
+
70
+
71
def insert_section(
    audio_path: str,
    section_path: str,
    insert_time: float,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert a section from one audio track into another at a precise time position.

    This function allows you to insert audio content (like an intro, advertisement,
    or sound effect) into an existing track at any position with smooth
    fades to avoid audible clicks or abrupt transitions.

    Args:
        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
        insert_time: Position to insert the section (in seconds from start of main audio)
        crossfade_duration: Length of the fade applied to both ends of the
            inserted section in seconds (default: 0.1). Longer fades create
            smoother transitions but reduce clarity
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section inserted

    Raises:
        RuntimeError: If loading, processing, or saving fails (wraps the
            original error, including ValueError for invalid insert times)

    Note:
        - Insert position is measured from the start of the main audio
        - The full section is inserted, so the output is longer than the input
        - Works with mono or stereo audio files; a mono section is duplicated
          across channels when the main audio is stereo (and vice versa)
        - Preserves original audio quality and sample rate
    """
    try:
        # Load both audio files (librosa: 1-D for mono, (channels, n) for stereo)
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        section_audio, section_sr = _load_audio(section_path, mono=False)

        # Resample the section if its rate differs from the main audio
        if main_sr != section_sr:
            section_audio = librosa.resample(
                section_audio, orig_sr=section_sr, target_sr=main_sr
            )

        # Match channel layouts: duplicate mono onto the other signal's channels
        # (the previous version crashed on mixed mono/stereo inputs)
        if main_audio.ndim != section_audio.ndim:
            if section_audio.ndim == 1:
                section_audio = np.tile(section_audio, (main_audio.shape[0], 1))
            else:
                main_audio = np.tile(main_audio, (section_audio.shape[0], 1))

        # Duration from the time axis (len() measured channels for stereo before)
        main_duration = main_audio.shape[-1] / main_sr

        # Validate insert position
        if insert_time < 0:
            raise ValueError("Insert time must be positive")
        if insert_time > main_duration:
            raise ValueError(
                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
            )

        # Split the main audio at the insertion point on the time axis
        insert_sample = int(insert_time * main_sr)
        main_before = main_audio[..., :insert_sample]
        main_after = main_audio[..., insert_sample:]

        # Fade the section in at its start and out at its end so both
        # junctions are click-free. Copy first so the input array is not
        # mutated; clamp the fade to the section length.
        fade_samples = min(int(crossfade_duration * main_sr), section_audio.shape[-1])
        section = np.array(section_audio, copy=True)
        if fade_samples > 0:
            section[..., :fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
            section[..., -fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)

        # Splice: before + full section + after (previously only the fade
        # tail of the section survived, inserted at a hard-coded midpoint)
        final_audio = np.concatenate([main_before, section, main_after], axis=-1)

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Generate output filename
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_insertion.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        # soundfile expects (samples, channels); .T is a no-op for 1-D mono
        sf.write(output_file, final_audio.T, main_sr)

        return output_file

    except Exception as e:
        raise RuntimeError(f"Error inserting audio section: {str(e)}")
168
+
169
+
170
def insert_multiple_sections(
    audio_path: str,
    sections: list[tuple[str, float]],
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Insert multiple sections into an audio track at specified positions.

    This function allows inserting multiple audio sections (like multiple ads,
    sound effects, or musical segments) into a main track with smooth
    transitions around each insertion.

    Args:
        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        sections: List of (section_path, insert_time) tuples
            section_path: Path to audio section to insert
            insert_time: Position to insert section (in seconds)
            Tuples with extra trailing items are tolerated; only the first
            two entries are used
        crossfade_duration: Length of the fade applied to both ends of each
            inserted section in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with all sections inserted

    Raises:
        RuntimeError: If loading, processing, or saving fails

    Note:
        - Sections are inserted in chronological order
        - Insert times past the current end of the audio are clamped to the end
        - Each insertion lengthens the track, so later insert times refer to
          the already-extended audio
        - Works with mono or stereo files; mixed layouts are reconciled by
          duplicating mono content across channels
    """
    try:
        # Load main audio (1-D for mono, (channels, n) for stereo)
        current_audio, main_sr = _load_audio(audio_path, mono=False)

        # Sort by insert time. NOTE: the docstring examples pass 2-tuples;
        # the previous loop unpacked 3-tuples and always raised ValueError.
        ordered_sections = sorted(sections, key=lambda entry: entry[1])

        for entry in ordered_sections:
            section_path, insert_time = entry[0], entry[1]

            # Load and, if needed, resample the section
            section_audio, section_sr = _load_audio(section_path, mono=False)
            if section_sr != main_sr:
                section_audio = librosa.resample(
                    section_audio, orig_sr=section_sr, target_sr=main_sr
                )

            # Reconcile mono/stereo layouts before splicing
            if current_audio.ndim != section_audio.ndim:
                if section_audio.ndim == 1:
                    section_audio = np.tile(section_audio, (current_audio.shape[0], 1))
                else:
                    current_audio = np.tile(current_audio, (section_audio.shape[0], 1))

            # Clamp the insert position to the current (growing) duration
            current_duration = current_audio.shape[-1] / main_sr
            clamped_time = min(max(insert_time, 0.0), current_duration)
            insert_sample = int(clamped_time * main_sr)

            # Fade the section in/out so both junctions are click-free;
            # copy to avoid mutating the loaded array
            fade_samples = min(
                int(crossfade_duration * main_sr), section_audio.shape[-1]
            )
            section = np.array(section_audio, copy=True)
            if fade_samples > 0:
                section[..., :fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
                section[..., -fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)

            # Splice along the time axis (the previous version sliced the
            # channel axis for stereo and dropped most of the section)
            current_audio = np.concatenate(
                [
                    current_audio[..., :insert_sample],
                    section,
                    current_audio[..., insert_sample:],
                ],
                axis=-1,
            )

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Generate output filename
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        # soundfile expects (samples, channels); .T is a no-op for mono
        sf.write(output_file, current_audio.T, main_sr)

        return output_file

    except Exception as e:
        raise RuntimeError(f"Error inserting multiple sections: {str(e)}")
268
+
269
+
270
def replace_section(
    audio_path: str,
    start_time: float,
    end_time: float,
    replacement_path: str,
    crossfade_duration: float = 0.1,
    output_path: Optional[str] = None,
    output_format: str = "wav",
) -> str:
    """
    Replace a section of an audio track with another audio segment.

    This function removes a specified time range from the main audio and
    replaces it with new content, using fades at both boundaries for smooth
    transitions.

    Args:
        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
        start_time: Start time of section to replace (in seconds)
        end_time: End time of section to replace (in seconds)
        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
        crossfade_duration: Length of the boundary fades in seconds (default: 0.1)
        output_path: Optional output directory (default: None, uses temp directory)
        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')

    Returns:
        Path to the audio file with the section replaced

    Raises:
        RuntimeError: If loading, processing, or saving fails (wraps the
            original error, including ValueError for start_time >= end_time)

    Note:
        - Start time must be less than end time
        - The replacement is trimmed if longer than the replaced range
        - Fades are clamped so they never overlap on very short replacements
        - Works with mono or stereo audio; mixed layouts are reconciled by
          duplicating mono content across channels
    """
    try:
        # Load both audio files (1-D mono or (channels, n) stereo)
        main_audio, main_sr = _load_audio(audio_path, mono=False)
        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)

        # Validate timing
        if start_time >= end_time:
            raise ValueError("Start time must be less than end time")

        # Resample replacement if needed
        if replacement_sr != main_sr:
            replacement_audio = librosa.resample(
                replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
            )

        # Reconcile mono/stereo layouts before splicing
        if main_audio.ndim != replacement_audio.ndim:
            if replacement_audio.ndim == 1:
                replacement_audio = np.tile(
                    replacement_audio, (main_audio.shape[0], 1)
                )
            else:
                main_audio = np.tile(main_audio, (replacement_audio.shape[0], 1))

        # Convert times to samples and split along the time axis (the
        # previous version sliced the channel axis for stereo input)
        start_sample = int(start_time * main_sr)
        end_sample = int(end_time * main_sr)
        main_before = main_audio[..., :start_sample]
        main_after = main_audio[..., end_sample:]

        # Trim replacement to the replaced range; copy to avoid mutating
        # the loaded array when the fades are applied below
        replacement_samples = end_sample - start_sample
        replacement = np.array(
            replacement_audio[..., :replacement_samples], copy=True
        )

        # Fade in/out at the boundaries; clamp to half the replacement
        # length so the two fades never overlap (the previous envelopes
        # mis-broadcast on stereo and overflowed short replacements)
        fade_samples = min(
            int(crossfade_duration * main_sr), replacement.shape[-1] // 2
        )
        if fade_samples > 0:
            replacement[..., :fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
            replacement[..., -fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)

        # Combine all parts along the time axis
        final_audio = np.concatenate([main_before, replacement, main_after], axis=-1)

        # Resolve the output directory
        if output_path is None:
            output_path = tempfile.mkdtemp(suffix="_replaced")
        else:
            os.makedirs(output_path, exist_ok=True)

        # Generate output filename
        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
        output_filename = f"{main_filename}_replaced.{output_format}"
        output_file = os.path.join(output_path, output_filename)

        # soundfile expects (samples, channels); .T is a no-op for mono
        sf.write(output_file, final_audio.T, main_sr)

        return output_file

    except Exception as e:
        raise RuntimeError(f"Error replacing audio section: {str(e)}")
tools/combine_tracks.py CHANGED
@@ -305,7 +305,10 @@ def create_medley(
305
 
306
  if output_path is None:
307
  tmp_dir = tempfile.mkdtemp(prefix="mcp-medley-")
308
- output = Path(tmp_dir) / f"{vocals.name}_{instrumental.name}_medley.{medley_extension}"
 
 
 
309
  else:
310
  output = Path(output_path).expanduser().resolve()
311
  output.parent.mkdir(parents=True, exist_ok=True)
 
305
 
306
  if output_path is None:
307
  tmp_dir = tempfile.mkdtemp(prefix="mcp-medley-")
308
+ output = (
309
+ Path(tmp_dir)
310
+ / f"{vocals.name}_{instrumental.name}_medley.{medley_extension}"
311
+ )
312
  else:
313
  output = Path(output_path).expanduser().resolve()
314
  output.parent.mkdir(parents=True, exist_ok=True)
tools/voice_replacement.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from gradio_client import Client, handle_file
6
+
7
+ from tools.audio_info import validate_audio_path
8
+
9
+
10
def replace_voice(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    Replace the voice in a source recording with a target voice using Seed-VC.

    Delegates the conversion to the hosted Seed-VC Gradio space: the source
    audio keeps its linguistic content and timing while taking on the voice
    characteristics of the target audio.

    Args:
        source_audio_path: Path to the source audio file (voice to be replaced)
        target_audio_path: Path to the target audio file (voice to use)
        diffusion_steps: Number of diffusion steps for inference (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
        f0_condition: Whether to use F0 conditioning (default: False)
        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to the generated voice-replaced audio file

    Raises:
        RuntimeError: On any failure (invalid inputs, parameter errors, or
            remote inference problems) — the original error is wrapped
    """
    try:
        # Resolve and validate both inputs up front
        src_path = validate_audio_path(source_audio_path)
        tgt_path = validate_audio_path(target_audio_path)

        # Parameter sanity checks (ranges mirror the Seed-VC space UI)
        if not (1 <= diffusion_steps <= 50):
            raise ValueError("diffusion_steps must be between 1 and 50")
        if length_adjust <= 0:
            raise ValueError("length_adjust must be positive")
        if not (0 <= inference_cfg_rate <= 1):
            raise ValueError("inference_cfg_rate must be between 0 and 1")
        if not (-12 <= pitch_shift <= 12):
            raise ValueError("pitch_shift must be between -12 and 12 semitones")

        # Run voice conversion on the remote Seed-VC space
        result = Client("Plachta/Seed-VC").predict(
            source_audio_path=handle_file(src_path),
            target_audio_path=handle_file(tgt_path),
            diffusion_steps=diffusion_steps,
            length_adjust=length_adjust,
            inference_cfg_rate=inference_cfg_rate,
            f0_condition=f0_condition,
            auto_f0_adjust=auto_f0_adjust,
            pitch_shift=pitch_shift,
            api_name="/predict_1",
        )

        # Ensure the local output directory exists
        out_dir = Path("output")
        out_dir.mkdir(exist_ok=True)

        # Timestamped output name built from both input stems
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_file = out_dir / (
            f"{Path(src_path).stem}_voice_replaced_by_{Path(tgt_path).stem}_{stamp}.wav"
        )

        if isinstance(result, str) and os.path.exists(result):
            # The client returned a file path — copy it into our output dir
            import shutil

            shutil.copy2(result, out_file)
        else:
            # Otherwise treat the result as raw audio data and write it out
            import soundfile as sf

            sf.write(str(out_file), result, 22050)

        return str(out_file)

    except Exception as e:
        raise RuntimeError(f"Voice replacement failed: {str(e)}")
105
+
106
+
107
def replace_voice_wrapper(
    source_audio_path: str,
    target_audio_path: str,
    diffusion_steps: int = 10,
    length_adjust: float = 1.0,
    inference_cfg_rate: float = 0.7,
    f0_condition: bool = False,
    auto_f0_adjust: bool = True,
    pitch_shift: int = 0,
) -> str:
    """
    MCP-facing wrapper around `replace_voice` that reports errors as strings.

    All parameters are forwarded unchanged to `replace_voice`; any exception
    it raises is converted into an "Error: ..." message instead of
    propagating, which is what the MCP integration expects.

    Args:
        source_audio_path: Path to the source audio file
        target_audio_path: Path to the target audio file
        diffusion_steps: Number of diffusion steps (default: 10)
        length_adjust: Length adjustment factor (default: 1.0)
        inference_cfg_rate: CFG rate (default: 0.7)
        f0_condition: Use F0 conditioning (default: False)
        auto_f0_adjust: Auto-adjust F0 (default: True)
        pitch_shift: Pitch shift in semitones (default: 0)

    Returns:
        Path to the generated audio file, or an error message on failure
    """
    try:
        return replace_voice(
            source_audio_path,
            target_audio_path,
            diffusion_steps,
            length_adjust,
            inference_cfg_rate,
            f0_condition,
            auto_f0_adjust,
            pitch_shift,
        )
    except Exception as e:
        return f"Error: {str(e)}"