Spaces: Running on CPU Upgrade

Commit · a5d8e64
1 Parent(s): 122c63e
add new tools

Browse files
- mcp_server.py +322 -6
- requirements.txt +1 -0
- tools/audio_cleaning.py +258 -0
- tools/audio_cutting.py +0 -1
- tools/audio_insertion.py +374 -0
- tools/combine_tracks.py +4 -1
- tools/voice_replacement.py +145 -0
mcp_server.py CHANGED

@@ -25,6 +25,12 @@ from tools.music_understanding import (
    suggest_cutting_points,
    analyze_genre_and_style,
)
+from tools.audio_cleaning import remove_noise
+from tools.audio_insertion import (
+    insert_section,
+    replace_section,
+)
+from tools.voice_replacement import replace_voice_wrapper


def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:

@@ -112,7 +118,7 @@ def stretch_audio_to_bpm_wrapper(audio_path: str, target_bpm: float) -> str:

def extract_selected_stems_wrapper(
    audio_path: str, vocals: bool, drums: bool, bass: bool, other: bool
-) -> Tuple[str|None, str|None, str|None, str|None]:
+) -> Tuple[str | None, str | None, str | None, str | None]:
    """
    Extract selected stems from an audio file based on user choices.

@@ -157,7 +163,7 @@ def extract_selected_stems_wrapper(
    if not stems_to_extract:
        raise ValueError("At least one stem must be selected for extraction")

-    results= extract_selected_stems(audio_path, stems_to_extract)
+    results = extract_selected_stems(audio_path, stems_to_extract)

    vocals = results.get("vocals")
    drums = results.get("drums")

@@ -268,7 +274,7 @@ def mute_time_windows_wrapper(

def extract_segments_wrapper(
    audio_path: str, segments_str: str, format_val: str, join: bool
-) -> Tuple[str, str|None, str|None, str|None]:
+) -> Tuple[str, str | None, str | None, str | None]:
    """
    Extract multiple segments (up to 4 segments) from an audio file and optionally join them.

@@ -682,7 +688,9 @@ def shift_to_key_wrapper(
# MCP Tool Wrappers with Documentation for MCP Server


-def separate_audio_mcp(
+def separate_audio_mcp(
+    audio_path: str, output_format: str = "wav"
+) -> Tuple[str, str, str, str]:
    """
    Separate audio into vocals, drums, bass, and other stems using Demucs neural network.

@@ -715,7 +723,12 @@ def separate_audio_mcp(audio_path: str, output_format: str = "wav") -> Tuple[str
        )
        return vocals, drums, bass, other
    except Exception as e:
-        return
+        return (
+            f"Error separating audio: {str(e)}",
+            f"Error: {str(e)}",
+            f"Error: {str(e)}",
+            f"Error: {str(e)}",
+        )


def combine_tracks_mcp(

@@ -777,7 +790,9 @@ def combine_tracks_mcp(
        return f"Error combining tracks: {str(e)}"


-def pitch_shift_with_semitones_mcp(
+def pitch_shift_with_semitones_mcp(
+    audio_path: str, semitones: int, output_format: str = "wav"
+) -> str:
    """
    Shift the pitch of an audio file by a specified number of semitones.

@@ -855,6 +870,7 @@ def align_songs_by_bpm_mcp(
        )
        # Apply target BPM by stretching both tracks
        from tools.time_strech import stretch_to_bpm
+
        aligned1 = stretch_to_bpm(result1, target_bpm, None, output_format)
        aligned2 = stretch_to_bpm(result2, target_bpm, None, output_format)
        return aligned1, aligned2

@@ -1136,6 +1152,209 @@ def analyze_genre_and_style_mcp(audio_path: str) -> str:
        return f"Error analyzing genre and style: {str(e)}"


+def remove_noise_mcp(
+    audio_path: str,
+    noise_type: str = "general",
+    sensitivity: float = 0.5,
+    output_format: str = "wav",
+) -> str:
+    """
+    Remove noise from audio using adaptive filtering and spectral subtraction.
+
+    This MCP wrapper provides noise removal capabilities for various types of
+    unwanted audio artifacts including hiss, hum, rumble, and general background noise.
+
+    Args:
+        audio_path: Path to the input audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
+        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5)
+        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the cleaned audio file
+
+    Examples:
+        >>> remove_noise_mcp("noisy_recording.wav", "hiss", 0.7, "wav")
+        # Returns path to cleaned audio with reduced hiss
+
+        >>> remove_noise_mcp("podcast.mp3", "background", 0.3, "mp3")
+        # Returns path to cleaned audio with reduced background noise
+
+    Note:
+        - Higher sensitivity values remove more noise but may affect audio quality
+        - Different noise types use specialized algorithms for optimal results
+        - Processing time varies with audio length and noise complexity
+    """
+    try:
+        result = remove_noise(
+            audio_path=audio_path,
+            noise_type=noise_type,
+            sensitivity=sensitivity,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error removing noise: {str(e)}"
+
+
+def insert_section_mcp(
+    audio_path: str,
+    section_path: str,
+    insert_time: float,
+    crossfade_duration: float = 0.1,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert a section from one audio track into another at a precise time position.
+
+    This MCP wrapper allows inserting audio content (like an intro, advertisement,
+    or sound effect) into an existing track at any position with smooth
+    crossfading to avoid audible clicks or abrupt transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
+        insert_time: Position to insert the section (in seconds from start of main audio)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section inserted
+
+    Examples:
+        >>> insert_section_mcp("main_track.wav", "intro.wav", 5.0, 0.2, "wav")
+        # Returns path to audio with intro inserted at 5 seconds
+
+        >>> insert_section_mcp("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "mp3")
+        # Returns path to audio with ad inserted at 3 minutes
+
+    Note:
+        - Insert position is measured from the start of the main audio
+        - Crossfade prevents clicks and creates smooth transitions
+        - If insert_time + section duration exceeds main audio duration, section is truncated
+    """
+    try:
+        result = insert_section(
+            audio_path=audio_path,
+            section_path=section_path,
+            insert_time=insert_time,
+            crossfade_duration=crossfade_duration,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error inserting audio section: {str(e)}"
+
+
+def replace_section_mcp(
+    audio_path: str,
+    start_time: float,
+    end_time: float,
+    replacement_path: str,
+    crossfade_duration: float = 0.1,
+    output_format: str = "wav",
+) -> str:
+    """
+    Replace a section of an audio track with another audio segment.
+
+    This MCP wrapper removes a specified time range from the main audio and
+    replaces it with new content, using crossfades for smooth transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        start_time: Start time of section to replace (in seconds)
+        end_time: End time of section to replace (in seconds)
+        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section replaced
+
+    Examples:
+        >>> replace_section_mcp("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "wav")
+        # Returns path to audio with 60-90s section replaced
+
+        >>> replace_section_mcp("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "mp3")
+        # Returns path to audio with 2-minute section replaced
+
+    Note:
+        - Start time must be less than end time
+        - Crossfade prevents clicks at replacement boundaries
+        - Replacement section is trimmed if longer than specified duration
+    """
+    try:
+        result = replace_section(
+            audio_path=audio_path,
+            start_time=start_time,
+            end_time=end_time,
+            replacement_path=replacement_path,
+            crossfade_duration=crossfade_duration,
+            output_path=None,
+            output_format=output_format,
+        )
+        return result
+    except Exception as e:
+        return f"Error replacing audio section: {str(e)}"
+
+
+def replace_voice_mcp(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Replace voice in source audio with voice from target audio using Seed-VC.
+
+    This MCP wrapper uses the Seed-VC Gradio space to perform voice conversion,
+    replacing the voice characteristics in the source audio with those from
+    the target audio while preserving the linguistic content and timing.
+
+    Args:
+        source_audio_path: Path to the source audio file (voice to be replaced)
+        target_audio_path: Path to the target audio file (voice to use)
+        diffusion_steps: Number of diffusion steps for inference (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
+        f0_condition: Whether to use F0 conditioning (default: False)
+        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to the generated voice-replaced audio file
+
+    Examples:
+        >>> replace_voice_mcp("source.wav", "target_voice.wav")
+        # Returns path to voice-replaced audio file
+
+        >>> replace_voice_mcp("speech.mp3", "singer.wav", diffusion_steps=15, pitch_shift=2)
+        # Returns path to voice-replaced audio with custom settings
+
+    Note:
+        - Uses Seed-VC model for high-quality voice conversion
+        - Preserves linguistic content and timing from source audio
+        - Applies voice characteristics from target audio
+        - Processing time depends on diffusion steps and audio length
+    """
+    return replace_voice_wrapper(
+        source_audio_path=source_audio_path,
+        target_audio_path=target_audio_path,
+        diffusion_steps=diffusion_steps,
+        length_adjust=length_adjust,
+        inference_cfg_rate=inference_cfg_rate,
+        f0_condition=f0_condition,
+        auto_f0_adjust=auto_f0_adjust,
+        pitch_shift=pitch_shift,
+    )
+
+
def create_interface() -> gr.TabbedInterface:
    """
    Create and configure the complete Gradio interface with all audio processing tools.

@@ -1668,6 +1887,95 @@ def create_interface() -> gr.TabbedInterface:
        flagging_mode="never",
    )

+    # Tab 20: Audio Cleaning
+    cleaning_interface = gr.Interface(
+        fn=remove_noise_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Dropdown(
+                choices=["general", "hiss", "hum", "rumble", "background"],
+                value="general",
+                label="Noise Type",
+            ),
+            gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.5, step=0.1, label="Sensitivity"
+            ),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Cleaned Audio", type="filepath"),
+        title="Audio Noise Removal",
+        description="Remove various types of noise from audio using adaptive filtering and spectral subtraction.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 21: Insert Section
+    insert_interface = gr.Interface(
+        fn=insert_section_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
+            gr.Audio(type="filepath", label="Section to Insert", sources=["upload"]),
+            gr.Number(value=5.0, label="Insert Time (seconds)"),
+            gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Audio with Insertion", type="filepath"),
+        title="Insert Audio Section",
+        description="Insert a section from one audio track into another at a precise time position.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 22: Replace Section
+    replace_interface = gr.Interface(
+        fn=replace_section_mcp,
+        inputs=[
+            gr.Audio(type="filepath", label="Main Audio File", sources=["upload"]),
+            gr.Number(value=60.0, label="Start Time (seconds)"),
+            gr.Number(value=90.0, label="End Time (seconds)"),
+            gr.Audio(type="filepath", label="Replacement Section", sources=["upload"]),
+            gr.Number(value=0.1, label="Crossfade Duration (seconds)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Audio with Replacement", type="filepath"),
+        title="Replace Audio Section",
+        description="Replace a section of an audio track with another audio segment.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
+    # Tab 23: Voice Replacement
+    voice_replacement_interface = gr.Interface(
+        fn=replace_voice_mcp,
+        inputs=[
+            gr.Audio(
+                type="filepath",
+                label="Source Audio (voice to be replaced)",
+                sources=["upload"],
+            ),
+            gr.Audio(
+                type="filepath", label="Target Audio (voice to use)", sources=["upload"]
+            ),
+            gr.Number(value=10, label="Diffusion Steps", minimum=1, maximum=50),
+            gr.Number(value=1.0, label="Length Adjust", minimum=0.1, maximum=3.0),
+            gr.Number(value=0.7, label="Inference CFG Rate", minimum=0.0, maximum=1.0),
+            gr.Checkbox(value=False, label="F0 Condition"),
+            gr.Checkbox(value=True, label="Auto F0 Adjust"),
+            gr.Number(
+                value=0, label="Pitch Shift (semitones)", minimum=-12, maximum=12
+            ),
+        ],
+        outputs=gr.Audio(label="Voice-Replaced Audio", type="filepath"),
+        title="Voice Replacement with Seed-VC",
+        description="Replace voice in source audio with voice from target audio using Seed-VC AI model.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+
    return gr.TabbedInterface(
        [
            stem_interface,

@@ -1694,6 +2002,10 @@ def create_interface() -> gr.TabbedInterface:
            structure_interface,
            cutting_points_interface,
            genre_interface,
+            cleaning_interface,
+            insert_interface,
+            replace_interface,
+            voice_replacement_interface,
        ],
        [
            "Stem Separation",

@@ -1720,6 +2032,10 @@ def create_interface() -> gr.TabbedInterface:
            "Song Structure",
            "Cutting Points",
            "Genre Analysis",
+            "Audio Cleaning",
+            "Insert Section",
+            "Replace Section",
+            "Voice Replacement",
        ],
    )

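The new wrappers are exposed both as Gradio tabs and as MCP tools. As a rough illustration of how a remote client might call one of them once the Space is running, here is a minimal sketch using gradio_client; the Space id and the auto-generated endpoint name are assumptions for illustration, not part of this commit.

# Minimal sketch of calling the new noise-removal wrapper from a client.
# Assumptions: the Space is reachable at SPACE_ID and Gradio exposes the
# wrapper under an endpoint named after the function ("/remove_noise_mcp");
# neither value is stated in this commit.
from gradio_client import Client, handle_file

SPACE_ID = "user/audio-tools"  # hypothetical Space id

client = Client(SPACE_ID)
cleaned_path = client.predict(
    handle_file("noisy_recording.wav"),  # audio_path
    "hiss",                              # noise_type
    0.7,                                 # sensitivity
    "wav",                               # output_format
    api_name="/remove_noise_mcp",        # assumed endpoint name
)
print(cleaned_path)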
requirements.txt CHANGED

@@ -1,5 +1,6 @@
librosa>=0.10.0
numpy>=1.24.0
+scipy>=1.10.0
torch~=2.8.0
torchaudio~=2.8.0
torchcodec~=0.8.0
tools/audio_cleaning.py ADDED

@@ -0,0 +1,258 @@
+import os
+import tempfile
+from typing import Optional
+
+import librosa
+import numpy as np
+import soundfile as sf
+from scipy.signal import butter, lfilter, filtfilt
+
+
+def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
+    """Load audio file with standard settings."""
+    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    return y, int(sr)
+
+
+def detect_noise_profile(audio: np.ndarray, sample_rate: int) -> dict:
+    """
+    Analyze audio to detect noise characteristics.
+
+    Args:
+        audio: Audio data as numpy array
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Dictionary with noise profile information
+    """
+    # Compute spectral features for noise detection
+    stft = librosa.stft(audio, n_fft=2048, hop_length=512)
+    magnitude = np.abs(stft)
+
+    # Identify noise floor (quiet parts)
+    noise_floor = np.percentile(magnitude, 10)
+
+    # Detect steady noise (consistent low-frequency content)
+    freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=2048)
+    low_freq_mask = freqs < 200  # Below 200 Hz
+    steady_noise = np.mean(magnitude[:, low_freq_mask], axis=1)
+
+    # Detect hiss (high frequency noise)
+    high_freq_mask = freqs > 4000  # Above 4 kHz
+    hiss_level = np.mean(magnitude[:, high_freq_mask], axis=1)
+
+    # Compute overall noise characteristics
+    signal_power = np.mean(magnitude**2, axis=1)
+    noise_power = np.mean(magnitude**2, axis=1) - signal_power
+    snr_estimate = 10 * np.log10(signal_power / (noise_power + 1e-10))
+
+    return {
+        "noise_floor": float(noise_floor),
+        "steady_noise": float(steady_noise),
+        "hiss_level": float(hiss_level),
+        "snr_estimate": float(snr_estimate),
+        "has_significant_noise": bool(
+            steady_noise > noise_floor * 2 or hiss_level > noise_floor * 1.5
+        ),
+    }
+
+
+def spectral_subtraction(
+    audio: np.ndarray, noise_profile: dict, sample_rate: int
+) -> np.ndarray:
+    """
+    Apply spectral subtraction to remove identified noise.
+
+    Args:
+        audio: Input audio data
+        noise_profile: Noise profile from detect_noise_profile()
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Cleaned audio data
+    """
+    # Compute STFT of audio
+    stft = librosa.stft(audio, n_fft=2048, hop_length=512)
+    magnitude = np.abs(stft)
+    phase = np.angle(stft)
+
+    # Create noise gate based on noise floor
+    noise_gate = np.minimum(magnitude / (noise_profile["noise_floor"] + 1e-10), 1.0)
+
+    # Apply gentle noise reduction
+    reduction_factor = 0.3 if noise_profile["has_significant_noise"] else 0.15
+    cleaned_magnitude = magnitude * (1 - noise_gate * reduction_factor)
+
+    # Reconstruct audio
+    cleaned_stft = cleaned_magnitude * np.exp(1j * phase)
+    cleaned_audio = librosa.istft(cleaned_stft, hop_length=512)
+
+    return cleaned_audio
+
+
+def adaptive_filter(
+    audio: np.ndarray, sample_rate: int, noise_type: str = "general"
+) -> np.ndarray:
+    """
+    Apply adaptive filtering based on noise type.
+
+    Args:
+        audio: Input audio data
+        sample_rate: Sample rate of audio
+        noise_type: Type of noise to address ('general', 'hiss', 'hum', 'background')
+
+    Returns:
+        Filtered audio data
+    """
+    if noise_type == "hiss":
+        # High-pass filter to reduce hiss (above 4kHz)
+        cutoff = 4000
+        b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+        filtered_audio = lfilter(b, a, audio)
+
+    elif noise_type == "hum":
+        # Notch filter for common hum frequencies (50/60 Hz and harmonics)
+        # Apply multiple notch filters
+        filtered_audio = audio.copy()
+        hum_freqs = [50, 60, 100, 120, 180, 240]  # Common power line harmonics
+
+        for freq in hum_freqs:
+            if freq < sample_rate / 2:
+                # Create notch filter
+                b, a = butter(
+                    2,
+                    [freq * 0.9, freq * 1.1],
+                    fs=sample_rate,
+                    btype="bandstop",
+                    output="ba",
+                )
+                filtered_audio = lfilter(b, a, filtered_audio)
+
+    elif noise_type == "background":
+        # Spectral subtraction for background noise
+        noise_profile = detect_noise_profile(audio, sample_rate)
+        filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)
+
+    else:
+        # General broadband noise reduction
+        # Apply gentle low-pass filter
+        cutoff = int(min(8000, sample_rate // 2.5))
+        b, a = butter(4, cutoff, fs=sample_rate, btype="low", output="ba")
+        filtered_audio = lfilter(b, a, audio)
+
+    return filtered_audio
+
+
+def remove_noise(
+    audio_path: str,
+    noise_type: str = "general",
+    sensitivity: float = 0.5,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Remove noise from audio using adaptive filtering and spectral subtraction.
+
+    This function analyzes the audio to detect noise characteristics and applies
+    appropriate noise reduction techniques based on the noise type and sensitivity
+    settings. It supports various noise types including hiss, hum, rumble, and
+    general background noise.
+
+    Args:
+        audio_path: Path to the audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        noise_type: Type of noise to remove ('general', 'hiss', 'hum', 'rumble', 'background')
+            - 'general': Broadband noise reduction
+            - 'hiss': High-frequency noise removal
+            - 'hum': Power line hum removal (50/60 Hz)
+            - 'rumble': Low-frequency rumble removal
+            - 'background': General background noise
+        sensitivity: Noise reduction sensitivity (0.0 to 1.0, default: 0.5)
+            Higher values remove more noise but may affect audio quality
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the cleaned audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the cleaned audio file
+
+    Examples:
+        >>> remove_noise("noisy_recording.wav", "hiss", 0.7, "output", "wav")
+        # Returns 'path/to/noisy_recording_hiss_removed.wav' with reduced hiss
+
+        >>> remove_noise("podcast.mp3", "background", 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_background_removed.mp3' with reduced background noise
+
+    Note:
+        - Higher sensitivity values remove more noise but may affect audio quality
+        - Different noise types use specialized algorithms for optimal results
+        - Processing time varies with audio length and noise complexity
+        - Preserves original audio quality and sample rate
+        - Works with mono or stereo audio files
+    """
+    try:
+        # Load audio
+        audio, sample_rate = _load_audio(audio_path, mono=False)
+
+        # Apply noise reduction based on type and sensitivity
+        if noise_type == "hiss":
+            # High-pass filter for hiss removal
+            cutoff = 4000 - sensitivity * 2000  # 2000-4000 Hz range
+            b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            filtered_audio = filtfilt(b, a, audio)
+
+        elif noise_type == "hum":
+            # Multiple notch filters for harmonics
+            filtered_audio = audio.copy()
+            fundamental_freqs = [50, 60, 100]  # Common power line fundamentals
+
+            for fundamental in fundamental_freqs:
+                if fundamental < sample_rate // 2:
+                    # Filter fundamental and first few harmonics
+                    for harmonic in range(1, 6):
+                        freq = fundamental * harmonic
+                        if freq < sample_rate // 2:
+                            b, a = butter(
+                                2,
+                                [freq * 0.95, freq * 1.05],
+                                fs=sample_rate,
+                                btype="bandstop",
+                                output="ba",
+                            )
+                            filtered_audio = filtfilt(b, a, filtered_audio)
+
+        elif noise_type == "rumble":
+            # High-pass filter for rumble removal
+            cutoff = 20 + sensitivity * 80  # 20-100 Hz range
+            b, a = butter(4, cutoff, fs=sample_rate, btype="high", output="ba")
+            filtered_audio = filtfilt(b, a, audio)
+
+        else:  # background or general
+            # General noise reduction
+            noise_profile = detect_noise_profile(audio, sample_rate)
+            filtered_audio = spectral_subtraction(audio, noise_profile, sample_rate)
+            # Apply based on sensitivity
+            strength = 0.2 + sensitivity * 0.6
+            filtered_audio = (1 - strength) * filtered_audio + strength * audio
+
+        # Normalize output
+        max_val = np.max(np.abs(filtered_audio))
+        if max_val > 0:
+            filtered_audio = filtered_audio / max_val * 0.95
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_noise_removed")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        input_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{input_filename}_{noise_type}_removed.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save processed audio
+        sf.write(output_file, filtered_audio.T, sample_rate)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error removing noise: {str(e)}")
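For the 'hum' branch, remove_noise cascades narrow second-order Butterworth band-stop filters around each power-line harmonic. A minimal, self-contained sketch of that idea on a synthetic signal (the 440 Hz tone and 50 Hz hum below are invented for illustration, not taken from this commit):

# Standalone sketch of the hum-removal idea: cascade narrow band-stop
# Butterworth filters at power-line harmonics. Synthetic input is used so
# the snippet runs without an audio file.
import numpy as np
from scipy.signal import butter, filtfilt

sr = 22050
t = np.linspace(0, 1.0, sr, endpoint=False)
signal = np.sin(2 * np.pi * 440 * t) + 0.3 * np.sin(2 * np.pi * 50 * t)  # tone + 50 Hz hum

cleaned = signal.copy()
for harmonic in range(1, 6):
    freq = 50 * harmonic
    if freq < sr / 2:
        b, a = butter(2, [freq * 0.95, freq * 1.05], fs=sr, btype="bandstop")
        cleaned = filtfilt(b, a, cleaned)

# With a 1-second signal, FFT bin 50 corresponds to 50 Hz.
print("residual 50 Hz magnitude:", abs(np.fft.rfft(cleaned)[50]))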
tools/audio_cutting.py CHANGED

@@ -43,7 +43,6 @@ def cut_audio(
    # Get audio duration
    duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr

-
    if start_time >= end_time:
        raise ValueError(
            f"Start time ({start_time}s) must be less than end time ({end_time}s)"
tools/audio_insertion.py ADDED

@@ -0,0 +1,374 @@
+import os
+import tempfile
+from typing import Optional
+
+import librosa
+import numpy as np
+import soundfile as sf
+
+
+def _load_audio(audio_path: str, mono: bool = False) -> tuple[np.ndarray, int]:
+    """Load audio file with standard settings."""
+    y, sr = librosa.load(audio_path, sr=None, mono=mono, res_type="soxr_vhq")
+    return y, int(sr)
+
+
+def detect_crossfade_point(
+    insert_position: float, audio_duration: float, crossfade_duration: float = 0.1
+) -> tuple[float, float]:
+    """
+    Calculate optimal crossfade points for seamless insertion.
+
+    Args:
+        insert_position: Where to insert the section (in seconds)
+        audio_duration: Total duration of the target audio (in seconds)
+        crossfade_duration: Length of crossfade (in seconds)
+
+    Returns:
+        Tuple of (start_time, end_time) for crossfade region
+    """
+    # Calculate crossfade boundaries
+    fade_start = max(0, insert_position - crossfade_duration / 2)
+    fade_end = min(audio_duration, insert_position + crossfade_duration / 2)
+
+    return fade_start, fade_end
+
+
+def apply_crossfade(
+    section: np.ndarray, target: np.ndarray, crossfade_duration: float, sample_rate: int
+) -> np.ndarray:
+    """
+    Apply crossfade between section and target audio.
+
+    Args:
+        section: Audio section to insert
+        target: Target audio to insert into
+        crossfade_duration: Length of crossfade in seconds
+        sample_rate: Sample rate of audio
+
+    Returns:
+        Target audio with section inserted
+    """
+    # Calculate crossfade samples
+    fade_samples = int(crossfade_duration * sample_rate)
+
+    # Create crossfade envelope
+    fade_in = np.linspace(0, 1, fade_samples)
+    fade_out = np.linspace(1, 0, fade_samples)
+    crossfade = fade_in * fade_out
+
+    # Apply crossfade to section end
+    section_end = section[-fade_samples:] if len(section) > fade_samples else section
+    section_end[:fade_samples] *= crossfade
+
+    # Insert section into target
+    insert_sample = int(len(target) * 0.5)  # Insert at middle
+    result = np.insert(target, insert_sample, section_end, axis=0)
+
+    return result
+
+
+def insert_section(
+    audio_path: str,
+    section_path: str,
+    insert_time: float,
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert a section from one audio track into another at a precise time position.
+
+    This function allows you to insert audio content (like an intro, advertisement,
+    or sound effect) into an existing track at any position with smooth
+    crossfading to avoid audible clicks or abrupt transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        section_path: Path to the audio section to insert (supports common formats: WAV, MP3, FLAC, M4A)
+        insert_time: Position to insert the section (in seconds from start of main audio)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+            Longer crossfades create smoother transitions but reduce clarity
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section inserted
+
+    Examples:
+        >>> insert_section("main_track.wav", "intro.wav", 5.0, 0.2, "output", "wav")
+        # Returns 'path/to/main_with_intro.wav' with intro inserted at 5 seconds
+
+        >>> insert_section("podcast.mp3", "advertisement.mp3", 180.0, 0.5, "output", "mp3")
+        # Returns 'path/to/podcast_with_ad.mp3' with ad inserted at 3 minutes
+
+    Note:
+        - Insert position is measured from the start of the main audio
+        - Crossfade prevents clicks and creates smooth transitions
+        - If insert_time + section duration exceeds main audio duration, section is truncated
+        - Works with mono or stereo audio files
+        - Preserves original audio quality and sample rate
+        - Processing time depends on audio length and crossfade duration
+    """
+    try:
+        # Load both audio files
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        section_audio, section_sr = _load_audio(section_path, mono=False)
+
+        # Resample if needed
+        if main_sr != section_sr:
+            section_audio = librosa.resample(
+                section_audio, orig_sr=section_sr, target_sr=main_sr
+            )
+
+        # Calculate timing
+        main_duration = len(main_audio) / main_sr
+
+        # Validate insert position
+        if insert_time < 0:
+            raise ValueError("Insert time must be positive")
+        if insert_time > main_duration:
+            raise ValueError(
+                f"Insert time ({insert_time}s) exceeds main audio duration ({main_duration}s)"
+            )
+
+        # Calculate crossfade points
+        fade_start, fade_end = detect_crossfade_point(
+            insert_time, main_duration, crossfade_duration
+        )
+
+        # Extract main audio segments
+        main_before = main_audio[: int(fade_start * main_sr)]
+        main_after = main_audio[int(fade_end * main_sr) :]
+
+        # Apply crossfade and insert section
+        result = apply_crossfade(section_audio, main_after, crossfade_duration, main_sr)
+
+        # Combine all parts
+        final_audio = np.concatenate([main_before, result])
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_inserted")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_with_insertion.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, final_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error inserting audio section: {str(e)}")
+
+
+def insert_multiple_sections(
+    audio_path: str,
+    sections: list[tuple[str, float, float]],
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Insert multiple sections into an audio track at specified positions.
+
+    This function allows inserting multiple audio sections (like multiple ads,
+    sound effects, or musical segments) into a main track with smooth
+    transitions between each insertion.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        sections: List of (section_path, insert_time) tuples
+            section_path: Path to audio section to insert
+            insert_time: Position to insert section (in seconds)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with all sections inserted
+
+    Examples:
+        >>> insert_multiple_sections("track.wav", [("intro.wav", 0), ("ad1.wav", 30), ("ad2.wav", 180)], 0.2)
+        # Returns 'path/to/track_with_insertions.wav' with intro at start, ads at 30s and 3min
+
+        >>> insert_multiple_sections("podcast.mp3", [("sponsor.wav", 60)], 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_with_sponsor.wav' with sponsor segment at 1 minute
+
+    Note:
+        - Sections are inserted in chronological order
+        - Each section gets crossfade at both start and end
+        - If sections overlap, later sections take precedence
+        - Total processing time increases with number of sections
+        - Works best with non-overlapping insertion times
+    """
+    try:
+        # Load main audio
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        main_duration = len(main_audio) / main_sr
+        current_audio = main_audio.copy()
+
+        # Sort sections by insert time
+        sorted_sections = sorted(sections, key=lambda x: x[1])
+
+        # Insert each section
+        for section_path, insert_time, _ in sorted_sections:
+            # Load section
+            section_audio, section_sr = _load_audio(section_path, mono=False)
+
+            # Resample if needed
+            if section_sr != main_sr:
+                section_audio = librosa.resample(
+                    section_audio, orig_sr=section_sr, target_sr=main_sr
+                )
+
+            # Calculate crossfade points
+            fade_start, fade_end = detect_crossfade_point(
+                insert_time, main_duration, crossfade_duration
+            )
+
+            # Extract current audio segments
+            current_before = current_audio[: int(fade_start * main_sr)]
+            current_after = current_audio[int(fade_end * main_sr) :]
+
+            # Apply crossfade and insert section
+            section_with_fade = apply_crossfade(
+                section_audio, current_after, crossfade_duration, main_sr
+            )
+
+            # Update current audio
+            current_audio = np.concatenate([current_before, section_with_fade])
+
+            # Update duration for next insertion
+            main_duration = len(current_audio) / main_sr
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_multi_inserted")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_with_multiple_insertions.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, current_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error inserting multiple sections: {str(e)}")
+
+
+def replace_section(
+    audio_path: str,
+    start_time: float,
+    end_time: float,
+    replacement_path: str,
+    crossfade_duration: float = 0.1,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Replace a section of an audio track with another audio segment.
+
+    This function removes a specified time range from the main audio and
+    replaces it with new content, using crossfades for smooth transitions.
+
+    Args:
+        audio_path: Path to the main audio file or URL (supports common formats: WAV, MP3, FLAC, M4A)
+        start_time: Start time of section to replace (in seconds)
+        end_time: End time of section to replace (in seconds)
+        replacement_path: Path to the replacement audio segment (supports common formats: WAV, MP3, FLAC, M4A)
+        crossfade_duration: Length of crossfade in seconds (default: 0.1)
+        output_path: Optional output directory (default: None, uses temp directory)
+        output_format: Output format for the final audio ('wav' or 'mp3', default: 'wav')
+
+    Returns:
+        Path to the audio file with the section replaced
+
+    Examples:
+        >>> replace_section("song.wav", 60.0, 90.0, "new_verse.wav", 0.2, "output", "wav")
+        # Returns 'path/to/song_replaced.wav' with 60-90s section replaced
+
+        >>> replace_section("podcast.mp3", 120.0, 150.0, "correction.wav", 0.3, "output", "mp3")
+        # Returns 'path/to/podcast_replaced.mp3' with 2-minute section replaced
+
+    Note:
+        - Start time must be less than end time
+        - Crossfade prevents clicks at replacement boundaries
+        - Replacement section is trimmed if longer than specified duration
+        - Preserves original audio quality and sample rate
+        - Useful for fixing mistakes, updating content, or adding corrections
+    """
+    try:
+        # Load both audio files
+        main_audio, main_sr = _load_audio(audio_path, mono=False)
+        replacement_audio, replacement_sr = _load_audio(replacement_path, mono=False)
+
+        # Validate timing
+        if start_time >= end_time:
+            raise ValueError("Start time must be less than end time")
+
+        # Convert times to samples
+        start_sample = int(start_time * main_sr)
+        end_sample = int(end_time * main_sr)
+
+        # Extract main audio parts
+        main_before = main_audio[:start_sample]
+        main_after = main_audio[end_sample:]
+
+        # Resample replacement if needed
+        if replacement_sr != main_sr:
+            replacement_audio = librosa.resample(
+                replacement_audio, orig_sr=replacement_sr, target_sr=main_sr
+            )
+
+        # Trim replacement to specified duration
+        replacement_duration = end_time - start_time
+        replacement_samples = int(replacement_duration * main_sr)
+        trimmed_replacement = (
+            replacement_audio[:replacement_samples]
+            if len(replacement_audio) > replacement_samples
+            else replacement_audio
+        )
+
+        # Apply crossfades
+        fade_samples = int(crossfade_duration * main_sr)
+
+        # Fade in replacement
+        fade_in = np.linspace(0, 1, fade_samples)
+        trimmed_replacement[:fade_samples] *= fade_in
+
+        # Fade out at end of replacement
+        fade_out = np.linspace(1, 0, fade_samples)
+        trimmed_replacement[-fade_samples:] *= fade_out
+
+        # Combine all parts
+        final_audio = np.concatenate([main_before, trimmed_replacement, main_after])
+
+        # Save output
+        if output_path is None:
+            output_path = tempfile.mkdtemp(suffix="_replaced")
+        else:
+            os.makedirs(output_path, exist_ok=True)
+
+        # Generate output filename
+        main_filename = os.path.splitext(os.path.basename(audio_path))[0]
+        output_filename = f"{main_filename}_replaced.{output_format}"
+        output_file = os.path.join(output_path, output_filename)
+
+        # Save final audio
+        sf.write(output_file, final_audio.T, main_sr)
+
+        return output_file
+
+    except Exception as e:
+        raise RuntimeError(f"Error replacing audio section: {str(e)}")
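A hypothetical usage sketch of the two insertion helpers added above; the file names are placeholders, and outputs land in temp directories because output_path is left at its default of None.

# Drop a jingle into a podcast 30 s in, then swap the 60-90 s range
# for a corrected take (hypothetical inputs).
from tools.audio_insertion import insert_section, replace_section

with_jingle = insert_section(
    audio_path="podcast.wav",
    section_path="jingle.wav",
    insert_time=30.0,
    crossfade_duration=0.2,
    output_format="wav",
)

corrected = replace_section(
    audio_path=with_jingle,
    start_time=60.0,
    end_time=90.0,
    replacement_path="corrected_take.wav",
    crossfade_duration=0.2,
    output_format="wav",
)
print(corrected)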
tools/combine_tracks.py CHANGED

@@ -305,7 +305,10 @@ def create_medley(

    if output_path is None:
        tmp_dir = tempfile.mkdtemp(prefix="mcp-medley-")
-        output =
+        output = (
+            Path(tmp_dir)
+            / f"{vocals.name}_{instrumental.name}_medley.{medley_extension}"
+        )
    else:
        output = Path(output_path).expanduser().resolve()
    output.parent.mkdir(parents=True, exist_ok=True)
tools/voice_replacement.py ADDED

@@ -0,0 +1,145 @@
+import os
+from datetime import datetime
+from pathlib import Path
+
+from gradio_client import Client, handle_file
+
+from tools.audio_info import validate_audio_path
+
+
+def replace_voice(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Replace voice in source audio with voice from target audio using Seed-VC.
+
+    This function uses the Seed-VC Gradio space to perform voice conversion,
+    replacing the voice characteristics in the source audio with those from
+    the target audio while preserving the linguistic content and timing.
+
+    Args:
+        source_audio_path: Path to the source audio file (voice to be replaced)
+        target_audio_path: Path to the target audio file (voice to use)
+        diffusion_steps: Number of diffusion steps for inference (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: Classifier-free guidance rate (default: 0.7)
+        f0_condition: Whether to use F0 conditioning (default: False)
+        auto_f0_adjust: Whether to auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to the generated voice-replaced audio file
+
+    Raises:
+        FileNotFoundError: If source or target audio files don't exist
+        ValueError: If parameters are invalid
+        RuntimeError: If voice replacement fails
+    """
+    try:
+        # Validate input paths
+        source_abs_path = validate_audio_path(source_audio_path)
+        target_abs_path = validate_audio_path(target_audio_path)
+
+        # Validate parameters
+        if diffusion_steps < 1 or diffusion_steps > 50:
+            raise ValueError("diffusion_steps must be between 1 and 50")
+        if length_adjust <= 0:
+            raise ValueError("length_adjust must be positive")
+        if not 0 <= inference_cfg_rate <= 1:
+            raise ValueError("inference_cfg_rate must be between 0 and 1")
+        if pitch_shift < -12 or pitch_shift > 12:
+            raise ValueError("pitch_shift must be between -12 and 12 semitones")
+
+        # Initialize Seed-VC client
+        client = Client("Plachta/Seed-VC")
+
+        # Perform voice replacement
+        result = client.predict(
+            source_audio_path=handle_file(source_abs_path),
+            target_audio_path=handle_file(target_abs_path),
+            diffusion_steps=diffusion_steps,
+            length_adjust=length_adjust,
+            inference_cfg_rate=inference_cfg_rate,
+            f0_condition=f0_condition,
+            auto_f0_adjust=auto_f0_adjust,
+            pitch_shift=pitch_shift,
+            api_name="/predict_1",
+        )
+
+        # Create output directory
+        output_dir = Path("output")
+        output_dir.mkdir(exist_ok=True)
+
+        # Generate output filename with timestamp
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        source_name = Path(source_abs_path).stem
+        target_name = Path(target_abs_path).stem
+        output_filename = (
+            f"{source_name}_voice_replaced_by_{target_name}_{timestamp}.wav"
+        )
+        output_path = output_dir / output_filename
+
+        # Save the result
+        if isinstance(result, str) and os.path.exists(result):
+            # If result is a file path, copy it to output location
+            import shutil
+
+            shutil.copy2(result, output_path)
+        else:
+            # If result is audio data, save it using soundfile
+            import soundfile as sf
+
+            sf.write(str(output_path), result, 22050)
+
+        return str(output_path)
+
+    except Exception as e:
+        raise RuntimeError(f"Voice replacement failed: {str(e)}")
+
+
+def replace_voice_wrapper(
+    source_audio_path: str,
+    target_audio_path: str,
+    diffusion_steps: int = 10,
+    length_adjust: float = 1.0,
+    inference_cfg_rate: float = 0.7,
+    f0_condition: bool = False,
+    auto_f0_adjust: bool = True,
+    pitch_shift: int = 0,
+) -> str:
+    """
+    Wrapper function for voice replacement with error handling for MCP integration.
+
+    Args:
+        source_audio_path: Path to the source audio file
+        target_audio_path: Path to the target audio file
+        diffusion_steps: Number of diffusion steps (default: 10)
+        length_adjust: Length adjustment factor (default: 1.0)
+        inference_cfg_rate: CFG rate (default: 0.7)
+        f0_condition: Use F0 conditioning (default: False)
+        auto_f0_adjust: Auto-adjust F0 (default: True)
+        pitch_shift: Pitch shift in semitones (default: 0)
+
+    Returns:
+        Path to generated audio file or error message
+    """
+    try:
+        return replace_voice(
+            source_audio_path=source_audio_path,
+            target_audio_path=target_audio_path,
+            diffusion_steps=diffusion_steps,
+            length_adjust=length_adjust,
+            inference_cfg_rate=inference_cfg_rate,
+            f0_condition=f0_condition,
+            auto_f0_adjust=auto_f0_adjust,
+            pitch_shift=pitch_shift,
+        )
+    except Exception as e:
+        return f"Error: {str(e)}"
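A hypothetical call to the MCP-facing wrapper; the audio paths are placeholders. Unlike replace_voice, the wrapper returns an "Error: ..." string instead of raising, so callers should check the result before treating it as a file path.

# Sketch of using the wrapper directly (placeholder inputs).
from tools.voice_replacement import replace_voice_wrapper

result = replace_voice_wrapper(
    source_audio_path="speech.wav",
    target_audio_path="reference_voice.wav",
    diffusion_steps=15,
    pitch_shift=2,
)
if result.startswith("Error:"):
    print(result)
else:
    print(f"Voice-converted file written to {result}")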