Audio Processing with Python

Processing wave files and implementing FIR filters

Introduction

The weather was bad today, so I stayed at home and decided to learn something new. I remember studying digital filters at university, and it was somewhat boring; at the time it was not so easy to create useful and practical examples.

Fortunately, nowadays Python exists, and it's really easy to play with sound processing, as can be seen on this page.

Most of the code on this page was put together from snippets found on the internet.

Processing wave files and plotting spectrograms

import numpy as np
import matplotlib.pyplot as plt
import wave
from scipy.io import wavfile
import contextlib


# from http://stackoverflow.com/questions/2226853/interpreting-wav-data/2227174#2227174
def interpret_wav(raw_bytes, n_frames, n_channels, sample_width, interleaved = True):

    if sample_width == 1:
        dtype = np.uint8 # unsigned char
    elif sample_width == 2:
        dtype = np.int16 # signed 2-byte short
    else:
        raise ValueError("Only supports 8 and 16 bit audio formats.")

    channels = np.frombuffer(raw_bytes, dtype=dtype)

    if interleaved:
        # channels are interleaved, i.e. sample N of channel M follows sample N of channel M-1 in raw data
        channels.shape = (n_frames, n_channels)
        channels = channels.T
    else:
        # channels are not interleaved. All samples from channel M occur before all samples from channel M-1
        channels.shape = (n_channels, n_frames)

    return channels
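
As a quick illustration (this check is not from the original notebook), interleaved samples L0, R0, L1, R1, ... come back from interpret_wav as one row per channel:

# Hypothetical sanity check with two interleaved int16 channels.
raw = np.array([0, 100, 1, 101, 2, 102], dtype=np.int16).tobytes()
print(interpret_wav(raw, n_frames=3, n_channels=2, sample_width=2))
# [[  0   1   2]
#  [100 101 102]]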

def get_start_end_frames(nFrames, sampleRate, tStart=None, tEnd=None):

    # Convert the requested time window from seconds to frames,
    # clamping it to the bounds of the file.
    if tStart and tStart*sampleRate < nFrames:
        start = int(tStart*sampleRate)
    else:
        start = 0

    if tEnd and tEnd*sampleRate < nFrames and tEnd*sampleRate > start:
        end = int(tEnd*sampleRate)
    else:
        end = nFrames

    return (start, end, end-start)

def extract_audio(fname, tStart=None, tEnd=None):
    with contextlib.closing(wave.open(fname,'rb')) as spf:
        sampleRate = spf.getframerate()
        ampWidth = spf.getsampwidth()
        nChannels = spf.getnchannels()
        nFrames = spf.getnframes()

        startFrame, endFrame, segFrames = get_start_end_frames(nFrames, sampleRate, tStart, tEnd)

        # Extract the raw audio from the (possibly multi-channel) wav file;
        # contextlib.closing takes care of closing the file afterwards.
        spf.setpos(startFrame)
        sig = spf.readframes(segFrames)

        channels = interpret_wav(sig, segFrames, nChannels, ampWidth, True)

        return (channels, nChannels, sampleRate, ampWidth, nFrames)

def convert_to_mono(channels, nChannels, outputType):
    if nChannels == 2:
        samples = np.mean(np.array([channels[0], channels[1]]), axis=0)  # Convert to mono
    else:
        samples = channels[0]

    return samples.astype(outputType)

def plot_specgram(samples, sampleRate, tStart=None, tEnd=None):
    plt.figure(figsize=(20,10))
    plt.specgram(samples, Fs=sampleRate, NFFT=1024, noverlap=192, cmap='nipy_spectral', xextent=(tStart,tEnd))
    plt.ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')
    plt.show()

def plot_audio_samples(title, samples, sampleRate, tStart=None, tEnd=None):
    if not tStart:
        tStart = 0

    if not tEnd or tStart>tEnd:
        tEnd = len(samples)/sampleRate

    f, axarr = plt.subplots(2, sharex=True, figsize=(20,10))
    axarr[0].set_title(title)
    axarr[0].plot(np.linspace(tStart, tEnd, len(samples)), samples)
    axarr[1].specgram(samples, Fs=sampleRate, NFFT=1024, noverlap=192, cmap='nipy_spectral', xextent=(tStart,tEnd))

    axarr[0].set_ylabel('Amplitude')
    axarr[1].set_ylabel('Frequency [Hz]')
    plt.xlabel('Time [sec]')

    plt.show()

tStart=0
tEnd=20

channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)

plot_audio_samples("Sultans of Swing - First 20s", samples, sampleRate, tStart, tEnd)

wavfile.write('sultans_20s.wav', sampleRate, samples)
!ffmpeg -y -loglevel panic -i sultans_20s.wav sultans_20s.mp3
Figure 1: Waveform and spectrogram of "Sultans of Swing", first 20 seconds, showing amplitude over time and frequency distribution.

Processed audio: sultans_20s.mp3

Spectrograms: Example 2

tStart=0
tEnd=20

channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('about.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)

plot_audio_samples("About a Girl - First 20s", samples, sampleRate, tStart, tEnd)

wavfile.write('about_20s.wav', sampleRate, samples)
!ffmpeg -y -loglevel panic -i about_20s.wav about_20s.mp3
Figure 2: Waveform and spectrogram of "About a Girl", first 20 seconds; the characteristic whistle components are visible in the frequency domain.

Processed audio: about_20s.mp3

Filtering the whistle in “About a Girl” intro

Now let's use a digital filter to extract the whistle between seconds 13 and 15 of the "About a Girl" intro. In this case a band-pass FIR filter is used. Low-pass, high-pass, and band-reject filters are also implemented, as they will be used later.

def fir_high_pass(samples, fs, fH, N, outputType):
    # Reference: https://fiiir.com

    fH = fH / fs

    # Compute sinc filter.
    h = np.sinc(2 * fH * (np.arange(N) - (N - 1) / 2.))
    # Apply window.
    h *= np.hamming(N)
    # Normalize to get unity gain.
    h /= np.sum(h)
    # Create a high-pass filter from the low-pass filter through spectral inversion.
    h = -h
    h[int((N - 1) / 2)] += 1
    # Apply the filter to the signal by convolution.
    s = np.convolve(samples, h).astype(outputType)
    return s


def fir_low_pass(samples, fs, fL, N, outputType):
    # Reference: https://fiiir.com

    fL = fL / fs

    # Compute sinc filter.
    h = np.sinc(2 * fL * (np.arange(N) - (N - 1) / 2.))
    # Apply window.
    h *= np.hamming(N)
    # Normalize to get unity gain.
    h /= np.sum(h)
    # Apply the filter to the signal by convolution.
    s = np.convolve(samples, h).astype(outputType)
    return s

def fir_band_reject(samples, fs, fL, fH, NL, NH, outputType):
    # Reference: https://fiiir.com

    fH = fH / fs
    fL = fL / fs

    # Compute a low-pass filter with cutoff frequency fL.
    hlpf = np.sinc(2 * fL * (np.arange(NL) - (NL - 1) / 2.))
    hlpf *= np.blackman(NL)
    hlpf /= np.sum(hlpf)
    # Compute a high-pass filter with cutoff frequency fH.
    hhpf = np.sinc(2 * fH * (np.arange(NH) - (NH - 1) / 2.))
    hhpf *= np.blackman(NH)
    hhpf /= np.sum(hhpf)
    hhpf = -hhpf
    hhpf[int((NH - 1) / 2)] += 1
    # Add both filters.
    if NH >= NL:
        h = hhpf
        h[int((NH - NL) / 2) : int((NH - NL) / 2 + NL)] += hlpf
    else:
        h = hlpf
        h[int((NL - NH) / 2) : int((NL - NH) / 2 + NH)] += hhpf
    # Apply the filter to the signal by convolution.
    s = np.convolve(samples, h).astype(outputType)

    return s

def fir_band_pass(samples, fs, fL, fH, NL, NH, outputType):
    # Reference: https://fiiir.com

    fH = fH / fs
    fL = fL / fs

    # Compute a low-pass filter with cutoff frequency fH.
    hlpf = np.sinc(2 * fH * (np.arange(NH) - (NH - 1) / 2.))
    hlpf *= np.blackman(NH)
    hlpf /= np.sum(hlpf)
    # Compute a high-pass filter with cutoff frequency fL.
    hhpf = np.sinc(2 * fL * (np.arange(NL) - (NL - 1) / 2.))
    hhpf *= np.blackman(NL)
    hhpf /= np.sum(hhpf)
    hhpf = -hhpf
    hhpf[int((NL - 1) / 2)] += 1
    # Convolve both filters.
    h = np.convolve(hlpf, hhpf)
    # Apply the filter to the signal by convolution.
    s = np.convolve(samples, h).astype(outputType)

    return s
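
Before applying a kernel, it helps to look at its frequency response. The following sketch is not part of the original notebook: it rebuilds the 2400-2900 Hz band-pass kernel used below, assuming a 44100 Hz sample rate, and plots its gain using scipy.signal.freqz.

from scipy import signal

# Assumed parameters: CD-quality sample rate, same kernel as used below.
fs, fL, fH, N = 44100, 2400, 2900, 461

# Windowed-sinc low-pass at fH and high-pass at fL, convolved into a band-pass,
# exactly as in fir_band_pass above.
hlpf = np.sinc(2 * (fH / fs) * (np.arange(N) - (N - 1) / 2.))
hlpf *= np.blackman(N)
hlpf /= np.sum(hlpf)
hhpf = np.sinc(2 * (fL / fs) * (np.arange(N) - (N - 1) / 2.))
hhpf *= np.blackman(N)
hhpf /= np.sum(hhpf)
hhpf = -hhpf
hhpf[(N - 1) // 2] += 1
h = np.convolve(hlpf, hhpf)

# Frequency response; convert normalized frequency (rad/sample) back to Hz.
w, H = signal.freqz(h, worN=8192)
plt.plot(w * fs / (2 * np.pi), 20 * np.log10(np.abs(H) + 1e-12))
plt.xlabel('Frequency [Hz]')
plt.ylabel('Gain [dB]')
plt.show()

The plot should show a flat passband between roughly 2400 and 2900 Hz with steep skirts on both sides, which is what the long 461-tap kernels buy.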

tStart = 12
tEnd = 15

channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('about.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)

plot_audio_samples("About a Girl section - Before Filtering", samples, sampleRate, tStart, tEnd)

wavfile.write('about_original.wav', sampleRate, samples)
!ffmpeg -y -loglevel panic -i about_original.wav about_original.mp3
Figure 3: "About a Girl" segment before filtering; whistle components are visible around 2400-2900 Hz, 5000 Hz, and 7500 Hz in the spectrogram.

Before filtering: about_original.mp3

It is possible to see the whistle in the spectrogram. There are three components, in the following bands:

  • 2400 to 2900 Hz
  • around 5000 Hz
  • around 7500 Hz

The predominant sound, though, is contained in the first band, and that's the one we will try to isolate.

samples_filtered = fir_band_pass(samples, sampleRate, 2400, 2900, 461, 461, np.int16)
# Amplify the result, casting up first so the int16 samples do not wrap around.
samples_filtered = np.clip(samples_filtered.astype(np.int32) * 2, -32768, 32767).astype(np.int16)
plot_audio_samples("About a Girl section - After Filtering", samples_filtered, sampleRate, tStart, tEnd)

wavfile.write('about_whistle.wav', sampleRate, samples_filtered)
!ffmpeg -y -loglevel panic -i about_whistle.wav about_whistle.mp3
Figure 4: "About a Girl" segment after FIR band-pass filtering (2400-2900 Hz); the isolated whistle component, amplified.

After filtering: about_whistle.mp3

The result is not perfect, but it gets the point across.
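
The band-reject filter defined earlier can be used for the opposite experiment: suppressing the whistle band while keeping the rest of the segment. A minimal sketch under the same assumptions (this is not in the original notebook):

# Hypothetical inverse experiment: reject the 2400-2900 Hz whistle band.
samples_no_whistle = fir_band_reject(samples, sampleRate, 2400, 2900, 461, 461, np.int16)
plot_audio_samples("About a Girl section - Whistle Suppressed", samples_no_whistle, sampleRate, tStart, tEnd)
wavfile.write('about_no_whistle.wav', sampleRate, samples_no_whistle)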

Removing voice from song: attempt 1

tStart = 0
tEnd = 20

channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
samples = convert_to_mono(channels, nChannels, np.int16)

plot_audio_samples("Sultans of Swing - Before Filtering", samples, sampleRate, tStart, tEnd)

wavfile.write('sultans_original.wav', sampleRate, samples)
!ffmpeg -y -loglevel panic -i sultans_original.wav sultans_original.mp3
Figure 5: "Sultans of Swing" original audio before voice removal; the complete frequency spectrum, including the vocal range.

Before filtering: sultans_original.mp3

In this attempt, a pair of very sharp filters is used to remove the frequencies associated with the voice. After some tuning, the cutoff frequencies were chosen to be around 300 Hz for the low-pass filter and 6600 Hz for the high-pass filter, and two passes of each filter were used.

lp_samples_filtered = fir_low_pass(samples, sampleRate, 300, 461, np.int16)               # First pass
lp_samples_filtered = fir_low_pass(lp_samples_filtered, sampleRate, 250, 461, np.int16)   # Second pass

hp_samples_filtered = fir_high_pass(samples, sampleRate, 6600, 461, np.int16)             # First pass
hp_samples_filtered = fir_high_pass(hp_samples_filtered, sampleRate, 6600, 461, np.int16) # Second pass

samples_filtered = np.mean(np.array([lp_samples_filtered, hp_samples_filtered]), axis=0).astype(np.int16)

plot_audio_samples("Sultans of Swing - After Filtering 1", samples_filtered, sampleRate, tStart, tEnd)

wavfile.write('sultans_novoice1.wav', sampleRate, samples_filtered)
!ffmpeg -y -loglevel panic -i sultans_novoice1.wav sultans_novoice1.mp3
Figure 6: "Sultans of Swing" after voice removal attempt 1, using two passes of low-pass (300 Hz) and high-pass (6600 Hz) FIR filters to suppress the vocal frequencies.

After filtering: sultans_novoice1.mp3

The resulting sound is not very natural, but the voice was filtered out!

Removing voice from song: attempt 2

Apparently, a widely used technique to remove the voice from a song is to combine the two stereo channels by subtracting one from the other. Since the voice is usually mixed almost identically into both channels, subtracting them cancels it.
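
A toy illustration of why this works (an assumption-laden example, not from the original): anything mixed identically into both channels cancels, while side-panned material survives.

# Toy illustration: a center-panned "voice" appears identically in both
# channels, while side-panned instruments differ between them.
voice = np.array([10, -20, 30])        # identical in left and right
side_l = np.array([5, 5, 5])           # instrument panned left
side_r = np.array([-3, 0, 3])          # instrument panned right
left, right = voice + side_l, voice + side_r
print(left - right)                    # the voice cancels: [8 5 2]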

channels, nChannels, sampleRate, ampWidth, nFrames = extract_audio('sultans.wav', tStart, tEnd)
# Subtract in a wider type to avoid int16 wrap-around, then clip back to range.
samples_no_voice = np.clip(channels[0].astype(np.int32) - channels[1].astype(np.int32), -32768, 32767).astype(np.int16)
plot_audio_samples("Sultans of Swing - After Filtering 2", samples_no_voice, sampleRate, tStart, tEnd)

wavfile.write('sultans_novoice2.wav', sampleRate, samples_no_voice)
!ffmpeg -y -loglevel panic -i sultans_novoice2.wav sultans_novoice2.mp3
Figure 7: "Sultans of Swing" after voice removal attempt 2, using stereo channel subtraction: left minus right cancels center-panned vocals.

After filtering: sultans_novoice2.mp3

I really like the result, because a lot of reverb and echo survives the subtraction.

Removing voice from song: mixing attempts

In this third attempt, the outputs of attempts #1 and #2 are mixed together.

lp_samples_filtered.resize(samples_no_voice.shape)
hp_samples_filtered.resize(samples_no_voice.shape)

# Average the three signals in a wider type to avoid int16 overflow.
samples_mixed = ((samples_no_voice.astype(np.int32) + lp_samples_filtered + hp_samples_filtered) / 3).astype(np.int16)

plot_audio_samples("Sultans of Swing - After Filtering 1+2", samples_mixed, sampleRate, tStart, tEnd)
wavfile.write('sultans_novoice3.wav', sampleRate, samples_mixed)
!ffmpeg -y -loglevel panic -i sultans_novoice3.wav sultans_novoice3.mp3
Figure 8: "Sultans of Swing" after combining both voice-removal methods, mixing frequency filtering with stereo channel subtraction.

After filtering: sultans_novoice3.mp3

The result sounds almost the same as attempt #2.

Made with Jupyter Notebooks.

Software             Version
Python               3.6.4 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython              6.2.1
OS                   Darwin 17.4.0 x86_64 i386 64bit
scipy                1.0.0
numpy                1.14.0
matplotlib           2.1.2
version_information  1.0.3