A custom loss function is usually not needed for genre classification.
A combined model over a song split into multiple prediction windows can be set up with Multiple Instance Learning (MIL).
MIL is a supervised learning approach where the label is given not for each independent sample (instance), but for a "bag" (an unordered set) of instances.
In your case, each instance is a 5-second window of MFCC features, and the bag is the entire song.
In Keras we use a TimeDistributed layer to apply our window model to all windows.
Then we combine the results using GlobalAveragePooling1D, effectively implementing mean voting across the windows. This is more easily differentiable than majority voting.
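To illustrate the difference, here is a small toy sketch (the probabilities are made up, not produced by the model below) of mean voting versus majority voting over per-window predictions:

import numpy

# Hypothetical per-window class probabilities for one song (3 windows, 3 genres)
window_probs = numpy.array([
    [0.6, 0.3, 0.1],
    [0.4, 0.5, 0.1],
    [0.7, 0.2, 0.1],
])

# Mean voting: average the probabilities across windows, then take the argmax.
# The averaging is what GlobalAveragePooling1D does inside the song model,
# and it is differentiable, so gradients reach every window.
mean_vote = window_probs.mean(axis=0).argmax()    # -> 0

# Majority voting: hard argmax per window, then the most common class.
# The per-window argmax is a hard decision, so gradients cannot flow through it.
votes = window_probs.argmax(axis=1)               # -> [0, 1, 0]
majority_vote = numpy.bincount(votes).argmax()    # -> 0

Both agree here, but only the mean-voting formulation can be trained end to end with backpropagation.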
Below is a runnable example:
import math
import keras
import librosa
import pandas
import numpy
import sklearn
def window_model(n_bands, n_frames, n_classes, hidden=32):
    from keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D

    out_units = 1 if n_classes == 2 else n_classes
    out_activation = 'sigmoid' if n_classes == 2 else 'softmax'

    shape = (n_bands, n_frames, 1)

    # Basic CNN model
    # An MLP could also be used, but may need to reshape on input and output
    model = keras.Sequential([
        Conv2D(16, (3,3), input_shape=shape),
        MaxPooling2D((2,3)),
        Conv2D(16, (3,3)),
        MaxPooling2D((2,2)),
        Flatten(),
        Dense(hidden, activation='relu'),
        Dense(hidden, activation='relu'),
        Dense(out_units, activation=out_activation),
    ])
    return model
def song_model(n_bands, n_frames, n_windows, n_classes=3):
    from keras.layers import Input, TimeDistributed, GlobalAveragePooling1D

    # Create the frame-wise model, will be reused across all frames
    base = window_model(n_bands, n_frames, n_classes)
    # GlobalAveragePooling1D expects a 'channel' dimension at end
    shape = (n_windows, n_bands, n_frames, 1)

    print('Frame model')
    base.summary()

    model = keras.Sequential([
        TimeDistributed(base, input_shape=shape),
        GlobalAveragePooling1D(),
    ])

    print('Song model')
    model.summary()

    model.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=['acc'])
    return model
def extract_features(path, sample_rate, n_bands, hop_length, n_frames, window_length, song_length):
    # melspectrogram might perform better with CNNs
    from librosa.feature import mfcc

    # Load a fixed length section of sound
    # Might need to pad if some songs are too short
    y, sr = librosa.load(path, sr=sample_rate, offset=0, duration=song_length)
    assert sr == sample_rate, sr
    _song_length = len(y)/sample_rate
    assert _song_length == song_length, _song_length

    # Split into windows
    window_samples = int(sample_rate * window_length)
    window_hop = window_samples//2 # use 50% overlap
    windows = librosa.util.frame(y, frame_length=window_samples, hop_length=window_hop)

    # Calculate features for each window
    features = []
    for w in range(windows.shape[1]):
        win = windows[:, w]
        f = mfcc(y=win, sr=sample_rate, n_mfcc=n_bands,
                 hop_length=hop_length, n_fft=2*hop_length)
        f = numpy.expand_dims(f, -1) # add channels dimension
        features.append(f)

    features = numpy.stack(features)
    return features
def main():
    # Settings for our model
    n_bands = 13 # MFCCs
    sample_rate = 22050
    hop_length = 512
    window_length = 5.0
    song_length_max = 1.0*60
    n_frames = math.ceil(window_length / (hop_length/sample_rate))
    n_windows = math.floor(song_length_max / (window_length/2))-1

    model = song_model(n_bands, n_frames, n_windows)

    # Generate some example data
    ex = librosa.util.example_audio_file()
    examples = 8
    numpy.random.seed(2)
    songs = pandas.DataFrame({
        'path': [ex] * examples,
        'genre': numpy.random.choice([ 'rock', 'metal', 'blues' ], size=examples),
    })
    assert len(songs.genre.unique()) == 3 # all genres should be represented
    print('Song data')
    print(songs)

    def get_features(path):
        f = extract_features(path, sample_rate, n_bands,
                             hop_length, n_frames, window_length, song_length_max)
        return f

    from sklearn.preprocessing import LabelBinarizer
    binarizer = LabelBinarizer()
    y = binarizer.fit_transform(songs.genre.values)
    print('y', y.shape, y)

    features = numpy.stack([ get_features(p) for p in songs.path ])
    print('features', features.shape)

    model.fit(features, y)

if __name__ == '__main__':
    main()
The example outputs the inner and combined model summaries:
Frame model
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d_1 (Conv2D) (None, 11, 214, 16) 160
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 71, 16) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 3, 69, 16) 2320
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 1, 34, 16) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 544) 0
_________________________________________________________________
dense_1 (Dense) (None, 32) 17440
_________________________________________________________________
dense_2 (Dense) (None, 32) 1056
_________________________________________________________________
dense_3 (Dense) (None, 3) 99
=================================================================
Total params: 21,075
Trainable params: 21,075
Non-trainable params: 0
_________________________________________________________________
Song model
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
time_distributed_1 (TimeDist (None, 23, 3) 21075
_________________________________________________________________
global_average_pooling1d_1 ( (None, 3) 0
=================================================================
Total params: 21,075
Trainable params: 21,075
Non-trainable params: 0
_________________________________________________________________
And the shape of the feature vector fed to the model:
features (8, 23, 13, 216, 1)
8 songs, 23 windows each, with 13 MFCC bands, 216 frames in each window.
And a fifth dimension of size 1, the channels dimension that the Conv2D layers expect.
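For reference, the 23 windows and 216 frames follow directly from the settings in main(); recomputing them by hand:

import math

sample_rate = 22050
hop_length = 512
window_length = 5.0      # seconds per analysis window
song_length_max = 60.0   # seconds of audio used per song

# One MFCC frame every hop_length samples -> frames per 5 second window
n_frames = math.ceil(window_length / (hop_length/sample_rate))    # ceil(215.3...) = 216

# 5 second windows with 50% (2.5 second) overlap over 60 seconds
n_windows = math.floor(song_length_max / (window_length/2)) - 1   # 24 - 1 = 23

At prediction time, model.predict(features) returns an (8, 3) array of averaged class probabilities; taking the argmax of each row and indexing into binarizer.classes_ gives back the genre names.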