-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathAudioRNN.py
More file actions
340 lines (281 loc) · 14.2 KB
/
AudioRNN.py
File metadata and controls
340 lines (281 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import numpy as np
import soundfile as sf
import librosa
import argparse
import pickle
import h5py
from tqdm import tqdm
import glob, os, time, math
from scipy import signal
import AudioRNNData as DataGen
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Keras packages model
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, CuDNNGRU, CuDNNLSTM, Dropout, BatchNormalization, concatenate
from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, EarlyStopping #, CoolDown
from keras.optimizers import Adam
from keras.utils import to_categorical
# Pre/de-emphasis filter coefficient shared by pre_process() and post_process().
EMPHASISCOEFF = -0.95
class SaveAudioCallback(Callback):
    """Keras callback that periodically renders audio from the training model.

    Every ``ckpt_freq`` epochs it generates ``gen_length`` seconds of audio,
    seeded with ``audio_context``, and writes it to ``output/ckpt_<ts>.wav``.
    """

    def __init__(self, ckpt_freq, gen_length, sample_rate, time_steps, audio_context, batch_size):
        super(SaveAudioCallback, self).__init__()
        self.ckpt_freq = ckpt_freq        # epochs between audio checkpoints
        self.gen_length = gen_length      # seconds of audio to generate
        self.sample_rate = sample_rate
        self.time_steps = time_steps
        # Fixed: original assigned self.audio_context twice (redundant duplicate).
        self.audio_context = audio_context
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs=None):
        # logs defaults to None (the original used a mutable {} default).
        # epoch is 0-based, so +1 makes "every ckpt_freq epochs" intuitive.
        if (epoch + 1) % self.ckpt_freq == 0:
            ts = str(int(time.time()))
            audio_file = os.path.join('output/', 'ckpt_' + ts + '.wav')
            audio = generate_audio(self.model, self.gen_length, self.sample_rate,
                                   self.time_steps, self.audio_context, self.batch_size)
            write_audio(post_process(audio).astype('int16'), audio_file, self.sample_rate)
def write_audio(samples, file_name, sample_rate):
    """Write `samples` to `file_name` as a 16-bit PCM audio file via soundfile."""
    sf.write(file_name, samples, sample_rate, subtype='PCM_16')
    print('Audio saved to disk.')
def load_data(datadir, sample_rate):
    """Recursively load every .wav under `datadir`, resampled to `sample_rate`.

    Returns a single 1-D array with all files concatenated in glob order
    (np.append flattens, so multi-channel audio is flattened the same way here).
    """
    chunks = []
    for waveFile in glob.glob(os.path.join(datadir, '**', '*.wav'), recursive=True):
        print('loading', waveFile)
        data, sr = sf.read(waveFile, dtype='float32')
        if sr != sample_rate:
            # librosa expects channels-first; soundfile returns frames-first.
            data = data.T
            # NOTE(review): positional args match librosa<0.10; newer versions
            # require librosa.resample(data, orig_sr=sr, target_sr=sample_rate).
            data = librosa.resample(data, sr, sample_rate)
        chunks.append(np.ravel(data))
    if not chunks:
        return np.array([], dtype='float32')
    # Concatenate once instead of np.append per file — the original reallocated
    # and copied the whole buffer on every file (O(n^2) total copying).
    return np.concatenate(chunks)
def pre_process(samples):
    """Apply the pre-emphasis FIR filter to raw audio samples.

    Implements y[n] = x[n] + EMPHASISCOEFF * x[n-1]; undone by post_process().
    """
    print('Pre-Processing data')
    return signal.lfilter([1, EMPHASISCOEFF], [1], samples)
def post_process(samples):
    """Apply the de-emphasis IIR filter, inverting pre_process()."""
    return signal.lfilter([1], [1, EMPHASISCOEFF], samples)
def standardize(data):
    """Min-max scale `data` onto the range [0, 1]."""
    lo = data.min()
    hi = data.max()
    return (data - lo) / (hi - lo)
def from_ulaw(samples):
    """Decode 8-bit mu-law codes (0..255) back into 16-bit-range floats.

    Inverse of to_ulaw(); returns a float64 array scaled to +/- 2**15.
    """
    print('Decoding mu-law samples')
    codes = np.asarray(samples, dtype=np.float64)
    # Map code 0..255 back onto [-1, 1].
    ampl_8 = (codes / 255.0 - 0.5) * 2.0
    # Inverse mu-law expansion (mu = 256), scaled to 16-bit amplitude.
    # Vectorized (no per-sample Python loop), and np.float — an alias removed
    # in NumPy 1.24, which made the original crash — replaced by float64.
    return np.sign(ampl_8) * (1 / 256.0) * ((1 + 256.0) ** np.abs(ampl_8) - 1) * 2 ** 15
def to_ulaw(samples):
    """Mu-law compress floats in [-1, 1] into 8-bit codes (0..255).

    Inverse of from_ulaw(); returns a uint8 array.
    """
    print('Encoding mu-law samples')
    s = np.asarray(samples, dtype=np.float64)
    # Mu-law compression (mu = 256), mapped from [-1, 1] onto [0, 255].
    # Vectorized — the original looped over samples in Python.
    enc = (np.sign(s) * (np.log(1 + 256 * np.abs(s)) / np.log(1 + 256)) + 1) / 2.0 * 255
    # The original's int() truncated toward zero; enc is always >= 0 here,
    # so astype(uint8) truncates identically.
    return enc.astype(np.uint8)
def scale_data(samples):
    """Shift and scale `samples` onto the range [-1, 1]."""
    shifted = samples - samples.min()
    unit = shifted / (shifted.max() - shifted.min())
    return (unit - 0.5) * 2
def model(batch_size, time_steps, num_neurons):
    """Build and compile the AudioRNN network.

    A Dense front end followed by two GRU layers, each receiving a skip
    connection from the raw input, ending in a 256-way softmax over 8-bit
    mu-law sample classes.
    """
    x_in = Input(batch_shape=(batch_size, time_steps, 1))
    x = Dense(num_neurons, activation='relu')(x_in)
    x = BatchNormalization()(x)
    # Skip connection: raw input is concatenated with the dense features.
    rnn_in1 = concatenate([x_in, x])
    x = CuDNNGRU(num_neurons, return_sequences=True, stateful=False)(rnn_in1)
    x = BatchNormalization()(x)
    rnn_in2 = concatenate([x_in, x])
    #x = Dropout(0.4)(x)
    x = CuDNNGRU(num_neurons, return_sequences=False, stateful=False)(rnn_in2)
    x = BatchNormalization()(x)
    #x = Dropout(0.4)(x)
    # One class per 8-bit mu-law code.
    x = Dense(256, activation='softmax')(x)
    # Local renamed from 'model' — it shadowed this function's own name.
    net = Model(inputs=[x_in], outputs=[x])
    # BUG FIX: targets are one-hot vectors (create_dataset uses to_categorical
    # and main slices y_train as 2-D), so the loss must be
    # categorical_crossentropy, not sparse_categorical_crossentropy.
    net.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return net
def create_dataset(data, time_steps, time_shift):
    """Slice `data` into overlapping frames plus one-hot next-sample targets.

    Returns (X, Y): X is float32 with shape (examples, time_steps, 1); Y is
    the one-hot (256-class) mu-law code of the sample right after each frame.
    Frames start every `time_shift` samples.
    """
    print('Creating Dataset')
    inputs = []
    targets = []
    for start in range(0, len(data) - time_steps - 1, time_shift):
        end = start + time_steps
        # Input frame, shaped (time_steps, 1) for the RNN.
        inputs.append(data[start:end].reshape(time_steps, 1))
        # Target is the mu-law encoded sample immediately following the frame.
        targets.append(data[end])
    return np.array(inputs, dtype='float32'), to_categorical(targets, num_classes=256)
def load_from_HDF5(data_file, num_examples, start_example=0):
    """Read a slice of the training set from an HDF5 file.

    Returns (inputs as float32, targets as uint8) covering rows
    [start_example, start_example + num_examples).
    """
    print("Loading data from HDF5 file.")
    stop = start_example + num_examples
    with h5py.File(data_file, 'r') as hf:
        inputs = hf['x_train'][start_example:stop]
        targets = hf['y_train'][start_example:stop]
    return inputs.astype('float32'), targets.astype('uint8')
def load_audio_from_HDF5(data_file):
    """Load the full pre-processed audio array from the 'AudioRNNData' dataset."""
    print("Loading audio data from HDF5 file.")
    with h5py.File(data_file, 'r') as hf:
        samples = np.array(hf['AudioRNNData'])
    return samples
def save_audio_to_HDF5(data, data_file):
    """Write the pre-processed audio array to `data_file`, overwriting it."""
    print("Saving audio data to HDF5 file.")
    with h5py.File(data_file, 'w') as out:
        out.create_dataset('AudioRNNData', data=data)
def generate_audio(AudioRNN, gen_length, sample_rate, time_steps, audio_prompt, batch_size):
    """Autoregressively generate `gen_length` seconds of audio from the model.

    `audio_prompt` seeds the context with mu-law codes (0..255). Each step the
    model predicts the next code, which is appended to the sliding context
    window. Returns the generated codes decoded via from_ulaw().
    """
    audio = []
    # BUG FIX: the original's 'audio_prompt /= 255' mutated the caller's array
    # in place; divide into a fresh (flattened) copy instead.
    context = np.ravel(audio_prompt) / 255  # normalize codes to [0, 1]
    print(f"Generating {gen_length} secs of audio.")
    for _ in tqdm(range(int(gen_length * sample_rate))):
        output = AudioRNN.predict(context.reshape(batch_size, time_steps, 1))
        pred_sample = np.argmax(output)  # most probable mu-law class
        audio.append(pred_sample)
        # Slide the window: drop the oldest sample, append the normalized prediction.
        context = np.append(context[1:], pred_sample / 255)
    return from_ulaw(audio)
def main():
    """CLI entry point: prepare/encode audio data, train AudioRNN, and/or generate audio.

    Driven entirely by argparse flags: data preparation (-sh5/-sah5/-lah5),
    training (-t), and generation (-g) can each be toggled independently.
    """
    print(f"AudioRNN starting at {time.ctime()}")
    # All file paths below are resolved relative to this script's directory.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datadir", help="root directory of data", default="McGillSmall16k")
    parser.add_argument("-sr", "--samplerate", help="audio sample rate", type=int, default=8000)
    parser.add_argument("-df", "--datafile", help="HDF5 file to save data to", default="AudioRNNData.h5")
    parser.add_argument("-af", "--audiodatafile", help="HDF5 file to save PCM data to", default="AudioData.h5")
    parser.add_argument("-m", "--modelfile", help="create audio from existing model file", default="AudioRNN.h5")
    parser.add_argument("-t", "--train", help="train the model", action="store_true")
    parser.add_argument("-test", "--test", help="test processing", action="store_true")
    parser.add_argument("-g", "--generate", help="generate_audio", action="store_true")
    parser.add_argument("-gl", "--genlength", help="length of generate_audio in secs", type=float, default=1)
    parser.add_argument("-sh5", "--saveHDF5", help="save preprocessed data to HDF5 file", action="store_true")
    parser.add_argument("-lh5", "--loadHDF5", help="load preprocessed data from HDF5 file", action="store_true")
    parser.add_argument("-lah5", "--loadaudioHDF5", help="load preprocessed data from HDF5 file", action="store_true")
    parser.add_argument("-sah5", "--saveaudioHDF5", help="save preprocessed data to HDF5 file", action="store_true")
    parser.add_argument("-e", "--epochs", help="Number of epochs", type=int, default=100)
    parser.add_argument("-b", "--batchsize", help="Number of batches per epoch", type=int, default=128)
    parser.add_argument("-ne", "--numexamples", help="Number of examples to use from dataset", type=int, default=5000)
    parser.add_argument("-ts", "--timesteps", help="Number of samples in time context", type=int, default=1000)
    parser.add_argument("-tsft", "--timeshift", help="Number of samples to skip for each example", type=int, default=1000)
    # NOTE(review): --neurons has no type=int, so it stays a string unless the
    # default is used — confirm downstream layer constructors accept that.
    parser.add_argument("-n", "--neurons", help="Number of neurons per layer", default=256)
    parser.add_argument("-a", "--audiofile", help="create audio file", default="output.wav")
    arg = parser.parse_args()
    # training arguments
    train = arg.train
    # generate audio arguments
    gen_audio = arg.generate
    gen_length = arg.genlength
    num_examples = arg.numexamples
    # data arguments
    load_HDF5 = arg.loadHDF5
    save_HDF5 = arg.saveHDF5
    load_audio_HDF5 = arg.loadaudioHDF5
    save_audio_HDF5 = arg.saveaudioHDF5
    sample_rate = arg.samplerate
    test = arg.test
    # model arguments
    epochs = arg.epochs
    batch_size = arg.batchsize
    time_steps = arg.timesteps
    time_shift = arg.timeshift
    neurons_per_layer = arg.neurons
    # NOTE: for debugging
    #train = True
    #load_HDF5 = True
    #gen_audio = True
    #save_HDF5 = True
    #load_audio_HDF5 = True
    #time_steps = 4000
    #num_examples = 1000000
    #test = True
    #arg.datadir = 'Opeth'
    # file arguments
    model_file = os.path.join(script_dir, arg.modelfile)
    audio_file = os.path.join(script_dir, arg.audiofile)
    data_dir = os.path.join(script_dir, arg.datadir)
    # Data filenames are tagged with time_steps / sample_rate so differently
    # configured runs never clobber each other's HDF5 caches.
    data_file = os.path.join(script_dir, arg.datafile.replace('.h5', str(time_steps) + '.h5'))
    audio_data_file = os.path.join(script_dir, arg.audiodatafile.replace('.h5', str(sample_rate) + '.h5'))
    # load audio data, if dataset is not loaded from HDF5
    if not load_HDF5 and train:
        if load_audio_HDF5:
            # Fast path: mu-law encoded audio was cached in a previous run.
            print(f"loading audio data from {audio_data_file}")
            data = load_audio_from_HDF5(os.path.join(script_dir, audio_data_file))
            # NOTE: test data loader
            if test:
                data = from_ulaw(data)
                data = post_process(data)
                write_audio(data.astype('int16'), os.path.join(script_dir, 'test-dataloader.wav'), sample_rate) # test HDF5 data loader/ mu-law transform
                print(f"Data max: {data.max()}, Data min: {data.min()}")
        else:
            # Slow path: load raw wavs, pre-emphasize, and mu-law encode.
            print("[Data Preperation]")
            print(f"loading waves from {data_dir}")
            raw_data = load_data(data_dir, sample_rate)
            if test:
                write_audio(raw_data, os.path.join(script_dir, 'test-rawdata.wav'), sample_rate) # test data loader/resampler
            print(f'Number of samples: {len(raw_data)} Length of data: {len(raw_data)/sample_rate} secs')
            data = pre_process(raw_data)
            # encode data to 8-bits
            print('Encoding data as mu-law')
            data = to_ulaw(data)
            # NOTE: test mu-law encodding
            if test:
                data = from_ulaw(data)
                data = post_process(data)
                write_audio(data.astype('int16'), os.path.join(script_dir, 'test-procdata.wav'), sample_rate) # test mu-law transform
            # write pre processed audio to HDF5 file
            if save_audio_HDF5:
                print(f"saving pre-processed audio data to {audio_data_file}")
                save_audio_to_HDF5(data, os.path.join(script_dir, audio_data_file))
        if save_HDF5:
            # create datasets and save to HDF5 file
            print(f"saving dataset to {data_file}")
            DataGen.save_data_to_HDF5(data, time_steps, data_file)
    # train the model
    if train:
        # load the datasets
        assert os.path.exists(data_file), f"Data file {data_file}, does not exists!"
        print(f"loading dataset from {data_file}")
        x_train, y_train = load_from_HDF5(data_file, num_examples)
        # NOTE:test target vector
        if test:
            data = from_ulaw(y_train)
            data = post_process(data)
            write_audio(data.astype('int16'), os.path.join(script_dir, 'test-target.wav'), sample_rate)
        # normalize the data
        x_train = x_train/255
        # get training and validation sets
        # shuffle=False keeps validation as the (chronologically) last slice.
        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.01, shuffle=False )
        # data must be a multiple of batch size
        num_train_samples = batch_size * (len(x_train)//batch_size)
        x_train = x_train[0:num_train_samples,:,:]
        y_train = y_train[0:num_train_samples,:]
        num_valid_samples = batch_size * (len(x_valid)//batch_size)
        x_valid = x_valid[0:num_valid_samples,:,:]
        y_valid = y_valid[0:num_valid_samples,:]
        print(f"TRAINING: Shape of input data {x_train.shape}, Shape of target data {y_train.shape}")
        print(f"VALIDATION: Shape of input data {x_valid.shape}, Shape of target data {y_valid.shape}")
        # build model
        AudioRNN = model(batch_size, time_steps, neurons_per_layer)
        AudioRNN.summary()
        print('[Initiating training]')
        csv_logger = CSVLogger('AudioRNN.log')
        # Two checkpoints: best validation loss (default monitor) and best
        # training loss, written to separate files.
        best_valid_model_checkpoint = ModelCheckpoint(model_file, save_best_only=True)
        best_train_model_checkpoint = ModelCheckpoint(model_file.split('.')[0] + '_train.h5', save_best_only=True, monitor='loss', mode='min')
        # Future Callbacks
        #escb = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
        #audio_prompt = x_train[0:batch_size, :, :]
        #gen_audio_callback = SaveAudioCallback(1, 0.5, sample_rate, time_steps, audio_prompt, batch_size)
        model_history = AudioRNN.fit(x_train, y_train, validation_data=(x_valid, y_valid), batch_size=batch_size, epochs=epochs, callbacks=[csv_logger, best_train_model_checkpoint, best_valid_model_checkpoint])
        # Persist the training history (pickled dict despite the .npy suffix).
        with open(model_file.split('.')[0] + '.npy', "wb") as outfile:
            pickle.dump(model_history.history, outfile)
    # generate audio from the trained model
    if gen_audio:
        # Rebuild the net with batch_size=1 for sample-by-sample generation,
        # then load the trained weights.
        AudioRNN = model(1, time_steps, neurons_per_layer)
        print(f"loading weights from {model_file}")
        AudioRNN.load_weights(model_file)
        AudioRNN.summary()
        example = np.random.randint(0,num_examples)
        # NOTE(review): the random example above is immediately overridden to
        # 0 — looks like leftover debugging; confirm which behavior is wanted.
        example = 0
        print(f"Prompting audio generation with example {example}")
        audio_prompt, _ = load_from_HDF5(data_file, 1, start_example=example)
        audio = generate_audio(AudioRNN, gen_length, sample_rate, time_steps, audio_prompt, 1)
        write_audio(post_process(audio).astype('int16'), audio_file, sample_rate)
    print(f"AudioRNN completed at {time.ctime()}")
# Run the CLI entry point only when executed as a script.
if __name__ == '__main__':
    main()