-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathAudioRNN.py
More file actions
340 lines (281 loc) · 14.2 KB
/
AudioRNN.py
File metadata and controls
340 lines (281 loc) · 14.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
import numpy as np
import soundfile as sf
import librosa
import argparse
import pickle
import h5py
from tqdm import tqdm
import glob, os, time, math
from scipy import signal
import AudioRNNData as DataGen
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Keras packages model
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, CuDNNGRU, CuDNNLSTM, Dropout, BatchNormalization, concatenate
from keras.callbacks import Callback, ModelCheckpoint, CSVLogger, EarlyStopping #, CoolDown
from keras.optimizers import Adam
from keras.utils import to_categorical
# Pre/de-emphasis filter coefficient shared by pre_process() and post_process().
EMPHASISCOEFF = -0.95
class SaveAudioCallback(Callback):
    """Keras callback that periodically renders audio from the training model.

    Every ``ckpt_freq`` epochs it generates ``gen_length`` seconds of audio,
    seeded with ``audio_context``, and writes it to ``output/ckpt_<ts>.wav``.
    """

    def __init__(self, ckpt_freq, gen_length, sample_rate, time_steps, audio_context, batch_size):
        super(SaveAudioCallback, self).__init__()
        self.ckpt_freq = ckpt_freq        # epochs between audio checkpoints
        self.gen_length = gen_length      # seconds of audio to generate
        self.sample_rate = sample_rate
        self.time_steps = time_steps
        # Fixed: original assigned self.audio_context twice (redundant duplicate).
        self.audio_context = audio_context
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs=None):
        # logs defaults to None (the original used a mutable {} default).
        # epoch is 0-based, so +1 makes "every ckpt_freq epochs" intuitive.
        if (epoch + 1) % self.ckpt_freq == 0:
            ts = str(int(time.time()))
            audio_file = os.path.join('output/', 'ckpt_' + ts + '.wav')
            audio = generate_audio(self.model, self.gen_length, self.sample_rate,
                                   self.time_steps, self.audio_context, self.batch_size)
            write_audio(post_process(audio).astype('int16'), audio_file, self.sample_rate)
def write_audio(samples, file_name, sample_rate):
    """Write `samples` to `file_name` as a 16-bit PCM audio file via soundfile."""
    sf.write(file_name, samples, sample_rate, subtype='PCM_16')
    print('Audio saved to disk.')
def load_data(datadir, sample_rate):
    """Recursively load every .wav under `datadir`, resampled to `sample_rate`.

    Returns a single 1-D array with all files concatenated in glob order
    (np.append flattens, so multi-channel audio is flattened the same way here).
    """
    chunks = []
    for waveFile in glob.glob(os.path.join(datadir, '**', '*.wav'), recursive=True):
        print('loading', waveFile)
        data, sr = sf.read(waveFile, dtype='float32')
        if sr != sample_rate:
            # librosa expects channels-first; soundfile returns frames-first.
            data = data.T
            # NOTE(review): positional args match librosa<0.10; newer versions
            # require librosa.resample(data, orig_sr=sr, target_sr=sample_rate).
            data = librosa.resample(data, sr, sample_rate)
        chunks.append(np.ravel(data))
    if not chunks:
        return np.array([], dtype='float32')
    # Concatenate once instead of np.append per file — the original reallocated
    # and copied the whole buffer on every file (O(n^2) total copying).
    return np.concatenate(chunks)
def pre_process(samples):
    """Apply the pre-emphasis FIR filter to raw audio samples.

    Implements y[n] = x[n] + EMPHASISCOEFF * x[n-1]; undone by post_process().
    """
    print('Pre-Processing data')
    return signal.lfilter([1, EMPHASISCOEFF], [1], samples)
def post_process(samples):
    """Apply the de-emphasis IIR filter, inverting pre_process()."""
    return signal.lfilter([1], [1, EMPHASISCOEFF], samples)
def standardize(data):
    """Min-max scale `data` onto the range [0, 1]."""
    lo = data.min()
    hi = data.max()
    return (data - lo) / (hi - lo)
def from_ulaw(samples):
    """Decode 8-bit mu-law codes (0..255) back into 16-bit-range floats.

    Inverse of to_ulaw(); returns a float64 array scaled to +/- 2**15.
    """
    print('Decoding mu-law samples')
    codes = np.asarray(samples, dtype=np.float64)
    # Map code 0..255 back onto [-1, 1].
    ampl_8 = (codes / 255.0 - 0.5) * 2.0
    # Inverse mu-law expansion (mu = 256), scaled to 16-bit amplitude.
    # Vectorized (no per-sample Python loop), and np.float — an alias removed
    # in NumPy 1.24, which made the original crash — replaced by float64.
    return np.sign(ampl_8) * (1 / 256.0) * ((1 + 256.0) ** np.abs(ampl_8) - 1) * 2 ** 15
def to_ulaw(samples):
    """Mu-law compress floats in [-1, 1] into 8-bit codes (0..255).

    Inverse of from_ulaw(); returns a uint8 array.
    """
    print('Encoding mu-law samples')
    s = np.asarray(samples, dtype=np.float64)
    # Mu-law compression (mu = 256), mapped from [-1, 1] onto [0, 255].
    # Vectorized — the original looped over samples in Python.
    enc = (np.sign(s) * (np.log(1 + 256 * np.abs(s)) / np.log(1 + 256)) + 1) / 2.0 * 255
    # The original's int() truncated toward zero; enc is always >= 0 here,
    # so astype(uint8) truncates identically.
    return enc.astype(np.uint8)
def scale_data(samples):
    """Shift and scale `samples` onto the range [-1, 1]."""
    shifted = samples - samples.min()
    unit = shifted / (shifted.max() - shifted.min())
    return (unit - 0.5) * 2
def model(batch_size, time_steps, num_neurons):
    """Build and compile the AudioRNN network.

    A Dense front end followed by two GRU layers, each receiving a skip
    connection from the raw input, ending in a 256-way softmax over 8-bit
    mu-law sample classes.
    """
    x_in = Input(batch_shape=(batch_size, time_steps, 1))
    x = Dense(num_neurons, activation='relu')(x_in)
    x = BatchNormalization()(x)
    # Skip connection: raw input is concatenated with the dense features.
    rnn_in1 = concatenate([x_in, x])
    x = CuDNNGRU(num_neurons, return_sequences=True, stateful=False)(rnn_in1)
    x = BatchNormalization()(x)
    rnn_in2 = concatenate([x_in, x])
    #x = Dropout(0.4)(x)
    x = CuDNNGRU(num_neurons, return_sequences=False, stateful=False)(rnn_in2)
    x = BatchNormalization()(x)
    #x = Dropout(0.4)(x)
    # One class per 8-bit mu-law code.
    x = Dense(256, activation='softmax')(x)
    # Local renamed from 'model' — it shadowed this function's own name.
    net = Model(inputs=[x_in], outputs=[x])
    # BUG FIX: targets are one-hot vectors (create_dataset uses to_categorical
    # and main slices y_train as 2-D), so the loss must be
    # categorical_crossentropy, not sparse_categorical_crossentropy.
    net.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return net
def create_dataset(data, time_steps, time_shift):
    """Slice `data` into overlapping frames plus one-hot next-sample targets.

    Returns (X, Y): X is float32 with shape (examples, time_steps, 1); Y is
    the one-hot (256-class) mu-law code of the sample right after each frame.
    Frames start every `time_shift` samples.
    """
    print('Creating Dataset')
    inputs = []
    targets = []
    for start in range(0, len(data) - time_steps - 1, time_shift):
        end = start + time_steps
        # Input frame, shaped (time_steps, 1) for the RNN.
        inputs.append(data[start:end].reshape(time_steps, 1))
        # Target is the mu-law encoded sample immediately following the frame.
        targets.append(data[end])
    return np.array(inputs, dtype='float32'), to_categorical(targets, num_classes=256)
def load_from_HDF5(data_file, num_examples, start_example=0):
    """Read a slice of the training set from an HDF5 file.

    Returns (inputs as float32, targets as uint8) covering rows
    [start_example, start_example + num_examples).
    """
    print("Loading data from HDF5 file.")
    stop = start_example + num_examples
    with h5py.File(data_file, 'r') as hf:
        inputs = hf['x_train'][start_example:stop]
        targets = hf['y_train'][start_example:stop]
    return inputs.astype('float32'), targets.astype('uint8')
def load_audio_from_HDF5(data_file):
    """Load the full pre-processed audio array from the 'AudioRNNData' dataset."""
    print("Loading audio data from HDF5 file.")
    with h5py.File(data_file, 'r') as hf:
        samples = np.array(hf['AudioRNNData'])
    return samples
def save_audio_to_HDF5(data, data_file):
    """Write the pre-processed audio array to `data_file`, overwriting it."""
    print("Saving audio data to HDF5 file.")
    with h5py.File(data_file, 'w') as out:
        out.create_dataset('AudioRNNData', data=data)
def generate_audio(AudioRNN, gen_length, sample_rate, time_steps, audio_prompt, batch_size):
    """Autoregressively generate `gen_length` seconds of audio from the model.

    `audio_prompt` seeds the context with mu-law codes (0..255). Each step the
    model predicts the next code, which is appended to the sliding context
    window. Returns the generated codes decoded via from_ulaw().
    """
    audio = []
    # BUG FIX: the original's 'audio_prompt /= 255' mutated the caller's array
    # in place; divide into a fresh (flattened) copy instead.
    context = np.ravel(audio_prompt) / 255  # normalize codes to [0, 1]
    print(f"Generating {gen_length} secs of audio.")
    for _ in tqdm(range(int(gen_length * sample_rate))):
        output = AudioRNN.predict(context.reshape(batch_size, time_steps, 1))
        pred_sample = np.argmax(output)  # most probable mu-law class
        audio.append(pred_sample)
        # Slide the window: drop the oldest sample, append the normalized prediction.
        context = np.append(context[1:], pred_sample / 255)
    return from_ulaw(audio)
def main():
    """CLI entry point: prepare/encode audio data, train AudioRNN, and/or generate audio.

    Driven entirely by argparse flags: data preparation (-sh5/-sah5/-lah5),
    training (-t), and generation (-g) can each be toggled independently.
    """
    print(f"AudioRNN starting at {time.ctime()}")
    # All file paths below are resolved relative to this script's directory.
    script_dir = os.path.dirname(os.path.realpath(__file__))
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--datadir", help="root directory of data", default="McGillSmall16k")
    parser.add_argument("-sr", "--samplerate", help="audio sample rate", type=int, default=8000)
    parser.add_argument("-df", "--datafile", help="HDF5 file to save data to", default="AudioRNNData.h5")
    parser.add_argument("-af", "--audiodatafile", help="HDF5 file to save PCM data to", default="AudioData.h5")
    parser.add_argument("-m", "--modelfile", help="create audio from existing model file", default="AudioRNN.h5")
    parser.add_argument("-t", "--train", help="train the model", action="store_true")
    parser.add_argument("-test", "--test", help="test processing", action="store_true")
    parser.add_argument("-g", "--generate", help="generate_audio", action="store_true")
    parser.add_argument("-gl", "--genlength", help="length of generate_audio in secs", type=float, default=1)
    parser.add_argument("-sh5", "--saveHDF5", help="save preprocessed data to HDF5 file", action="store_true")
    parser.add_argument("-lh5", "--loadHDF5", help="load preprocessed data from HDF5 file", action="store_true")
    parser.add_argument("-lah5", "--loadaudioHDF5", help="load preprocessed data from HDF5 file", action="store_true")
    parser.add_argument("-sah5", "--saveaudioHDF5", help="save preprocessed data to HDF5 file", action="store_true")
    parser.add_argument("-e", "--epochs", help="Number of epochs", type=int, default=100)
    parser.add_argument("-b", "--batchsize", help="Number of batches per epoch", type=int, default=128)
    parser.add_argument("-ne", "--numexamples", help="Number of examples to use from dataset", type=int, default=5000)
    parser.add_argument("-ts", "--timesteps", help="Number of samples in time context", type=int, default=1000)
    parser.add_argument("-tsft", "--timeshift", help="Number of samples to skip for each example", type=int, default=1000)
    # NOTE(review): --neurons has no type=int, so it stays a string unless the
    # default is used — confirm downstream layer constructors accept that.
    parser.add_argument("-n", "--neurons", help="Number of neurons per layer", default=256)
    parser.add_argument("-a", "--audiofile", help="create audio file", default="output.wav")
    arg = parser.parse_args()
    # training arguments
    train = arg.train
    # generate audio arguments
    gen_audio = arg.generate
    gen_length = arg.genlength
    num_examples = arg.numexamples
    # data arguments
    load_HDF5 = arg.loadHDF5
    save_HDF5 = arg.saveHDF5
    load_audio_HDF5 = arg.loadaudioHDF5
    save_audio_HDF5 = arg.saveaudioHDF5
    sample_rate = arg.samplerate
    test = arg.test
    # model arguments
    epochs = arg.epochs
    batch_size = arg.batchsize
    time_steps = arg.timesteps
    time_shift = arg.timeshift
    neurons_per_layer = arg.neurons
    # NOTE: for debugging
    #train = True
    #load_HDF5 = True
    #gen_audio = True
    #save_HDF5 = True
    #load_audio_HDF5 = True
    #time_steps = 4000
    #num_examples = 1000000
    #test = True
    #arg.datadir = 'Opeth'
    # file arguments
    model_file = os.path.join(script_dir, arg.modelfile)
    audio_file = os.path.join(script_dir, arg.audiofile)
    data_dir = os.path.join(script_dir, arg.datadir)
    # Data filenames are tagged with time_steps / sample_rate so differently
    # configured runs never clobber each other's HDF5 caches.
    data_file = os.path.join(script_dir, arg.datafile.replace('.h5', str(time_steps) + '.h5'))
    audio_data_file = os.path.join(script_dir, arg.audiodatafile.replace('.h5', str(sample_rate) + '.h5'))
    # load audio data, if dataset is not loaded from HDF5
    if not load_HDF5 and train:
        if load_audio_HDF5:
            # Fast path: mu-law encoded audio was cached in a previous run.
            print(f"loading audio data from {audio_data_file}")
            data = load_audio_from_HDF5(os.path.join(script_dir, audio_data_file))
            # NOTE: test data loader
            if test:
                data = from_ulaw(data)
                data = post_process(data)
                write_audio(data.astype('int16'), os.path.join(script_dir, 'test-dataloader.wav'), sample_rate) # test HDF5 data loader/ mu-law transform
                print(f"Data max: {data.max()}, Data min: {data.min()}")
        else:
            # Slow path: load raw wavs, pre-emphasize, and mu-law encode.
            print("[Data Preperation]")
            print(f"loading waves from {data_dir}")
            raw_data = load_data(data_dir, sample_rate)
            if test:
                write_audio(raw_data, os.path.join(script_dir, 'test-rawdata.wav'), sample_rate) # test data loader/resampler
            print(f'Number of samples: {len(raw_data)} Length of data: {len(raw_data)/sample_rate} secs')
            data = pre_process(raw_data)
            # encode data to 8-bits
            print('Encoding data as mu-law')
            data = to_ulaw(data)
            # NOTE: test mu-law encodding
            if test:
                data = from_ulaw(data)
                data = post_process(data)
                write_audio(data.astype('int16'), os.path.join(script_dir, 'test-procdata.wav'), sample_rate) # test mu-law transform
            # write pre processed audio to HDF5 file
            if save_audio_HDF5:
                print(f"saving pre-processed audio data to {audio_data_file}")
                save_audio_to_HDF5(data, os.path.join(script_dir, audio_data_file))
        if save_HDF5:
            # create datasets and save to HDF5 file
            print(f"saving dataset to {data_file}")
            DataGen.save_data_to_HDF5(data, time_steps, data_file)
    # train the model
    if train:
        # load the datasets
        assert os.path.exists(data_file), f"Data file {data_file}, does not exists!"
        print(f"loading dataset from {data_file}")
        x_train, y_train = load_from_HDF5(data_file, num_examples)
        # NOTE:test target vector
        if test:
            data = from_ulaw(y_train)
            data = post_process(data)
            write_audio(data.astype('int16'), os.path.join(script_dir, 'test-target.wav'), sample_rate)
        # normalize the data
        x_train = x_train/255
        # get training and validation sets
        # shuffle=False keeps validation as the (chronologically) last slice.
        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.01, shuffle=False )
        # data must be a multiple of batch size
        num_train_samples = batch_size * (len(x_train)//batch_size)
        x_train = x_train[0:num_train_samples,:,:]
        y_train = y_train[0:num_train_samples,:]
        num_valid_samples = batch_size * (len(x_valid)//batch_size)
        x_valid = x_valid[0:num_valid_samples,:,:]
        y_valid = y_valid[0:num_valid_samples,:]
        print(f"TRAINING: Shape of input data {x_train.shape}, Shape of target data {y_train.shape}")
        print(f"VALIDATION: Shape of input data {x_valid.shape}, Shape of target data {y_valid.shape}")
        # build model
        AudioRNN = model(batch_size, time_steps, neurons_per_layer)
        AudioRNN.summary()
        print('[Initiating training]')
        csv_logger = CSVLogger('AudioRNN.log')
        # Two checkpoints: best validation loss (default monitor) and best
        # training loss, written to separate files.
        best_valid_model_checkpoint = ModelCheckpoint(model_file, save_best_only=True)
        best_train_model_checkpoint = ModelCheckpoint(model_file.split('.')[0] + '_train.h5', save_best_only=True, monitor='loss', mode='min')
        # Future Callbacks
        #escb = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
        #audio_prompt = x_train[0:batch_size, :, :]
        #gen_audio_callback = SaveAudioCallback(1, 0.5, sample_rate, time_steps, audio_prompt, batch_size)
        model_history = AudioRNN.fit(x_train, y_train, validation_data=(x_valid, y_valid), batch_size=batch_size, epochs=epochs, callbacks=[csv_logger, best_train_model_checkpoint, best_valid_model_checkpoint])
        # Persist the training history (pickled dict despite the .npy suffix).
        with open(model_file.split('.')[0] + '.npy', "wb") as outfile:
            pickle.dump(model_history.history, outfile)
    # generate audio from the trained model
    if gen_audio:
        # Rebuild the net with batch_size=1 for sample-by-sample generation,
        # then load the trained weights.
        AudioRNN = model(1, time_steps, neurons_per_layer)
        print(f"loading weights from {model_file}")
        AudioRNN.load_weights(model_file)
        AudioRNN.summary()
        example = np.random.randint(0,num_examples)
        # NOTE(review): the random example above is immediately overridden to
        # 0 — looks like leftover debugging; confirm which behavior is wanted.
        example = 0
        print(f"Prompting audio generation with example {example}")
        audio_prompt, _ = load_from_HDF5(data_file, 1, start_example=example)
        audio = generate_audio(AudioRNN, gen_length, sample_rate, time_steps, audio_prompt, 1)
        write_audio(post_process(audio).astype('int16'), audio_file, sample_rate)
    print(f"AudioRNN completed at {time.ctime()}")
# Run the CLI entry point only when executed as a script.
if __name__ == '__main__':
    main()