import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import librosa
import soundfile
from sklearn import preprocessing
import random

from uyghur import uyghur_latin

featurelen  = 128
sample_rate = 22050
fft_len     = 1024
window_len  = fft_len
window      = "hann"

# Background noise clips used for augmentation (first 15 s of each file, resampled to sample_rate).
white_noise, _  = librosa.load('white.wav',      sr=sample_rate, duration=15.0)
perlin_noise, _ = librosa.load('perlin.wav',     sr=sample_rate, duration=15.0)
cafe_noise, _   = librosa.load('cafe.wav',       sr=sample_rate, duration=15.0)
radio_noise, _  = librosa.load('radionoise.wav', sr=sample_rate, duration=15.0)

def addnoise(audio):
    """Mix one randomly chosen noise clip into the signal."""
    rnd = random.random()
    if len(audio) > len(white_noise):
        pass  # audio is longer than the noise clips; leave it unchanged
    elif rnd < 0.25:
        audio = audio + white_noise[:len(audio)]
    elif rnd < 0.50:
        audio = audio + perlin_noise[:audio.shape[0]]
    elif rnd < 0.75:
        audio = audio + radio_noise[:audio.shape[0]]
    else:
        audio = audio + cafe_noise[:audio.shape[0]]
    return audio

def randomstretch(audio):
    """Speed perturbation: resample by a random factor in [0.8, 1.2]."""
    factor = random.uniform(0.8, 1.2)
    audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=sample_rate * factor)
    return audio

#def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
    """SpecAugment: randomly apply a time mask, a frequency mask, or both."""
    feat_size = feat.size(0)
    seq_len   = feat.size(1)

    def time_mask():
        for _ in range(time_mask_num):
            t  = random.randint(0, min(T, seq_len))  # clamp so the t0 range stays non-negative
            t0 = random.randint(0, seq_len - t)
            feat[:, t0:t0 + t] = 0

    def freq_mask():
        for _ in range(freq_mask_num):
            f  = random.randint(0, min(F, feat_size))
            f0 = random.randint(0, feat_size - f)
            feat[f0:f0 + f, :] = 0

    rnd = random.random()
    if rnd < 0.33:
        time_mask()
    elif rnd < 0.66:
        freq_mask()
    else:
        time_mask()
        freq_mask()
    return feat

def melfuture(wav_path, augument=False):
    """Load a wav file and return a normalized log-mel spectrogram of shape (featurelen, time)."""
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')

    if augument:
        if random.random() < 0.5:
            audio = randomstretch(audio)
        if random.random() < 0.5:
            audio = addnoise(audio)

    audio = preprocessing.minmax_scale(audio, axis=0)
    audio = librosa.effects.preemphasis(audio)

    hop_len = 200
    if augument and random.random() < 0.5:
        hop_len = random.randint(160, 240)

    spec = librosa.feature.melspectrogram(y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len, n_mels=featurelen, fmax=8000)
    spec = librosa.power_to_db(spec)
    spec = (spec - spec.mean()) / spec.std()

    spec = torch.FloatTensor(spec)
    if augument and random.random() < 0.5:
        spec = spec_augment(spec)
    return spec

def rawfuture(wav_path, augument=False):
    """Load a wav file and return the normalized raw waveform as a (1, samples) tensor."""
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
    audio = preprocessing.minmax_scale(audio, axis=0)

    if augument:
        if random.random() < 0.5:
            audio = addnoise(audio)
        if random.random() < 0.5:
            audio = randomstretch(audio)

    audio = librosa.effects.preemphasis(audio)
    spec = torch.FloatTensor(audio)
    spec.unsqueeze_(0)
    spec = (spec - spec.mean()) / spec.std()
    return spec

class SpeechDataset(Dataset):
    def __init__(self, index_path, augumentation=False):
        """Reads a tab-separated index file with lines of the form: wav_path<TAB>transcript."""
        self.Raw = False
        with open(index_path, encoding='utf_8_sig') as f:
            lines = f.readlines()

        self.idx = []
        for x in lines:
            item = x.strip().split("\t")
            char_indx = uyghur_latin.encode(item[1])
            self.idx.append([item[0], char_indx])

        self.augument = augumentation

    def __getitem__(self, index):
        wav_path, char_index = self.idx[index]
        if self.Raw:
            x = rawfuture(wav_path, self.augument)
        else:
            x = melfuture(wav_path, self.augument)
        return x, char_index, wav_path

    def __len__(self):
        return len(self.idx)
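
# A minimal sketch of what the two feature extractors return; the file name
# below is hypothetical, not part of the repo. melfuture() yields a
# (featurelen, T) tensor where T depends on clip length and hop_length;
# rawfuture() yields a (1, samples) tensor.
def _feature_demo(path="example.wav"):
    mel = melfuture(path)
    raw = rawfuture(path)
    print(mel.shape)  # e.g. torch.Size([128, T])
    print(raw.shape)  # torch.Size([1, samples])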

def _collate_fn(batch):
    # Sort by input length (longest first) and zero-pad every spectrogram to
    # the longest one in the batch; targets are padded with pad_idx.
    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
    input_lens  = [sample[0].size(1) for sample in batch]
    target_lens = [len(sample[1]) for sample in batch]

    inputs  = torch.zeros(len(batch), batch[0][0].size(0), max(input_lens), dtype=torch.float32)
    targets = torch.full((len(batch), max(target_lens)), uyghur_latin.pad_idx, dtype=torch.long)

    target_lens = torch.IntTensor(target_lens)
    input_lens  = torch.IntTensor(input_lens)
    paths = []
    for x, sample in enumerate(batch):
        tensor = sample[0]
        target = sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        targets[x][:len(target)] = torch.LongTensor(target)
        paths.append(sample[2])

    return inputs, targets, input_lens, target_lens, paths

class SpeechDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        """Creates a data loader for AudioDatasets."""
        super(SpeechDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn

# The following code is from: http://hetland.org/coding/python/levenshtein.py
def levenshtein(a, b):
    "Calculates the Levenshtein distance between a and b."
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)
    return current[n]

def wer(s1, src):
    """Word-level edit distance against src, plus the word count of src."""
    sw = src.split()
    return levenshtein(s1.split(), sw), len(sw)

def cer(s1, src):
    """Character-level edit distance against src, plus the length of src."""
    return levenshtein(s1, src), len(src)

def cer_wer(preds, targets):
    """Accumulate character and word errors over a batch of prediction/target pairs."""
    err_c, lettercnt, err_w, wordcnt = 0, 0, 0, 0
    for pred, target in zip(preds, targets):
        c_er, c_cnt = cer(pred, target)
        w_er, w_cnt = wer(pred, target)
        err_c     += c_er
        lettercnt += c_cnt
        wordcnt   += w_cnt
        err_w     += w_er
    return err_c, lettercnt, err_w, wordcnt

def random_speed():
    y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
    factor = random.uniform(0.8, 1.2)

    # Speed change via resampling (alters duration and pitch).
    new_sr = s_r * factor
    new_y = librosa.resample(y, orig_sr=s_r, target_sr=new_sr)
    soundfile.write("test1_1.wav", new_y, s_r)

    # Time stretch (alters duration, keeps pitch).
    audio = librosa.effects.time_stretch(y, rate=factor)
    soundfile.write("test1_2.wav", audio, s_r)

def sinaq():
    new_y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
    new_y = addnoise(new_y)
    #new_y = librosa.effects.preemphasis(new_y)
    new_y = preprocessing.minmax_scale(new_y, axis=0)
    soundfile.write("test1_1.wav", new_y, s_r)

    for name in ("test2", "test3", "test4", "test6"):
        new_y, s_r = librosa.load(name + ".wav", sr=sample_rate, res_type='polyphase')
        new_y = preprocessing.minmax_scale(new_y, axis=0)
        new_y = addnoise(new_y)
        #new_y = librosa.effects.preemphasis(new_y)
        soundfile.write(name + "_1.wav", new_y, s_r)
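
# Quick sanity check for the edit-distance metrics above; the example strings
# are made up for illustration. Expected output: CER 3/14, WER 1/2.
def _metric_demo():
    preds   = ["salam dunya"]
    targets = ["salam dunyagha"]
    err_c, n_c, err_w, n_w = cer_wer(preds, targets)
    print(f"CER: {err_c/n_c:.2f}  WER: {err_w/n_w:.2f}")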

if __name__ == "__main__":
    #import matplotlib.pyplot as plt
    #import librosa.display
    #random_speed()
    sinaq()
    #y, s_r = librosa.load("test6.wav", sr=sample_rate, res_type='polyphase')
    #soundfile.write("test6_1.wav", addnoise(y), s_r)
    #soundfile.write("test6_2.wav", addnoise(y), s_r)
    #soundfile.write("test6_3.wav", addnoise(y), s_r)
    #soundfile.write("test6_4.wav", addnoise(y), s_r)
    #soundfile.write("test6_5.wav", addnoise(y), s_r)
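
# A minimal end-to-end sketch of how the pieces above fit together; the index
# file name is hypothetical and is expected to contain
# "wav_path<TAB>transcript" lines as parsed by SpeechDataset.
def _loader_demo(index_path="train_index.tsv"):
    dataset = SpeechDataset(index_path, augumentation=True)
    loader  = SpeechDataLoader(dataset, batch_size=4, shuffle=True)
    inputs, targets, input_lens, target_lens, paths = next(iter(loader))
    print(inputs.shape)   # (B, featurelen, T_max), zero-padded
    print(targets.shape)  # (B, L_max), padded with uyghur_latin.pad_idx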