diff --git a/data.py b/data.py
deleted file mode 100644
index c13b703..0000000
--- a/data.py
+++ /dev/null
@@ -1,278 +0,0 @@
-import torch
-from torch.utils.data import Dataset
-from torch.utils.data import DataLoader
-
-import librosa
-import soundfile
-from sklearn import preprocessing
-import os
-import random
-import re
-from uyghur import uyghur_latin
-
-featurelen = 128
-sample_rate = 22050
-fft_len = 1024
-window_len = fft_len
-window = "hann"
-
-white_noise,_=librosa.load('white.wav',sr=sample_rate, duration=15.0)
-perlin_noise,_=librosa.load('perlin.wav',sr=sample_rate, duration=15.0)
-cafe_noise, _ = librosa.load('cafe.wav',sr=sample_rate, duration=15.0)
-radio_noise, _ = librosa.load('radionoise.wav',sr=sample_rate, duration=15.0)
-
-def addnoise(audio):
-    rnd = random.random()
-    if len(audio) > len(white_noise):
-        pass
-    elif rnd <0.25:
-        audio = audio + white_noise[:len(audio)]
-    elif rnd <0.50:
-        audio = audio + perlin_noise[:audio.shape[0]]
-    elif rnd <0.75:
-        audio = audio + radio_noise[:audio.shape[0]]
-    else:
-        audio = audio + cafe_noise[:audio.shape[0]]
-    return audio
-
-def randomstretch(audio):
-    factor = random.uniform(0.8, 1.2)
-    audio = librosa.core.resample(audio,sample_rate,sample_rate*factor)
-    return audio
-
-def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
-#def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
-    rnd = random.random()
-
-    feat_size = feat.size(0)
-    seq_len = feat.size(1)
-
-    if rnd< 0.33:
-        # time mask
-        for _ in range(time_mask_num):
-            t = random.randint(0, T)
-            t0 = random.randint(0, seq_len - t)
-            feat[:, t0 : t0 + t] = 0
-
-    elif rnd <0.66:
-        # freq mask
-        for _ in range(freq_mask_num):
-            f = random.randint(0, F)
-            f0 = random.randint(0, feat_size - f)
-            feat[f0 : f0 + f, :] = 0
-    else:
-        # time mask
-        for _ in range(time_mask_num):
-            t = random.randint(0, T)
-            t0 = random.randint(0, seq_len - t)
-            feat[:, t0 : t0 + t] = 0
-
-        # freq mask
-        for _ in range(freq_mask_num):
-            f = random.randint(0, F)
-            f0 = random.randint(0, feat_size - f)
-            feat[f0 : f0 + f, :] = 0
-
-    return feat
-
-
-def melfuture(wav_path, augument = False):
-    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
-    if augument:
-        if random.random()<0.5:
-            audio = randomstretch(audio)
-
-        if random.random()<0.5:
-            audio = addnoise(audio)
-
-    audio = preprocessing.minmax_scale(audio, axis=0)
-    audio = librosa.effects.preemphasis(audio)
-
-    hop_len = 200
-    if augument and random.random()<0.5:
-        hop_len = random.randint(160,240)
-
-    spec = librosa.feature.melspectrogram(y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len, n_mels=featurelen, fmax=8000)
-    spec = librosa.power_to_db(spec)
-    spec = (spec - spec.mean()) / spec.std()
-
-    spec = torch.FloatTensor(spec)
-    if augument == True and random.random()<0.5:
-        spec = spec_augment(spec)
-
-    return spec
-
-def rawfuture(wav_path, augument = False):
-    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
-    audio = preprocessing.minmax_scale(audio, axis=0)
-    if augument:
-        if random.random()<0.5:
-            audio = addnoise(audio)
-
-        if random.random()<0.5:
-            audio = randomstretch(audio)
-
-    audio = librosa.effects.preemphasis(audio)
-    spec = torch.FloatTensor(audio)
-    spec.unsqueeze_(0)
-    spec = (spec - spec.mean()) / spec.std()
-    return spec
-
-class SpeechDataset(Dataset):
-    def __init__(self, index_path, augumentation = False):
-        self.Raw = False
-        with open(index_path,encoding='utf_8_sig') as f:
-            lines = f.readlines()
-
-        self.idx = []
-        for x in lines:
-            item = x.strip().split("\t")
-            line = []
-            line.append(item[0])
-            char_indx = uyghur_latin.encode(item[1])
-            line.append(char_indx)
-            self.idx.append(line)
-
-        self.augument = augumentation
-
-    def __getitem__(self, index):
-        wav_path, char_index = self.idx[index]
-        if self.Raw == True:
-            x = rawfuture(wav_path, self.augument)
-        else:
-            x = melfuture(wav_path, self.augument)
-
-        return x, char_index, wav_path
-
-    def __len__(self):
-        return len(self.idx)
-
-def _collate_fn(batch):
-    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
-    input_lens = [sample[0].size(1) for sample in batch]
-    target_lens = [len(sample[1]) for sample in batch]
-
-    inputs = torch.zeros(len(batch), batch[0][0].size(0), max(input_lens) ,dtype=torch.float32)
-    targets = torch.zeros(len(batch), max(target_lens),dtype=torch.long).fill_(uyghur_latin.pad_idx)
-
-    target_lens = torch.IntTensor(target_lens)
-    input_lens = torch.IntTensor(input_lens)
-    paths = []
-    for x, sample in enumerate(batch):
-        tensor = sample[0]
-        target = sample[1]
-        seq_length = tensor.size(1)
-        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
-        targets[x][:len(target)] = torch.LongTensor(target)
-        paths.append(sample[2])
-    return inputs, targets, input_lens, target_lens, paths
-
-
-
-class SpeechDataLoader(DataLoader):
-    def __init__(self, *args, **kwargs):
-        """
-        Creates a data loader for AudioDatasets.
-        """
-        super(SpeechDataLoader, self).__init__(*args, **kwargs)
-        self.collate_fn = _collate_fn
-
-
-
-# The following code is from: http://hetland.org/coding/python/levenshtein.py
-def levenshtein(a,b):
-    "Calculates the Levenshtein distance between a and b."
-    n, m = len(a), len(b)
-    if n > m:
-        # Make sure n <= m, to use O(min(n,m)) space
-        a,b = b,a
-        n,m = m,n
-
-    current = list(range(n+1))
-    for i in range(1,m+1):
-        previous, current = current, [i]+[0]*n
-        for j in range(1,n+1):
-            add, delete = previous[j]+1, current[j-1]+1
-            change = previous[j-1]
-            if a[j-1] != b[i-1]:
-                change = change + 1
-            current[j] = min(add, delete, change)
-
-    return current[n]
-
-def wer(s1, src):
-    sw = src.split()
-    return levenshtein(s1.split(),sw), len(sw)
-
-def cer(s1, src):
-    return levenshtein(s1,src),len(src)
-
-def cer_wer(preds, targets):
-    err_c, lettercnt, err_w, wordcnt = 0,0,0,0
-    for pred, target in zip(preds, targets):
-        c_er, c_cnt = cer(pred, target)
-        w_er, w_cnt = wer(pred, target)
-        err_c += c_er
-        lettercnt += c_cnt
-        wordcnt += w_cnt
-        err_w += w_er
-
-    return err_c, lettercnt, err_w, wordcnt
-
-
-def random_speed():
-    y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
-    factor = random.uniform(0.8, 1.2)
-    new_sr = s_r*factor
-    new_y = librosa.core.resample(y,s_r,new_sr)
-    soundfile.write("test1_1.wav",new_y, s_r)
-
-    audio = librosa.effects.time_stretch(y,factor)
-    soundfile.write("test1_2.wav",audio, s_r)
-
-
-def sinaq():
-    new_y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
-    new_y = addnoise(new_y)
-    #new_y = librosa.effects.preemphasis(new_y)
-    new_y = preprocessing.minmax_scale(new_y, axis=0)
-    soundfile.write("test1_1.wav",new_y, s_r)
-
-    new_y, s_r = librosa.load("test2.wav", sr=sample_rate, res_type='polyphase')
-    new_y = preprocessing.minmax_scale(new_y, axis=0)
-    new_y = addnoise(new_y)
-    #new_y = librosa.effects.preemphasis(new_y)
-    soundfile.write("test2_1.wav",new_y, s_r)
-
-    new_y, s_r = librosa.load("test3.wav", sr=sample_rate, res_type='polyphase')
-    new_y = preprocessing.minmax_scale(new_y, axis=0)
-    new_y = addnoise(new_y)
-    #new_y = librosa.effects.preemphasis(new_y)
-    soundfile.write("test3_1.wav",new_y, s_r)
-
-    new_y, s_r = librosa.load("test4.wav", sr=sample_rate, res_type='polyphase')
-    new_y = preprocessing.minmax_scale(new_y, axis=0)
-    new_y = addnoise(new_y)
-    #new_y = librosa.effects.preemphasis(new_y)
-    soundfile.write("test4_1.wav",new_y, s_r)
-
-    new_y, s_r = librosa.load("test6.wav", sr=sample_rate, res_type='polyphase')
-    new_y = preprocessing.minmax_scale(new_y, axis=0)
-    new_y = addnoise(new_y)
-    #new_y = librosa.effects.preemphasis(new_y)
-    soundfile.write("test6_1.wav",new_y, s_r)
-
-if __name__ == "__main__":
-    #import matplotlib.pyplot as plt
-    #import librosa.display
-
-    #random_speed()
-    sinaq()
-    #y, s_r = librosa.load("test6.wav", sr=sample_rate, res_type='polyphase')
-    #soundfile.write("test6_1.wav",addnoise(y), s_r)
-    #soundfile.write("test6_2.wav",addnoise(y), s_r)
-    #soundfile.write("test6_3.wav",addnoise(y), s_r)
-    #soundfile.write("test6_4.wav",addnoise(y), s_r)
-    #soundfile.write("test6_5.wav",addnoise(y), s_r)
-