Delete data.py
This commit is contained in:
parent
0a88a2c968
commit
8f239fdb84
278
data.py
278
data.py
@ -1,278 +0,0 @@
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import librosa
|
||||
import soundfile
|
||||
from sklearn import preprocessing
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
from uyghur import uyghur_latin
|
||||
|
||||
# Mel-spectrogram / STFT configuration shared by melfuture() and rawfuture().
featurelen = 128     # number of mel filterbank channels
sample_rate = 22050  # Hz; every wav is resampled to this rate on load
fft_len = 1024
window_len = fft_len
window = "hann"

# Background-noise clips for augmentation, loaded ONCE at import time
# (first 15 s of each file, resampled to sample_rate).
# NOTE(review): importing this module fails if these wav files are missing
# from the working directory — consider lazy loading if that matters.
white_noise,_=librosa.load('white.wav',sr=sample_rate, duration=15.0)
perlin_noise,_=librosa.load('perlin.wav',sr=sample_rate, duration=15.0)
cafe_noise, _ = librosa.load('cafe.wav',sr=sample_rate, duration=15.0)
radio_noise, _ = librosa.load('radionoise.wav',sr=sample_rate, duration=15.0)
|
||||
|
||||
def addnoise(audio):
    """Mix one randomly chosen background-noise clip into *audio*.

    One of white/perlin/radio/cafe noise is chosen with equal probability
    and added sample-wise. If *audio* is longer than the 15 s noise buffers,
    it is returned unchanged.
    """
    n = len(audio)
    if n > len(white_noise):
        return audio

    choice = random.random()
    if choice < 0.25:
        noise = white_noise
    elif choice < 0.50:
        noise = perlin_noise
    elif choice < 0.75:
        noise = radio_noise
    else:
        noise = cafe_noise
    return audio + noise[:n]
|
||||
|
||||
def randomstretch(audio):
    """Randomly change playback speed by resampling to 80–120 % of the rate.

    Resampling to sample_rate*factor while the nominal rate stays fixed
    effectively stretches or compresses the audio in time.
    """
    speed = random.uniform(0.8, 1.2)
    return librosa.core.resample(audio, sample_rate, sample_rate * speed)
|
||||
|
||||
def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
    """Apply SpecAugment-style time and/or frequency masking to *feat* in place.

    With probability ~1/3 each: time masking only, frequency masking only,
    or both. Masked regions are zeroed directly in *feat*, which is also
    returned.

    Args:
        feat: 2-D tensor of shape (feature_bins, time_steps); modified in place.
        T: maximum width of a time mask (in frames).
        F: maximum width of a frequency mask (in bins).
        time_mask_num / freq_mask_num: how many masks of each kind to apply.

    Returns:
        The same tensor object, with masks applied.

    Fix: mask widths are clamped to the tensor's dimensions. Previously
    random.randint(0, seq_len - t) raised ValueError whenever t > seq_len
    (i.e. clips shorter than T frames crashed); same for F vs feature_bins.
    """
    rnd = random.random()
    feat_size = feat.size(0)
    seq_len = feat.size(1)

    def _mask_time():
        # Zero out `time_mask_num` random spans of up to min(T, seq_len) frames.
        for _ in range(time_mask_num):
            t = random.randint(0, min(T, seq_len))
            t0 = random.randint(0, seq_len - t)
            feat[:, t0 : t0 + t] = 0

    def _mask_freq():
        # Zero out `freq_mask_num` random bands of up to min(F, feat_size) bins.
        for _ in range(freq_mask_num):
            f = random.randint(0, min(F, feat_size))
            f0 = random.randint(0, feat_size - f)
            feat[f0 : f0 + f, :] = 0

    if rnd < 0.33:
        _mask_time()
    elif rnd < 0.66:
        _mask_freq()
    else:
        _mask_time()
        _mask_freq()

    return feat
|
||||
|
||||
|
||||
def melfuture(wav_path, augument = False):
    """Load a wav file and return a normalized log-mel spectrogram tensor.

    Returns a FloatTensor of shape (featurelen, frames). With augument=True,
    random stretching, additive noise, hop-length jitter and SpecAugment are
    each applied independently with probability 0.5.
    """
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')

    if augument:
        if random.random() < 0.5:
            audio = randomstretch(audio)
        if random.random() < 0.5:
            audio = addnoise(audio)

    audio = preprocessing.minmax_scale(audio, axis=0)
    audio = librosa.effects.preemphasis(audio)

    # Jitter the hop length a little for extra time-resolution variety.
    hop_len = 200
    if augument and random.random() < 0.5:
        hop_len = random.randint(160, 240)

    spec = librosa.feature.melspectrogram(
        y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len,
        n_mels=featurelen, fmax=8000)
    spec = librosa.power_to_db(spec)
    spec = (spec - spec.mean()) / spec.std()

    spec = torch.FloatTensor(spec)
    if augument and random.random() < 0.5:
        spec = spec_augment(spec)

    return spec
|
||||
|
||||
def rawfuture(wav_path, augument = False):
    """Load a wav file and return the normalized raw waveform as (1, samples).

    With augument=True, additive noise and random stretching are each applied
    with probability 0.5 (note: noise first, the opposite order of melfuture).
    """
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
    audio = preprocessing.minmax_scale(audio, axis=0)

    if augument:
        if random.random() < 0.5:
            audio = addnoise(audio)
        if random.random() < 0.5:
            audio = randomstretch(audio)

    audio = librosa.effects.preemphasis(audio)
    wave = torch.FloatTensor(audio).unsqueeze(0)
    return (wave - wave.mean()) / wave.std()
|
||||
|
||||
class SpeechDataset(Dataset):
    """Dataset yielding (features, encoded transcript, wav path) triples.

    The index file is UTF-8 (BOM tolerated), one sample per line:
    "<wav path>\t<transcript>". Set ``Raw = True`` to get raw waveforms
    from rawfuture() instead of mel spectrograms from melfuture().
    """

    def __init__(self, index_path, augumentation = False):
        self.Raw = False
        with open(index_path, encoding='utf_8_sig') as f:
            lines = f.readlines()

        # Each entry: [wav_path, list of character indices].
        self.idx = []
        for raw in lines:
            fields = raw.strip().split("\t")
            self.idx.append([fields[0], uyghur_latin.encode(fields[1])])

        self.augument = augumentation

    def __getitem__(self, index):
        wav_path, char_index = self.idx[index]
        if self.Raw:
            feats = rawfuture(wav_path, self.augument)
        else:
            feats = melfuture(wav_path, self.augument)
        return feats, char_index, wav_path

    def __len__(self):
        return len(self.idx)
|
||||
|
||||
def _collate_fn(batch):
    """Pad a batch of (feature, target, path) samples to common lengths.

    Sorts the batch by decreasing feature length (frames), zero-pads the
    features, pads the targets with uyghur_latin.pad_idx, and returns
    (inputs, targets, input_lens, target_lens, paths).
    """
    batch = sorted(batch, key=lambda s: s[0].size(1), reverse=True)
    input_lens = [s[0].size(1) for s in batch]
    target_lens = [len(s[1]) for s in batch]

    feat_dim = batch[0][0].size(0)
    inputs = torch.zeros(len(batch), feat_dim, max(input_lens), dtype=torch.float32)
    targets = torch.zeros(len(batch), max(target_lens), dtype=torch.long).fill_(uyghur_latin.pad_idx)

    paths = []
    for i, (tensor, target, path) in enumerate(batch):
        inputs[i, :, :tensor.size(1)] = tensor
        targets[i, :len(target)] = torch.LongTensor(target)
        paths.append(path)

    return inputs, targets, torch.IntTensor(input_lens), torch.IntTensor(target_lens), paths
|
||||
|
||||
|
||||
|
||||
class SpeechDataLoader(DataLoader):
    """DataLoader for AudioDatasets, wired to this module's _collate_fn."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Installed after construction so callers need not pass collate_fn.
        self.collate_fn = _collate_fn
|
||||
|
||||
|
||||
|
||||
# The following code is from: http://hetland.org/coding/python/levenshtein.py
|
||||
def levenshtein(a, b):
    """Calculates the Levenshtein distance between a and b."""
    n, m = len(a), len(b)
    if n > m:
        # Keep the shorter sequence in `a` so the row uses O(min(n, m)) space.
        a, b, n, m = b, a, m, n

    row = list(range(n + 1))
    for i in range(1, m + 1):
        prev, row = row, [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if a[j - 1] == b[i - 1] else 1
            row[j] = min(prev[j] + 1,        # deletion
                         row[j - 1] + 1,     # insertion
                         prev[j - 1] + cost) # substitution
    return row[n]
|
||||
|
||||
def wer(s1, src):
    """Return (word edit distance, reference word count) for hypothesis s1."""
    ref_words = src.split()
    return levenshtein(s1.split(), ref_words), len(ref_words)
|
||||
|
||||
def cer(s1, src):
    """Return (character edit distance, reference length) for hypothesis s1."""
    return levenshtein(s1, src), len(src)
|
||||
|
||||
def cer_wer(preds, targets):
    """Accumulate character/word error counts over paired hypotheses/references.

    Returns (total char errors, total chars, total word errors, total words) —
    divide errors by counts to obtain overall CER and WER.
    """
    err_c = lettercnt = err_w = wordcnt = 0
    for hyp, ref in zip(preds, targets):
        c_err, c_len = cer(hyp, ref)
        w_err, w_len = wer(hyp, ref)
        err_c += c_err
        lettercnt += c_len
        err_w += w_err
        wordcnt += w_len
    return err_c, lettercnt, err_w, wordcnt
|
||||
|
||||
|
||||
def random_speed():
    """Manual check: write two speed-changed copies of test1.wav.

    test1_1.wav — resampled to a random rate but saved at the original rate
    (naive speed change); test1_2.wav — librosa time_stretch by the same
    factor. Listen to both to compare artifacts.
    """
    y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
    factor = random.uniform(0.8, 1.2)

    resampled = librosa.core.resample(y, s_r, s_r * factor)
    soundfile.write("test1_1.wav", resampled, s_r)

    stretched = librosa.effects.time_stretch(y, factor)
    soundfile.write("test1_2.wav", stretched, s_r)
|
||||
|
||||
|
||||
def _demo_noise(src, dst, scale_first=True):
    """Load *src*, min-max scale and add random noise, write the result to *dst*.

    scale_first=False applies the noise before scaling — kept to mirror the
    original handling of test1.wav, which used the opposite order from the
    other test files.
    """
    y, s_r = librosa.load(src, sr=sample_rate, res_type='polyphase')
    if scale_first:
        y = preprocessing.minmax_scale(y, axis=0)
        y = addnoise(y)
    else:
        y = addnoise(y)
        y = preprocessing.minmax_scale(y, axis=0)
    soundfile.write(dst, y, s_r)


def sinaq():
    """Manual check: write noise-augmented copies of the sample wav files.

    Produces testN_1.wav next to each testN.wav for listening. Previously
    this was five copy-pasted load/scale/noise/write stanzas; now one helper.
    """
    # test1 applied noise before scaling in the original code; preserved.
    _demo_noise("test1.wav", "test1_1.wav", scale_first=False)
    for name in ("test2", "test3", "test4", "test6"):
        _demo_noise(name + ".wav", name + "_1.wav")
|
||||
|
||||
if __name__ == "__main__":
    # Ad-hoc manual experiments: run this module directly to regenerate the
    # noise-augmented test wavs for listening checks.
    #import matplotlib.pyplot as plt
    #import librosa.display

    #random_speed()
    sinaq()
    #y, s_r = librosa.load("test6.wav", sr=sample_rate, res_type='polyphase')
    #soundfile.write("test6_1.wav",addnoise(y), s_r)
    #soundfile.write("test6_2.wav",addnoise(y), s_r)
    #soundfile.write("test6_3.wav",addnoise(y), s_r)
    #soundfile.write("test6_4.wav",addnoise(y), s_r)
    #soundfile.write("test6_5.wav",addnoise(y), s_r)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user