# agnlash/data.py
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import librosa
import soundfile
from sklearn import preprocessing
import random
from uyghur import uyghur_latin
featurelen = 128   # number of mel bands per feature frame
sample_rate = 22050
fft_len = 1024
window_len = fft_len
window = "hann"

# 15-second noise clips mixed into training audio by addnoise()
white_noise, _ = librosa.load('white.wav', sr=sample_rate, duration=15.0)
perlin_noise, _ = librosa.load('perlin.wav', sr=sample_rate, duration=15.0)
cafe_noise, _ = librosa.load('cafe.wav', sr=sample_rate, duration=15.0)
radio_noise, _ = librosa.load('radionoise.wav', sr=sample_rate, duration=15.0)

def addnoise(audio):
    """Mix one of four noise types into the clip, chosen at random."""
    rnd = random.random()
    if len(audio) > len(white_noise):
        # Clip is longer than the 15 s noise samples; skip noise injection.
        pass
    elif rnd < 0.25:
        audio = audio + white_noise[:len(audio)]
    elif rnd < 0.50:
        audio = audio + perlin_noise[:audio.shape[0]]
    elif rnd < 0.75:
        audio = audio + radio_noise[:audio.shape[0]]
    else:
        audio = audio + cafe_noise[:audio.shape[0]]
    return audio

def randomstretch(audio):
    """Randomly change speed by resampling to 0.8-1.2x the original rate."""
    factor = random.uniform(0.8, 1.2)
    # The result is still treated as sample_rate downstream, so this changes
    # both speed and pitch.
    audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=sample_rate * factor)
    return audio

def spec_augment(feat, T=50, F=13, time_mask_num=1, freq_mask_num=1):
    """SpecAugment: zero out random time and/or frequency stripes in-place."""
    # def spec_augment(feat, T=70, F=15, time_mask_num=1, freq_mask_num=1):
    rnd = random.random()
    feat_size = feat.size(0)
    seq_len = feat.size(1)
    if rnd < 0.33:
        # time mask only (clamp mask width so short clips don't raise)
        for _ in range(time_mask_num):
            t = random.randint(0, min(T, seq_len))
            t0 = random.randint(0, seq_len - t)
            feat[:, t0:t0 + t] = 0
    elif rnd < 0.66:
        # frequency mask only
        for _ in range(freq_mask_num):
            f = random.randint(0, min(F, feat_size))
            f0 = random.randint(0, feat_size - f)
            feat[f0:f0 + f, :] = 0
    else:
        # both time and frequency masks
        for _ in range(time_mask_num):
            t = random.randint(0, min(T, seq_len))
            t0 = random.randint(0, seq_len - t)
            feat[:, t0:t0 + t] = 0
        for _ in range(freq_mask_num):
            f = random.randint(0, min(F, feat_size))
            f0 = random.randint(0, feat_size - f)
            feat[f0:f0 + f, :] = 0
    return feat
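
# A minimal sanity sketch (not part of the training pipeline): spec_augment
# masks stripes in-place, so some cells of a toy random tensor become zero.
def _example_spec_augment():
    feat = torch.randn(featurelen, 300)
    masked = spec_augment(feat.clone())
    print('masked cells:', int((masked == 0).sum().item()))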

def melfuture(wav_path, augument=False):
    """Load a wav file and return a normalized log-mel spectrogram tensor."""
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
    if augument:
        if random.random() < 0.5:
            audio = randomstretch(audio)
        if random.random() < 0.5:
            audio = addnoise(audio)
    audio = preprocessing.minmax_scale(audio, axis=0)
    audio = librosa.effects.preemphasis(audio)
    hop_len = 200
    if augument and random.random() < 0.5:
        # Jitter the hop length as a mild time-resolution augmentation.
        hop_len = random.randint(160, 240)
    spec = librosa.feature.melspectrogram(y=audio, sr=s_r, n_fft=fft_len, hop_length=hop_len, n_mels=featurelen, fmax=8000)
    spec = librosa.power_to_db(spec)
    spec = (spec - spec.mean()) / spec.std()
    spec = torch.FloatTensor(spec)
    if augument and random.random() < 0.5:
        spec = spec_augment(spec)
    return spec

def rawfuture(wav_path, augument=False):
    """Load a wav file and return the normalized raw waveform as a (1, N) tensor."""
    audio, s_r = librosa.load(wav_path, sr=sample_rate, res_type='polyphase')
    audio = preprocessing.minmax_scale(audio, axis=0)
    if augument:
        if random.random() < 0.5:
            audio = addnoise(audio)
        if random.random() < 0.5:
            audio = randomstretch(audio)
    audio = librosa.effects.preemphasis(audio)
    spec = torch.FloatTensor(audio)
    spec.unsqueeze_(0)
    spec = (spec - spec.mean()) / spec.std()
    return spec
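
# A minimal shape sketch, assuming the local sample file "test1.wav" that the
# test helpers below also rely on: melfuture yields (featurelen, frames) and
# rawfuture yields (1, samples).
def _example_feature_shapes():
    mel = melfuture("test1.wav")
    raw = rawfuture("test1.wav")
    print(mel.shape, raw.shape)  # e.g. torch.Size([128, F]) and torch.Size([1, N])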

class SpeechDataset(Dataset):
    def __init__(self, index_path, augumentation=False):
        self.Raw = False
        with open(index_path, encoding='utf_8_sig') as f:
            lines = f.readlines()
        self.idx = []
        for x in lines:
            # Each index line is "wav_path<TAB>transcript".
            item = x.strip().split("\t")
            char_indx = uyghur_latin.encode(item[1])
            self.idx.append([item[0], char_indx])
        self.augument = augumentation

    def __getitem__(self, index):
        wav_path, char_index = self.idx[index]
        if self.Raw:
            x = rawfuture(wav_path, self.augument)
        else:
            x = melfuture(wav_path, self.augument)
        return x, char_index, wav_path

    def __len__(self):
        return len(self.idx)

def _collate_fn(batch):
    # Sort by descending input length so the longest sample defines the pad width.
    batch = sorted(batch, key=lambda sample: sample[0].size(1), reverse=True)
    input_lens = [sample[0].size(1) for sample in batch]
    target_lens = [len(sample[1]) for sample in batch]
    inputs = torch.zeros(len(batch), batch[0][0].size(0), max(input_lens), dtype=torch.float32)
    targets = torch.full((len(batch), max(target_lens)), uyghur_latin.pad_idx, dtype=torch.long)
    target_lens = torch.IntTensor(target_lens)
    input_lens = torch.IntTensor(input_lens)
    paths = []
    for x, sample in enumerate(batch):
        tensor, target = sample[0], sample[1]
        seq_length = tensor.size(1)
        inputs[x].narrow(1, 0, seq_length).copy_(tensor)
        targets[x][:len(target)] = torch.LongTensor(target)
        paths.append(sample[2])
    return inputs, targets, input_lens, target_lens, paths

class SpeechDataLoader(DataLoader):
    def __init__(self, *args, **kwargs):
        """Creates a data loader for SpeechDataset with length-padding collation."""
        super(SpeechDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = _collate_fn
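
# A minimal usage sketch, assuming a hypothetical tab-separated index file
# "train.tsv" with lines of the form "path/to.wav<TAB>transcript":
def _example_loader():
    dataset = SpeechDataset("train.tsv", augumentation=True)
    loader = SpeechDataLoader(dataset, batch_size=8, shuffle=True)
    inputs, targets, input_lens, target_lens, paths = next(iter(loader))
    print(inputs.shape, targets.shape)  # (8, featurelen, max_frames), (8, max_target_len)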

# The following code is from: http://hetland.org/coding/python/levenshtein.py
def levenshtein(a, b):
    """Calculates the Levenshtein distance between a and b."""
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n, m)) space
        a, b = b, a
        n, m = m, n
    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)
    return current[n]
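
# Sanity note: levenshtein('kitten', 'sitting') == 3
# (two substitutions and one insertion), the classic textbook pair.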

def wer(s1, src):
    """Word-level edit distance and reference word count."""
    sw = src.split()
    return levenshtein(s1.split(), sw), len(sw)

def cer(s1, src):
    """Character-level edit distance and reference character count."""
    return levenshtein(s1, src), len(src)

def cer_wer(preds, targets):
    err_c, lettercnt, err_w, wordcnt = 0, 0, 0, 0
    for pred, target in zip(preds, targets):
        c_er, c_cnt = cer(pred, target)
        w_er, w_cnt = wer(pred, target)
        err_c += c_er
        lettercnt += c_cnt
        wordcnt += w_cnt
        err_w += w_er
    return err_c, lettercnt, err_w, wordcnt
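
# A minimal sketch of the error-rate helpers on hypothetical strings; the
# aggregate rates are err_w / wordcnt (WER) and err_c / lettercnt (CER).
def _example_error_rates():
    preds = ["salam dunya"]
    targets = ["salam dunyagha"]
    err_c, lettercnt, err_w, wordcnt = cer_wer(preds, targets)
    print("CER %.2f  WER %.2f" % (err_c / lettercnt, err_w / wordcnt))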

def random_speed():
    y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
    factor = random.uniform(0.8, 1.2)
    new_sr = s_r * factor
    # Speed change via resampling (also shifts pitch)...
    new_y = librosa.resample(y, orig_sr=s_r, target_sr=new_sr)
    soundfile.write("test1_1.wav", new_y, s_r)
    # ...versus time_stretch, which changes speed but keeps pitch.
    audio = librosa.effects.time_stretch(y, rate=factor)
    soundfile.write("test1_2.wav", audio, s_r)

def sinaq():
    """Quick manual check ("sinaq" is Uyghur for "test"): write noisy copies of sample files."""
    new_y, s_r = librosa.load("test1.wav", sr=sample_rate, res_type='polyphase')
    new_y = addnoise(new_y)
    # new_y = librosa.effects.preemphasis(new_y)
    new_y = preprocessing.minmax_scale(new_y, axis=0)
    soundfile.write("test1_1.wav", new_y, s_r)
    # The remaining files are scaled first, then noise is added.
    for name in ("test2", "test3", "test4", "test6"):
        new_y, s_r = librosa.load(name + ".wav", sr=sample_rate, res_type='polyphase')
        new_y = preprocessing.minmax_scale(new_y, axis=0)
        new_y = addnoise(new_y)
        # new_y = librosa.effects.preemphasis(new_y)
        soundfile.write(name + "_1.wav", new_y, s_r)

if __name__ == "__main__":
    # import matplotlib.pyplot as plt
    # import librosa.display
    # random_speed()
    sinaq()
    # y, s_r = librosa.load("test6.wav", sr=sample_rate, res_type='polyphase')
    # soundfile.write("test6_1.wav", addnoise(y), s_r)
    # soundfile.write("test6_2.wav", addnoise(y), s_r)
    # soundfile.write("test6_3.wav", addnoise(y), s_r)
    # soundfile.write("test6_4.wav", addnoise(y), s_r)
    # soundfile.write("test6_5.wav", addnoise(y), s_r)