agnlash/uyghur.py
2021-06-14 16:56:51 +09:00

67 lines
1.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
class Uyghur:
    """Character-level codec between Uyghur Latin-script text and index sequences.

    The vocabulary is the three special tokens ``<pad>``, ``<sos>`` and
    ``<eos>`` followed by every character of ``self.uyghur_latin`` (which
    includes the space character), in that order.
    """

    # Matches any run of whitespace. Raw string: a plain '\s' literal is an
    # invalid escape sequence and triggers a SyntaxWarning on recent Pythons.
    _WHITESPACE = re.compile(r"\s+")

    # Punctuation that separates words becomes a space; the ASCII apostrophe
    # is removed outright (the vocabulary has its own apostrophe character).
    _PUNCT_TABLE = str.maketrans(
        {"-": " ", ",": " ", ".": " ", "!": " ", "?": " ", "'": None}
    )

    def __init__(self):
        """Build the vocabulary list and the reverse token-to-index map."""
        self.uyghur_latin = "abcdefghijklmnopqrstuvwxyz éöü’"
        # The padding token comes first so that its index is 0.
        self._vocab_list = [self.pad_char, self.sos_char, self.eos_char] + list(
            self.uyghur_latin
        )
        self._vocab2idx = {v: idx for idx, v in enumerate(self._vocab_list)}

    def encode(self, s: str):
        """Convert text into a list of vocabulary indices.

        The text is lower-cased, ``- , . ! ?`` are treated as word
        separators, ASCII apostrophes are removed, and characters outside
        the vocabulary are dropped. No ``<sos>``/``<eos>`` index is added.

        Args:
            s: Text to encode.

        Returns:
            A list of integer indices, one per kept character. Words are
            separated by exactly one space index and there is no leading or
            trailing space index.
        """
        text = s.translate(self._PUNCT_TABLE).lower()
        # Turn tabs/newlines into plain spaces so they survive the filter.
        text = self._WHITESPACE.sub(" ", text)
        # Drop out-of-vocabulary characters *before* the final whitespace
        # normalisation; otherwise removing e.g. the digit in "a 1 b" would
        # leave two consecutive spaces in the encoded sequence.
        kept = "".join(ch for ch in text if ch in self._vocab2idx)
        kept = self._WHITESPACE.sub(" ", kept).strip()
        return [self._vocab2idx[ch] for ch in kept]

    def decode(self, seq) -> str:
        """Convert a sequence of vocabulary indices back into text.

        Decoding stops at the first ``<pad>`` or ``<eos>`` index, and
        ``<sos>`` indices are skipped.

        Args:
            seq: Iterable of vocabulary indices.

        Returns:
            The decoded string with whitespace runs collapsed to a single
            space and leading/trailing whitespace removed.

        Raises:
            IndexError: If an index lies outside the vocabulary.
        """
        # Resolve the special indices once instead of on every iteration.
        pad, sos, eos = self.pad_idx, self.sos_idx, self.eos_idx
        chars = []
        for idx in seq:
            if idx == pad or idx == eos:
                break
            if idx == sos:
                continue
            chars.append(self.idx_to_vocab(idx))
        return self._WHITESPACE.sub(" ", "".join(chars)).strip()

    def vocab_to_idx(self, vocab: str) -> int:
        """Return the index of token ``vocab``; raises ``KeyError`` if unknown."""
        return self._vocab2idx[vocab]

    def idx_to_vocab(self, idx) -> str:
        """Return the token stored at position ``idx`` of the vocabulary."""
        return self._vocab_list[idx]

    def vocab_list(self):
        """Return the vocabulary list (the internal list, not a copy)."""
        return self._vocab_list

    @property
    def vocab_size(self) -> int:
        """Number of vocabulary entries, special tokens included."""
        return len(self._vocab_list)

    @property
    def pad_idx(self) -> int:
        """Index of the padding token."""
        return self.vocab_to_idx(self.pad_char)

    @property
    def sos_idx(self) -> int:
        """Index of the start-of-sequence token."""
        return self.vocab_to_idx(self.sos_char)

    @property
    def eos_idx(self) -> int:
        """Index of the end-of-sequence token."""
        return self.vocab_to_idx(self.eos_char)

    @property
    def pad_char(self) -> str:
        """Padding token."""
        return "<pad>"

    @property
    def sos_char(self) -> str:
        """Start-of-sequence token."""
        return "<sos>"

    @property
    def eos_char(self) -> str:
        """End-of-sequence token."""
        return "<eos>"
# Module-level Uyghur codec instance, created once when this module is imported.
uyghur_latin = Uyghur()