add: transform functions for Unicode/bytes in Qwen tokenizer
This commit is contained in:
parent
e8adbab368
commit
81fe88ade7
21
lib/nlp/tokenize/bytesToUnicode.ts
Normal file
21
lib/nlp/tokenize/bytesToUnicode.ts
Normal file
@ -0,0 +1,21 @@
|
||||
/**
 * Builds a lookup table from every byte value (0–255) to a single
 * character.
 *
 * Bytes in the ranges `!`–`~` (33–126), `¡`–`¬` (161–172) and `®`–`ÿ`
 * (174–255) map to the character with the same code point. Every other byte
 * (0–32, 127–160 and 173) is assigned, in ascending byte order, the next
 * code point starting at 256, so all 256 bytes get a distinct character.
 *
 * @returns An object keyed by byte value whose values are one-character
 *   strings.
 */
export default function bytesToUnicode(): { [key: number]: string } {
  // Inclusive [first, last] byte ranges that keep their own code point.
  const selfMappedRanges: ReadonlyArray<readonly [number, number]> = [
    [33, 126], // "!" .. "~"
    [161, 172], // "¡" .. "¬"
    [174, 255], // "®" .. "ÿ"
  ];
  const isSelfMapped = (byte: number): boolean =>
    selfMappedRanges.some(([first, last]) => byte >= first && byte <= last);

  const table: { [key: number]: string } = {};
  // Number of bytes outside the ranges seen so far; the k-th such byte
  // (counting from 0) is assigned code point 256 + k.
  let remappedCount = 0;
  for (let byte = 0; byte < 256; byte++) {
    let codePoint = byte;
    if (!isSelfMapped(byte)) {
      codePoint = 256 + remappedCount;
      remappedCount++;
    }
    table[byte] = String.fromCharCode(codePoint);
  }
  return table;
}
|
28
lib/nlp/tokenize/unicodeToBytes.ts
Normal file
28
lib/nlp/tokenize/unicodeToBytes.ts
Normal file
@ -0,0 +1,28 @@
|
||||
/**
 * Builds a lookup table from a single character back to the byte value
 * (0–255) it stands for.
 *
 * The character assigned to each byte is chosen as follows: bytes in the
 * ranges `!`–`~` (33–126), `¡`–`¬` (161–172) and `®`–`ÿ` (174–255) use the
 * character with the same code point; every other byte (0–32, 127–160 and
 * 173) uses, in ascending byte order, the next code point starting at 256.
 * The returned object is keyed by those characters.
 *
 * @returns An object keyed by one-character strings whose values are the
 *   corresponding byte values. Characters outside the table are absent.
 */
export default function unicodeToBytes(): { [key: string]: number } {
  // Inclusive [first, last] byte ranges that keep their own code point.
  const selfMappedRanges: ReadonlyArray<readonly [number, number]> = [
    [33, 126], // "!" .. "~"
    [161, 172], // "¡" .. "¬"
    [174, 255], // "®" .. "ÿ"
  ];
  const isSelfMapped = (byte: number): boolean =>
    selfMappedRanges.some(([first, last]) => byte >= first && byte <= last);

  const reversedMapping: { [key: string]: number } = {};
  // Number of bytes outside the ranges seen so far; the k-th such byte
  // (counting from 0) is represented by code point 256 + k.
  let remappedCount = 0;
  for (let byte = 0; byte < 256; byte++) {
    let codePoint = byte;
    if (!isSelfMapped(byte)) {
      codePoint = 256 + remappedCount;
      remappedCount++;
    }
    // Fill the reverse direction directly instead of building the forward
    // table and flipping it afterwards.
    reversedMapping[String.fromCharCode(codePoint)] = byte;
  }
  return reversedMapping;
}
|
7
test/bytesToUnicode.test.ts
Normal file
7
test/bytesToUnicode.test.ts
Normal file
@ -0,0 +1,7 @@
|
||||
import { expect, test } from "bun:test";
|
||||
import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";
|
||||
|
||||
// Checks representative entries of the byte → character table: bytes that
// keep their own code point and bytes that are moved to 256 and above.
test("bytesToUnicode: test", () => {
  const byteToUnicodeMap = bytesToUnicode();

  // One entry per byte value.
  expect(Object.keys(byteToUnicodeMap)).toHaveLength(256);

  // Bytes inside the self-mapped ranges map to the same code point.
  expect(byteToUnicodeMap[206]).toEqual("Î");
  expect(byteToUnicodeMap[33]).toEqual("!");

  // Bytes outside those ranges are assigned code points from 256 upward,
  // in ascending byte order: 0 → U+0100, 10 → U+010A, 32 → U+0120.
  expect(byteToUnicodeMap[0]).toEqual("\u0100");
  expect(byteToUnicodeMap[10]).toEqual("\u010a");
  expect(byteToUnicodeMap[32]).toEqual("\u0120");
});
|
17
test/unicodeToBytes.test.ts
Normal file
17
test/unicodeToBytes.test.ts
Normal file
@ -0,0 +1,17 @@
|
||||
import { expect, test } from "bun:test";
|
||||
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";
|
||||
|
||||
// Converts a byte-level string back to raw bytes through the table and
// checks that decoding those bytes as UTF-8 yields the expected text.
test("unicodeToBytes: test", () => {
  const unicodeToByteMap = unicodeToBytes();

  // Byte-level spelling of "天气": one character per UTF-8 byte
  // (E5 A4 A9 E6 B0 94), i.e. "å¤©æ°Ķ". Written with escapes so the
  // literal cannot be silently re-encoded.
  const stringToConvert = "\u00e5\u00a4\u00a9\u00e6\u00b0\u0136";

  const byteNumArray: number[] = [];
  for (const chr of stringToConvert) {
    const byteNumber = unicodeToByteMap[chr];
    // A missing entry would otherwise be coerced to byte 0 by Uint8Array.
    expect(byteNumber).toBeDefined();
    byteNumArray.push(byteNumber);
  }

  const byteArray = new Uint8Array(byteNumArray);
  const decoder = new TextDecoder("utf-8");
  const utf8String = decoder.decode(byteArray);
  expect(utf8String).toEqual("天气");
});
|
Loading…
Reference in New Issue
Block a user