add: transform funcs for Unicode/Bytes in Qwen tokenizer

Author: alikia2x (寒寒)
Date: 2024-09-28 23:13:14 +08:00
Parent: e8adbab368
Commit: 81fe88ade7
Signed by: alikia2x (GPG Key ID: 56209E0CCD8420C6)
4 changed files with 73 additions and 0 deletions

@@ -0,0 +1,21 @@
export default function bytesToUnicode(): { [key: number]: string } {
    // Byte-level BPE mapping (same construction as GPT-2's bytes_to_unicode):
    // map every byte 0-255 to a printable character so tokens never contain raw control bytes.
    const bs: number[] = [
        ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
        ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
        ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1)
    ];
    const cs: number[] = [...bs];
    let n = 0;
    // Bytes outside the printable ranges are assigned to code points 256, 257, ...
    for (let b = 0; b < 256; b++) {
        if (!bs.includes(b)) {
            bs.push(b);
            cs.push(256 + n);
            n++;
        }
    }
    const csChars: string[] = cs.map((c) => String.fromCharCode(c));
    return Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
}
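For context, a minimal sketch of how this map is typically applied in a byte-level BPE pipeline: every UTF-8 byte of the input text is replaced by its printable stand-in character. The helper below is not part of the commit and its name textToByteLevel is illustrative.

// Sketch only (not in the commit): encode text into its byte-level representation.
import bytesToUnicode from "./bytesToUnicode";

function textToByteLevel(text: string): string {
    const map = bytesToUnicode();
    const utf8Bytes = new TextEncoder().encode(text); // raw UTF-8 bytes of the input
    return Array.from(utf8Bytes)
        .map((b) => map[b])
        .join("");
}

// e.g. textToByteLevel("天气") === "å¤©æ°Ķ"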

@@ -0,0 +1,28 @@
// Inverse of bytesToUnicode: map each printable stand-in character back to its byte value.
export default function unicodeToBytes(): { [key: string]: number } {
    const bs: number[] = [
        ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
        ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
        ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1)
    ];
    const cs: number[] = [...bs];
    let n = 0;
    for (let b = 0; b < 256; b++) {
        if (!bs.includes(b)) {
            bs.push(b);
            cs.push(256 + n);
            n++;
        }
    }
    const csChars: string[] = cs.map((c) => String.fromCharCode(c));
    const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
    // Flip byte -> char into char -> byte.
    const reversedMapping: { [key: string]: number } = {};
    for (const [key, value] of Object.entries(originalMapping)) {
        reversedMapping[value] = Number(key);
    }
    return reversedMapping;
}
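Since unicodeToBytes is the exact inverse of bytesToUnicode, the reverse map could also be derived from the forward map instead of duplicating the table construction. A sketch of that alternative, not part of the commit and with an illustrative name:

// Sketch only (not in the commit): derive the reverse map by inverting bytesToUnicode.
import bytesToUnicode from "./bytesToUnicode";

function invertedByteMap(): { [key: string]: number } {
    const reversed: { [key: string]: number } = {};
    for (const [byteValue, ch] of Object.entries(bytesToUnicode())) {
        reversed[ch] = Number(byteValue);
    }
    return reversed;
}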

@@ -0,0 +1,7 @@
import { expect, test } from "bun:test";
import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";

test("bytesToUnicode: test", () => {
    const byteToUnicodeMap = bytesToUnicode();
    // Byte 206 (0xCE) falls in the printable Latin-1 range, so it maps to itself: "Î".
    expect(byteToUnicodeMap[206]).toEqual("Î");
});

@@ -0,0 +1,17 @@
import { expect, test } from "bun:test";
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";

test("unicodeToBytes: test", () => {
    const unicodeToByteMap = unicodeToBytes();
    const byteNumArray: number[] = [];
    // "å¤©æ°Ķ" is the byte-level form of "天气": its UTF-8 bytes (E5 A4 A9 E6 B0 94)
    // rendered through the bytesToUnicode mapping.
    const stringToConvert = "å¤©æ°Ķ";
    for (let i = 0; i < stringToConvert.length; i++) {
        const chr = stringToConvert[i];
        const byteNumber = unicodeToByteMap[chr];
        byteNumArray.push(byteNumber);
    }
    const byteArray = new Uint8Array(byteNumArray);
    const decoder = new TextDecoder("utf-8");
    const utf8String = decoder.decode(byteArray);
    expect(utf8String).toEqual("天气");
});
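The two tests above exercise each direction separately; a round-trip property test along the following lines could tie them together. This is only a sketch, assuming the same ../lib/nlp/tokenize import layout as the existing tests; it is not part of the commit.

// Sketch only (not in the commit): text -> byte-level string -> text should be lossless.
import { expect, test } from "bun:test";
import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";

test("byte-level round trip (sketch)", () => {
    const toChar = bytesToUnicode();
    const toByte = unicodeToBytes();
    const original = "Qwen 天气 ¡café!";
    const byteLevel = Array.from(new TextEncoder().encode(original))
        .map((b) => toChar[b])
        .join("");
    const restored = new TextDecoder("utf-8").decode(
        Uint8Array.from(Array.from(byteLevel), (ch) => toByte[ch])
    );
    expect(restored).toEqual(original);
});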