update: bytes <-> unicode for qwen tokenizer

2024-10-07 01:41:09 +08:00 · 2024-10-07 01:41:09 +08:00 · 76c4472de1
commit 76c4472de1
parent 81fe88ade7
4 changed files with 78 additions and 56 deletions
--- a/lib/nlp/tokenize/bytesToUnicode.ts
+++ b/lib/nlp/tokenize/bytesToUnicode.ts
@ -1,21 +1,37 @@
-export default function bytesToUnicode(): { [key: number]: string } {
-    const bs: number[] = [
-        ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
-        ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
-        ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i)  // range(ord("®"), ord("ÿ") + 1)
-    ];
+/**
+ * Builds the table that assigns one character to every byte value 0–255.
+ *
+ * Bytes in the ranges 33–126, 161–172 and 174–255 map to the character with
+ * the same code point. The remaining 68 bytes (0–32, 127–160 and 173) are
+ * assigned code points 256, 257, ... in ascending byte order.
+ *
+ * NOTE(review): the inline `range(ord(...))` comments suggest this is a port
+ * of a Python byte-level BPE helper — confirm against the reference tokenizer.
+ *
+ * @returns An object keyed by byte value whose values are single-character strings.
+ */
+function byteToUnicode(): { [key: number]: string } {
+	// Byte values that keep their own code point.
+	const bs: number[] = [
+		...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
+		...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
+		...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1)
+	];

-    const cs: number[] = [...bs];
-    let n = 0;
+	// cs[i] is the code point assigned to byte bs[i]; it starts as a copy of bs,
+	// so the ranges above map to themselves.
+	const cs: number[] = [...bs];
+	// Number of bytes outside those ranges that have been remapped so far.
+	let n = 0;

-    for (let b = 0; b < 256; b++) {
-        if (!bs.includes(b)) {
-            bs.push(b);
-            cs.push(256 + n);
-            n++;
-        }
-    }
+	// Append every byte not listed yet and give it the next code point from 256 upwards.
+	for (let b = 0; b < 256; b++) {
+		if (!bs.includes(b)) {
+			bs.push(b);
+			cs.push(256 + n);
+			n++;
+		}
+	}

-    const csChars: string[] = cs.map(n => String.fromCharCode(n));
-    return Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
+	// Turn the code points into characters and pair them with their bytes.
+	const csChars: string[] = cs.map((n) => String.fromCharCode(n));
+	return Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
+}
+
+// Built on first use and then reused: the table is identical on every call,
+// so there is no reason to recompute it per string.
+let cachedByteToUnicodeMap: { [key: number]: string } | undefined;
+
+/**
+ * Encodes a string as UTF-8 and replaces every resulting byte with the
+ * character assigned to it by `byteToUnicode()`.
+ *
+ * @param str - Text to convert; an empty string yields an empty string.
+ * @returns A string with exactly one character per UTF-8 byte of `str`.
+ */
+export default function bytesToUnicode(str: string): string {
+	if (cachedByteToUnicodeMap === undefined) {
+		cachedByteToUnicodeMap = byteToUnicode();
+	}
+	const byteToUnicodeMap = cachedByteToUnicodeMap;
+
+	// UTF-8 bytes of the input.
+	const utf8Bytes = new TextEncoder().encode(str);
+
+	// Map each byte to its stand-in character and concatenate the results.
+	return Array.from(utf8Bytes, (byte) => byteToUnicodeMap[byte]).join("");
 }
--- a/lib/nlp/tokenize/unicodeToBytes.ts
+++ b/lib/nlp/tokenize/unicodeToBytes.ts
@ -1,28 +1,46 @@
-export default function unicodeToBytes(): { [key: string]: number } {
-    const bs: number[] = [
-        ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
-        ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
-        ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i)  // range(ord("®"), ord("ÿ") + 1)
-    ];
+/**
+ * Builds the inverse of the byte-to-character table: a lookup from each
+ * stand-in character back to the byte value (0–255) it represents.
+ *
+ * The forward table is constructed first: bytes 33–126, 161–172 and 174–255
+ * map to the character with the same code point, and the remaining bytes are
+ * assigned code points 256, 257, ... in ascending byte order. The table is
+ * then inverted.
+ *
+ * @returns An object keyed by single-character string whose values are byte values.
+ */
+function unicodeToByte(): { [key: string]: number } {
+	// Byte values that keep their own code point.
+	const bs: number[] = [
+		...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
+		...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
+		...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1)
+	];

-    const cs: number[] = [...bs];
-    let n = 0;
+	// cs[i] is the code point assigned to byte bs[i]; it starts as a copy of bs.
+	const cs: number[] = [...bs];
+	// Number of bytes outside the ranges above that have been remapped so far.
+	let n = 0;

-    for (let b = 0; b < 256; b++) {
-        if (!bs.includes(b)) {
-            bs.push(b);
-            cs.push(256 + n);
-            n++;
-        }
-    }
+	// Append every byte not listed yet and give it the next code point from 256 upwards.
+	for (let b = 0; b < 256; b++) {
+		if (!bs.includes(b)) {
+			bs.push(b);
+			cs.push(256 + n);
+			n++;
+		}
+	}

-    const csChars: string[] = cs.map(n => String.fromCharCode(n));
-    const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
+	// Forward table: byte value -> stand-in character.
+	const csChars: string[] = cs.map((n) => String.fromCharCode(n));
+	const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));

-    const reversedMapping: { [key: string]: number } = {};
-    for (const [key, value] of Object.entries(originalMapping)) {
-        reversedMapping[value] = Number(key);
-    }
+	// Invert it. Object.entries yields the byte keys as strings, hence Number(key).
+	const reversedMapping: { [key: string]: number } = {};
+	for (const [key, value] of Object.entries(originalMapping)) {
+		reversedMapping[value] = Number(key);
+	}

-    return reversedMapping;
+	return reversedMapping;
+}
+
+// Built on first use and then reused: the table is identical on every call,
+// so there is no reason to recompute it per string.
+let cachedUnicodeToByteMap: { [key: string]: number } | undefined;
+
+/**
+ * Converts a string of stand-in characters back into the text whose UTF-8
+ * bytes they represent.
+ *
+ * Each UTF-16 code unit of `str` is looked up in the table returned by
+ * `unicodeToByte()`; the recovered bytes are then decoded as UTF-8 with a
+ * default `TextDecoder`.
+ *
+ * @param str - String consisting only of characters that are keys of the table;
+ *              an empty string yields an empty string.
+ * @returns The decoded text.
+ * @throws Error if `str` contains a character with no entry in the table.
+ *         Without this check the missing lookup would be coerced to byte 0
+ *         and surface as a silent NUL character in the output.
+ */
+export default function unicodeToBytes(str: string): string {
+	if (cachedUnicodeToByteMap === undefined) {
+		cachedUnicodeToByteMap = unicodeToByte();
+	}
+	const unicodeToByteMap = cachedUnicodeToByteMap;
+
+	// One byte per UTF-16 code unit, matching the original `str.split("")`.
+	const byteArray = new Uint8Array(str.length);
+
+	for (let i = 0; i < str.length; i++) {
+		const char = str.charAt(i);
+		const byte = unicodeToByteMap[char];
+		if (byte === undefined) {
+			throw new Error(
+				`unicodeToBytes: no byte mapping for character ${JSON.stringify(char)} at index ${i}`
+			);
+		}
+		byteArray[i] = byte;
+	}
+
+	return new TextDecoder().decode(byteArray);
 }
--- a/test/bytesToUnicode.test.ts
+++ b/test/bytesToUnicode.test.ts
@ -1,7 +1,6 @@
 import { expect, test } from "bun:test";
-import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";
+import bytesToUnicodes from "../lib/nlp/tokenize/bytesToUnicode";

 test("bytesToUnicode: test", () => {
-	const byteToUnicodeMap = bytesToUnicode();
-	expect(byteToUnicodeMap["206"]).toEqual("Î");
+	// ASCII letters map to themselves; the space byte 0x20 becomes "Ġ" (U+0120);
+	// the UTF-8 bytes E4 BD A0 / E5 A5 BD of 你 / 好 become "ä½ł" / "å¥½".
+	expect(bytesToUnicodes("Hello 你好")).toEqual("HelloĠä½łå¥½");
 });
--- a/test/unicodeToBytes.test.ts
+++ b/test/unicodeToBytes.test.ts
@ -2,16 +2,5 @@ import { expect, test } from "bun:test";
 import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";

 test("unicodeToBytes: test", () => {
-	const byteToUnicodeMap = unicodeToBytes();
-	const byteNumArray: number[] = [];
-	const stringToConvert = "å¤©æ°Ķ";
-	for (let i = 0; i < stringToConvert.length; i++) {
-        const chr = stringToConvert[i];
-        const byteNumber = byteToUnicodeMap[chr];
-        byteNumArray.push(byteNumber);
-    }
-    const byteArray = new Uint8Array(byteNumArray);
-	const decoder = new TextDecoder('utf-8');
-    const utf8String = decoder.decode(byteArray);
-    expect(utf8String).toEqual("天气")
+	// Inverse direction: "Ġ" (U+0120) maps back to the space byte 0x20, and
+	// "ä½ł" / "å¥½" map back to bytes E4 BD A0 / E5 A5 BD, which decode as 你 / 好.
+	expect(unicodeToBytes("HelloĠä½łå¥½")).toEqual("Hello 你好");
 });