update: bytes <-> unicode for qwen tokenizer

This commit is contained in:
alikia2x (寒寒) 2024-10-07 01:41:09 +08:00
parent 81fe88ade7
commit 76c4472de1
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
4 changed files with 78 additions and 56 deletions

View File

@ -1,4 +1,4 @@
export default function bytesToUnicode(): { [key: number]: string } { function byteToUnicode(): { [key: number]: string } {
const bs: number[] = [ const bs: number[] = [
...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
@ -16,6 +16,22 @@ export default function bytesToUnicode(): { [key: number]: string } {
} }
} }
const csChars: string[] = cs.map(n => String.fromCharCode(n)); const csChars: string[] = cs.map((n) => String.fromCharCode(n));
return Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); return Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
} }
/**
 * Lazily initialised copy of the table returned by `byteToUnicode()`.
 * Built on first use so repeated conversions do not rebuild it.
 */
let byteToUnicodeCache: { [key: number]: string } | undefined;

/**
 * Re-encodes a string byte by byte: the input is UTF-8 encoded and each
 * resulting byte is replaced by the character that `byteToUnicode()` assigns
 * to that byte value.
 *
 * @param str - Text to convert.
 * @returns The concatenation of the mapped characters, one per UTF-8 byte of
 * `str`. An empty input yields an empty string.
 */
export default function bytesToUnicode(str: string): string {
    // The table does not depend on the input, so build it once and reuse it.
    if (byteToUnicodeCache === undefined) {
        byteToUnicodeCache = byteToUnicode();
    }
    const table = byteToUnicodeCache;

    // TextEncoder always produces UTF-8, so every element is a value in 0..255.
    const utf8Bytes = new TextEncoder().encode(str);

    // NOTE(review): assumes the table has an entry for every byte value;
    // a missing entry would contribute an empty string to the result.
    return Array.from(utf8Bytes, (byte) => table[byte]).join("");
}

View File

@ -1,4 +1,4 @@
export default function unicodeToBytes(): { [key: string]: number } { function unicodeToByte(): { [key: string]: number } {
const bs: number[] = [ const bs: number[] = [
...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
@ -16,7 +16,7 @@ export default function unicodeToBytes(): { [key: string]: number } {
} }
} }
const csChars: string[] = cs.map(n => String.fromCharCode(n)); const csChars: string[] = cs.map((n) => String.fromCharCode(n));
const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
const reversedMapping: { [key: string]: number } = {}; const reversedMapping: { [key: string]: number } = {};
@ -26,3 +26,21 @@ export default function unicodeToBytes(): { [key: string]: number } {
return reversedMapping; return reversedMapping;
} }
/**
 * Lazily initialised copy of the table returned by `unicodeToByte()`.
 * Built on first use so repeated conversions do not rebuild it.
 */
let unicodeToByteCache: { [key: string]: number } | undefined;

/**
 * Converts a string of byte-placeholder characters back into text: each
 * character is looked up in the `unicodeToByte()` table to recover a byte
 * value, and the resulting byte sequence is decoded as UTF-8.
 *
 * @param str - String whose characters are all keys of the `unicodeToByte()` table.
 * @returns The UTF-8 decoding of the recovered bytes. The decoder is not in
 * fatal mode, so malformed byte sequences decode to U+FFFD rather than throwing.
 * @throws RangeError if `str` contains a character that has no entry in the
 * table. Previously such characters were silently turned into 0 bytes.
 */
export default function unicodeToBytes(str: string): string {
    // The table does not depend on the input, so build it once and reuse it.
    if (unicodeToByteCache === undefined) {
        unicodeToByteCache = unicodeToByte();
    }
    const table = unicodeToByteCache;

    // str.length (UTF-16 code units) is an upper bound on the number of
    // code points iterated below, so the buffer is always large enough.
    const bytes = new Uint8Array(str.length);
    let count = 0;
    for (const char of str) {
        const byte = table[char];
        if (byte === undefined) {
            const codePoint = (char.codePointAt(0) ?? 0).toString(16).toUpperCase().padStart(4, "0");
            throw new RangeError(`unicodeToBytes: character "${char}" (U+${codePoint}) has no byte mapping`);
        }
        bytes[count] = byte;
        count += 1;
    }

    return new TextDecoder().decode(bytes.subarray(0, count));
}

View File

@ -1,7 +1,6 @@
import { expect, test } from "bun:test"; import { expect, test } from "bun:test";
import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode"; import bytesToUnicodes from "../lib/nlp/tokenize/bytesToUnicode";
test("bytesToUnicode: test", () => { test("bytesToUnicode: test", () => {
const byteToUnicodeMap = bytesToUnicode(); expect(bytesToUnicodes("Hello 你好")).toEqual("HelloĠä½łå¥½");
expect(byteToUnicodeMap["206"]).toEqual("Î");
}); });

View File

@ -2,16 +2,5 @@ import { expect, test } from "bun:test";
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes"; import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";
test("unicodeToBytes: test", () => { test("unicodeToBytes: test", () => {
const byteToUnicodeMap = unicodeToBytes(); expect(unicodeToBytes("HelloĠä½łå¥½")).toEqual("Hello 你好");
const byteNumArray: number[] = [];
const stringToConvert = "天æ°Ķ";
for (let i = 0; i < stringToConvert.length; i++) {
const chr = stringToConvert[i];
const byteNumber = byteToUnicodeMap[chr];
byteNumArray.push(byteNumber);
}
const byteArray = new Uint8Array(byteNumArray);
const decoder = new TextDecoder('utf-8');
const utf8String = decoder.decode(byteArray);
expect(utf8String).toEqual("天气")
}); });