update: bytes <-> unicode for qwen tokenizer
This commit is contained in:
parent
81fe88ade7
commit
76c4472de1
@ -1,21 +1,37 @@
|
|||||||
/**
 * Builds the byte -> character table used to render arbitrary bytes as
 * visible text.
 *
 * Bytes inside the three ranges below map to the character with the same code
 * point. Every other byte (control characters, space, 0x7F-0xA0, 0xAD) is
 * assigned, in ascending byte order, the next code point starting at U+0100,
 * so each of the 256 byte values gets a distinct character.
 *
 * @returns An object keyed by byte value (0-255) whose values are
 *          single-character strings.
 */
function byteToUnicode(): { [key: number]: string } {
  const selfMapped = new Set<number>([
    ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
    ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
    ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i), // range(ord("®"), ord("ÿ") + 1)
  ]);

  const table: { [key: number]: string } = {};
  // Counts how many remapped bytes have been seen so far; the k-th remapped
  // byte (in ascending order) becomes code point 256 + k.
  let remapped = 0;
  for (let b = 0; b < 256; b++) {
    if (selfMapped.has(b)) {
      table[b] = String.fromCharCode(b);
    } else {
      table[b] = String.fromCharCode(256 + remapped);
      remapped++;
    }
  }
  return table;
}

// The table never changes, so build it (and the encoder) once instead of on
// every call.
const BYTE_TO_UNICODE_TABLE = byteToUnicode();
const UTF8_ENCODER = new TextEncoder();

/**
 * Encodes `str` as UTF-8 and replaces every resulting byte with its character
 * from the byte -> character table, e.g. `"Hello 你好"` becomes
 * `"HelloĠä½łå¥½"`.
 *
 * @param str Text to convert.
 * @returns One character per UTF-8 byte of `str`; an empty string for empty
 *          input.
 */
export default function bytesToUnicode(str: string): string {
  const utf8Bytes = UTF8_ENCODER.encode(str);
  return Array.from(utf8Bytes, (byte) => BYTE_TO_UNICODE_TABLE[byte]).join("");
}
|
||||||
|
@ -1,28 +1,46 @@
|
|||||||
/**
 * Builds the character -> byte table that inverts the byte -> character
 * mapping.
 *
 * Bytes inside the three ranges below are keyed by the character with the same
 * code point. Every other byte is keyed, in ascending byte order, by the next
 * code point starting at U+0100. All 256 keys are single UTF-16 code units.
 *
 * @returns An object keyed by single-character string whose values are byte
 *          values (0-255).
 */
function unicodeToByte(): { [key: string]: number } {
  const selfMapped = new Set<number>([
    ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
    ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
    ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i), // range(ord("®"), ord("ÿ") + 1)
  ]);

  const table: { [key: string]: number } = {};
  // Counts how many remapped bytes have been seen so far; the k-th remapped
  // byte (in ascending order) is represented by code point 256 + k.
  let remapped = 0;
  for (let b = 0; b < 256; b++) {
    const codePoint = selfMapped.has(b) ? b : 256 + remapped++;
    table[String.fromCharCode(codePoint)] = b;
  }
  return table;
}

// The table never changes, so build it (and the decoder) once instead of on
// every call. `decode()` without the `stream` option keeps no state between
// calls, so sharing the decoder is safe.
const UNICODE_TO_BYTE_TABLE = unicodeToByte();
const UTF8_DECODER = new TextDecoder();

/**
 * Converts a string of table characters back to the text it represents: each
 * character is replaced by its byte value and the resulting byte sequence is
 * decoded as UTF-8, e.g. `"HelloĠä½łå¥½"` becomes `"Hello 你好"`.
 *
 * @param str String made only of characters present in the table.
 * @returns The UTF-8 decoding of the recovered bytes; an empty string for
 *          empty input.
 * @throws RangeError If `str` contains a character with no byte mapping
 *         (for example a literal space). Such characters previously turned
 *         into 0x00 bytes without any warning.
 */
export default function unicodeToBytes(str: string): string {
  // Every table key is one UTF-16 code unit, so code-unit indexing is exact.
  const bytes = new Uint8Array(str.length);
  for (let i = 0; i < str.length; i++) {
    const byte = UNICODE_TO_BYTE_TABLE[str.charAt(i)];
    if (byte === undefined) {
      const hex = str.charCodeAt(i).toString(16).toUpperCase().padStart(4, "0");
      throw new RangeError(
        `unicodeToBytes: character U+${hex} at index ${i} has no byte mapping`,
      );
    }
    bytes[i] = byte;
  }
  return UTF8_DECODER.decode(bytes);
}
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import { expect, test } from "bun:test";
import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";

// The input contains ASCII letters, a space and two multi-byte characters, so
// one assertion covers self-mapped bytes, a remapped byte (space -> "Ġ") and
// UTF-8 continuation bytes.
test("bytesToUnicode: test", () => {
  expect(bytesToUnicode("Hello 你好")).toEqual("HelloĠä½łå¥½");
});
|
||||||
|
@ -2,16 +2,5 @@ import { expect, test } from "bun:test";
|
|||||||
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";
|
import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";
|
||||||
|
|
||||||
// Feeds a byte-level string (one character per UTF-8 byte) through
// unicodeToBytes and checks that the original text comes back.
test("unicodeToBytes: test", () => {
  const byteLevelText = "HelloĠä½łå¥½";
  const decodedText = unicodeToBytes(byteLevelText);
  expect(decodedText).toEqual("Hello 你好");
});
|
||||||
|
Loading…
Reference in New Issue
Block a user