diff --git a/lib/nlp/tokenize/bytesToUnicode.ts b/lib/nlp/tokenize/bytesToUnicode.ts
new file mode 100644
index 0000000..97e95a0
--- /dev/null
+++ b/lib/nlp/tokenize/bytesToUnicode.ts
@@ -0,0 +1,25 @@
+// Port of GPT-2's bytes_to_unicode: maps every byte (0-255) to a printable
+// Unicode character so byte-level BPE can operate on visible characters.
+// Printable bytes map to themselves; the rest are shifted to 256 and above.
+export default function bytesToUnicode(): { [key: number]: string } {
+  const bs: number[] = [
+    ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1)
+    ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1)
+    ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1)
+  ];
+
+  const cs: number[] = [...bs];
+  let n = 0;
+
+  // Assign each remaining (non-printable) byte a code point above 255.
+  for (let b = 0; b < 256; b++) {
+    if (!bs.includes(b)) {
+      bs.push(b);
+      cs.push(256 + n);
+      n++;
+    }
+  }
+
+  const csChars: string[] = cs.map(c => String.fromCharCode(c));
+  return Object.fromEntries(bs.map((b, i) => [b, csChars[i]]));
+}
diff --git a/lib/nlp/tokenize/unicodeToBytes.ts b/lib/nlp/tokenize/unicodeToBytes.ts
new file mode 100644
index 0000000..125f048
--- /dev/null
+++ b/lib/nlp/tokenize/unicodeToBytes.ts
@@ -0,0 +1,14 @@
+import bytesToUnicode from "./bytesToUnicode";
+
+// Inverse of bytesToUnicode: maps each printable stand-in character back to
+// its original byte value, for decoding byte-level BPE output.
+export default function unicodeToBytes(): { [key: string]: number } {
+  const byteToUnicode = bytesToUnicode();
+
+  const reversedMapping: { [key: string]: number } = {};
+  for (const [byte, char] of Object.entries(byteToUnicode)) {
+    reversedMapping[char] = Number(byte);
+  }
+
+  return reversedMapping;
+}
diff --git a/test/bytesToUnicode.test.ts b/test/bytesToUnicode.test.ts
new file mode 100644
index 0000000..c83f57a
--- /dev/null
+++ b/test/bytesToUnicode.test.ts
@@ -0,0 +1,7 @@
+import { expect, test } from "bun:test";
+import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode";
+
+test("bytesToUnicode: printable bytes map to themselves", () => {
+  const byteToUnicodeMap = bytesToUnicode();
+  expect(byteToUnicodeMap[206]).toEqual("Î"); // 0xCE is printable, so it keeps its own code point
+});
diff --git a/test/unicodeToBytes.test.ts b/test/unicodeToBytes.test.ts
new file mode 100644
index 0000000..78ad52a
--- /dev/null
+++ b/test/unicodeToBytes.test.ts
@@ -0,0 +1,16 @@
+import { expect, test } from "bun:test";
+import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes";
+
+test("unicodeToBytes: decodes a byte-level string back to UTF-8", () => {
+  const unicodeToByteMap = unicodeToBytes();
+  // "å¤©æ°Ķ" is the byte-level rendering of the UTF-8 bytes of "天气".
+  const stringToConvert = "å¤©æ°Ķ";
+  const byteNumArray: number[] = [];
+  for (const chr of stringToConvert) {
+    byteNumArray.push(unicodeToByteMap[chr]);
+  }
+  const byteArray = new Uint8Array(byteNumArray);
+  const decoder = new TextDecoder("utf-8");
+  const utf8String = decoder.decode(byteArray);
+  expect(utf8String).toEqual("天气");
+});
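For reviewers, a minimal round-trip sketch of how the two maps compose. The relative import paths are assumptions for illustration, not part of the PR:

```ts
import bytesToUnicode from "./lib/nlp/tokenize/bytesToUnicode";
import unicodeToBytes from "./lib/nlp/tokenize/unicodeToBytes";

// Encode: raw UTF-8 bytes -> printable byte-level string (what byte-level BPE sees).
const byteToChar = bytesToUnicode();
const utf8Bytes = new TextEncoder().encode("天气");
const byteLevel = [...utf8Bytes].map(b => byteToChar[b]).join(""); // "å¤©æ°Ķ"

// Decode: printable byte-level string -> original text.
const charToByte = unicodeToBytes();
const roundTripped = new TextDecoder("utf-8").decode(
  new Uint8Array([...byteLevel].map(ch => charToByte[ch]))
);
console.log(byteLevel, roundTripped); // å¤©æ°Ķ 天气
```

This mirrors how GPT-2's tokenizer makes arbitrary bytes visible to its regex-based BPE: every byte gets a printable stand-in on the way in, and the stand-ins are mapped back to bytes before UTF-8 decoding on the way out.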