diff --git a/lib/nlp/tokenize/bytesToUnicode.ts b/lib/nlp/tokenize/bytesToUnicode.ts index 97e95a0..eb74dbc 100644 --- a/lib/nlp/tokenize/bytesToUnicode.ts +++ b/lib/nlp/tokenize/bytesToUnicode.ts @@ -1,21 +1,37 @@ -export default function bytesToUnicode(): { [key: number]: string } { - const bs: number[] = [ - ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) - ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) - ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1) - ]; +function byteToUnicode(): { [key: number]: string } { + const bs: number[] = [ + ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) + ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) + ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1) + ]; - const cs: number[] = [...bs]; - let n = 0; + const cs: number[] = [...bs]; + let n = 0; - for (let b = 0; b < 256; b++) { - if (!bs.includes(b)) { - bs.push(b); - cs.push(256 + n); - n++; - } - } + for (let b = 0; b < 256; b++) { + if (!bs.includes(b)) { + bs.push(b); + cs.push(256 + n); + n++; + } + } - const csChars: string[] = cs.map(n => String.fromCharCode(n)); - return Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); -} \ No newline at end of file + const csChars: string[] = cs.map((n) => String.fromCharCode(n)); + return Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); +} + +export default function bytesToUnicode(str: string): string { + const byteToUnicodeMap = byteToUnicode(); + const encoder = new TextEncoder(); + + // Convert the input string to an array of bytes (numbers) + const byteArray = Array.from(encoder.encode(str)); + + // Map each byte to its corresponding Unicode character + const unicodeArray = byteArray.map((byte) => { + return byteToUnicodeMap[byte]; + }); + + // Join the array of Unicode characters into a single string + return unicodeArray.join(""); +} diff --git a/lib/nlp/tokenize/unicodeToBytes.ts b/lib/nlp/tokenize/unicodeToBytes.ts index 125f048..aad47b4 100644 --- a/lib/nlp/tokenize/unicodeToBytes.ts +++ b/lib/nlp/tokenize/unicodeToBytes.ts @@ -1,28 +1,46 @@ -export default function unicodeToBytes(): { [key: string]: number } { - const bs: number[] = [ - ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) - ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) - ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1) - ]; +function unicodeToByte(): { [key: string]: number } { + const bs: number[] = [ + ...Array.from({ length: 126 - 33 + 1 }, (_, i) => 33 + i), // range(ord("!"), ord("~") + 1) + ...Array.from({ length: 172 - 161 + 1 }, (_, i) => 161 + i), // range(ord("¡"), ord("¬") + 1) + ...Array.from({ length: 255 - 174 + 1 }, (_, i) => 174 + i) // range(ord("®"), ord("ÿ") + 1) + ]; - const cs: number[] = [...bs]; - let n = 0; + const cs: number[] = [...bs]; + let n = 0; - for (let b = 0; b < 256; b++) { - if (!bs.includes(b)) { - bs.push(b); - cs.push(256 + n); - n++; - } - } + for (let b = 0; b < 256; b++) { + if (!bs.includes(b)) { + bs.push(b); + cs.push(256 + n); + n++; + } + } - const csChars: string[] = cs.map(n => String.fromCharCode(n)); - const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); - - const reversedMapping: { [key: string]: number } = {}; - for (const [key, value] of Object.entries(originalMapping)) { - reversedMapping[value] = Number(key); - } + const csChars: string[] = cs.map((n) => String.fromCharCode(n)); + const originalMapping = Object.fromEntries(bs.map((b, i) => [b, csChars[i]])); - return reversedMapping; + const reversedMapping: { [key: string]: number } = {}; + for (const [key, value] of Object.entries(originalMapping)) { + reversedMapping[value] = Number(key); + } + + return reversedMapping; +} + +export default function unicodeToBytes(str: string): string { + const unicodeToByteMap = unicodeToByte(); + + const unicodeArray = str.split(""); + + const byteNumberArray = unicodeArray.map((char) => { + return unicodeToByteMap[char]; + }); + + const byteArray = Uint8Array.from(byteNumberArray); + + const decoder = new TextDecoder(); + + const decodedString = decoder.decode(byteArray); + + return decodedString; } diff --git a/test/bytesToUnicode.test.ts b/test/bytesToUnicode.test.ts index c83f57a..18bfa8f 100644 --- a/test/bytesToUnicode.test.ts +++ b/test/bytesToUnicode.test.ts @@ -1,7 +1,6 @@ import { expect, test } from "bun:test"; -import bytesToUnicode from "../lib/nlp/tokenize/bytesToUnicode"; +import bytesToUnicodes from "../lib/nlp/tokenize/bytesToUnicode"; test("bytesToUnicode: test", () => { - const byteToUnicodeMap = bytesToUnicode(); - expect(byteToUnicodeMap["206"]).toEqual("Î"); + expect(bytesToUnicodes("Hello 你好")).toEqual("HelloĠä½łå¥½"); }); diff --git a/test/unicodeToBytes.test.ts b/test/unicodeToBytes.test.ts index 78ad52a..ab5be6f 100644 --- a/test/unicodeToBytes.test.ts +++ b/test/unicodeToBytes.test.ts @@ -2,16 +2,5 @@ import { expect, test } from "bun:test"; import unicodeToBytes from "../lib/nlp/tokenize/unicodeToBytes"; test("unicodeToBytes: test", () => { - const byteToUnicodeMap = unicodeToBytes(); - const byteNumArray: number[] = []; - const stringToConvert = "天æ°Ķ"; - for (let i = 0; i < stringToConvert.length; i++) { - const chr = stringToConvert[i]; - const byteNumber = byteToUnicodeMap[chr]; - byteNumArray.push(byteNumber); - } - const byteArray = new Uint8Array(byteNumArray); - const decoder = new TextDecoder('utf-8'); - const utf8String = decoder.decode(byteArray); - expect(utf8String).toEqual("天气") + expect(unicodeToBytes("HelloĠä½łå¥½")).toEqual("Hello 你好"); });