From ae6f10a6f09da9ee2bbafe8e6407349c8eac188e Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 28 Sep 2024 21:53:55 +0800 Subject: [PATCH] add: open set validation --- intention-classify/NLU_meta.json | 1 + intention-classify/extract.ipynb | 118 +++++++++++++++++- intention-classify/training/data_utils.py | 7 ++ intention-classify/training/model.py | 13 +- intention-classify/validation/inference.py | 4 +- .../validation/openset_validation.py | 80 ++++++++++++ 6 files changed, 215 insertions(+), 8 deletions(-) create mode 100644 intention-classify/NLU_meta.json create mode 100644 intention-classify/validation/openset_validation.py diff --git a/intention-classify/NLU_meta.json b/intention-classify/NLU_meta.json new file mode 100644 index 0000000..b2394b3 --- /dev/null +++ b/intention-classify/NLU_meta.json @@ -0,0 +1 @@ +{"idx_to_class": {"0": "weather", "1": "base64", "2": "url-encode", "3": "html-encode", "4": "ai.command", "5": "knowledge", "6": "ai.question", "7": "datetime"}, "threshold": 1.7} \ No newline at end of file diff --git a/intention-classify/extract.ipynb b/intention-classify/extract.ipynb index cdfb45c..1df0f53 100644 --- a/intention-classify/extract.ipynb +++ b/intention-classify/extract.ipynb @@ -40,7 +40,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "11caef0e1b674f6ab15880f3f25eca6a", + "model_id": "38137fc55ad24a9785ecbe1978bbc605", "version_major": 2, "version_minor": 0 }, @@ -69,6 +69,122 @@ "vocab = tokenizer.get_vocab()" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "21214ff4-018d-4230-81b9-331ebb42773b", + "metadata": {}, + "outputs": [], + "source": [ + "def bytes_to_unicode():\n", + " \"\"\"\n", + " Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control\n", + " characters the bpe code barfs on.\n", + "\n", + " The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab\n", + " if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for\n", + " decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. 
To avoid that, we want lookup\n", + " tables between utf-8 bytes and unicode strings.\n", + " \"\"\"\n", + " bs = (\n", + " list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n", + " )\n", + " cs = bs[:]\n", + " n = 0\n", + " for b in range(2**8):\n", + " if b not in bs:\n", + " bs.append(b)\n", + " cs.append(2**8 + n)\n", + " n += 1\n", + " cs = [chr(n) for n in cs]\n", + " return dict(zip(bs, cs))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cbc23d2d-985b-443a-83ee-c2286046ad5e", + "metadata": {}, + "outputs": [], + "source": [ + "btu=bytes_to_unicode()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4a99fa07-4922-4d8d-9c28-2275bf9cb8df", + "metadata": {}, + "outputs": [], + "source": [ + "utb = reversed_dict = {value: key for key, value in btu.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cb218ea7-50c7-4bb8-aa7f-0ee85da76147", + "metadata": {}, + "outputs": [], + "source": [ + "result = tokenizer.convert_ids_to_tokens([104307])[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "2dcb332a-cba9-4a14-9486-4e1ff6bd3dba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "å\n", + "229\n", + "¤\n", + "164\n", + "©\n", + "169\n", + "æ\n", + "230\n", + "°\n", + "176\n", + "Ķ\n", + "148\n" + ] + } + ], + "source": [ + "decoded=b\"\"\n", + "for chr in result:\n", + " print(chr)\n", + " if chr in utb:\n", + " print(utb[chr])\n", + " decoded+=bytes([utb[chr]])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "b1bf1289-2cab-4a97-ad21-b2d24de6d688", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'天气'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "decoded.decode(\"utf-8\", errors='replace')" + ] + }, { "cell_type": "code", "execution_count": 5, diff --git a/intention-classify/training/data_utils.py b/intention-classify/training/data_utils.py index 536845f..679a62e 100644 --- a/intention-classify/training/data_utils.py +++ b/intention-classify/training/data_utils.py @@ -31,6 +31,13 @@ def preprocess_data(data, embedding_map, tokenizer, class_to_idx, max_length=64) dataset.append((class_to_idx[label], embeddings)) return dataset +def get_sentences(data): + result = [] + for _, sentences in data.items(): + for sentence in sentences: + result.append(sentence) + return result + class TextDataset(Dataset): def __init__(self, data): diff --git a/intention-classify/training/model.py b/intention-classify/training/model.py index 21751ab..0dcb37a 100644 --- a/intention-classify/training/model.py +++ b/intention-classify/training/model.py @@ -32,18 +32,21 @@ class SelfAttention(nn.Module): class AttentionBasedModel(nn.Module): - def __init__(self, input_dim, num_classes, heads=8, dim_feedforward=512): + def __init__(self, input_dim, num_classes, heads=8, dim_feedforward=512, num_layers=3): super(AttentionBasedModel, self).__init__() - self.self_attention = SelfAttention(input_dim, heads) + self.self_attention_layers = nn.ModuleList([ + SelfAttention(input_dim, heads) for _ in range(num_layers) + ]) self.fc1 = nn.Linear(input_dim, dim_feedforward) self.fc2 = nn.Linear(dim_feedforward, num_classes) self.dropout = nn.Dropout(0.5) self.norm = nn.LayerNorm(input_dim) def forward(self, x): - attn_output = self.self_attention(x) - attn_output = self.norm(attn_output + x) - pooled_output = 
torch.mean(attn_output, dim=1) + for attn_layer in self.self_attention_layers: + attn_output = attn_layer(x) + x = self.norm(attn_output + x) + pooled_output = torch.mean(x, dim=1) x = F.relu(self.fc1(pooled_output)) x = self.dropout(x) x = self.fc2(x) diff --git a/intention-classify/validation/inference.py b/intention-classify/validation/inference.py index d3214fe..e3b3d79 100644 --- a/intention-classify/validation/inference.py +++ b/intention-classify/validation/inference.py @@ -66,8 +66,8 @@ embedding_map = torch.load("token_id_to_reduced_embedding.pt") tokenizer = AutoTokenizer.from_pretrained(model_name) # Example usage: -ENERGY_THRESHOLD = 0 -sentence = "天气" +ENERGY_THRESHOLD = 2 +sentence = "what on earth is the cross entropy loss" energy_threshold = ENERGY_THRESHOLD predicted = predict_with_energy( model, sentence, embedding_map, tokenizer, idx_to_class, energy_threshold diff --git a/intention-classify/validation/openset_validation.py b/intention-classify/validation/openset_validation.py new file mode 100644 index 0000000..a14423e --- /dev/null +++ b/intention-classify/validation/openset_validation.py @@ -0,0 +1,80 @@ +from training.model import AttentionBasedModel +from training.config import model_name +from training.config import DIMENSIONS +from training.data_utils import get_sentences +import json +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +from tqdm import tqdm +from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support + +def energy_score(logits): + # Energy score is minus logsumexp + return -torch.logsumexp(logits, dim=1) + + +def get_energy( + model, + sentence, + embedding_map, + tokenizer, + max_length=64, +): + model.eval() + tokens = tokenizer.tokenize(sentence) + token_ids = tokenizer.convert_tokens_to_ids(tokens) + embeddings = [embedding_map[token_id] for token_id in token_ids[:max_length]] + embeddings = torch.tensor(embeddings).unsqueeze(0) # Add batch dimension + current_shape = embeddings.shape + + if current_shape[1] < 2: + pad_size = 2 - current_shape[1] + embeddings = F.pad( + embeddings, (0, 0, 0, pad_size, 0, 0), mode="constant", value=0 + ) + + with torch.no_grad(): + logits = model(embeddings) + # Calculate energy score + energy = energy_score(logits) + + return energy + + +with open("data.json", "r") as f: + positive_data = json.load(f) +class_to_idx = {cls: idx for idx, cls in enumerate(positive_data.keys())} +idx_to_class = {idx: cls for cls, idx in class_to_idx.items()} +num_classes = len(class_to_idx) + +with open("noise.json", "r") as f: + negative_data = json.load(f) + +input_dim = DIMENSIONS +model = AttentionBasedModel(input_dim, num_classes) +model.load_state_dict(torch.load("./model.pt")) +embedding_map = torch.load("token_id_to_reduced_embedding.pt") +tokenizer = AutoTokenizer.from_pretrained(model_name) + + +all_preds = [] +all_labels = [] +ENERGY_THRESHOLD = 2 +for item in tqdm(get_sentences(positive_data)): + result = get_energy(model, item, embedding_map, tokenizer) < ENERGY_THRESHOLD + all_preds.append(result) + all_labels.append(1) + +for item in tqdm(negative_data): + result = get_energy(model, item, embedding_map, tokenizer) < ENERGY_THRESHOLD + all_preds.append(result) + all_labels.append(0) + +precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary') +accuracy = accuracy_score(all_labels, all_preds) + +print(f'Accuracy: {accuracy:.4f}') +print(f'Precision: {precision:.4f}') +print(f'Recall: {recall:.4f}') +print(f'F1 
Score: {f1:.4f}') \ No newline at end of file
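
Notes on the validation added above (not part of the applied diff; both sketches below are illustrative and assume only what the patch itself shows).

The extract.ipynb cells walk a byte-level BPE token back to readable text: bytes_to_unicode() builds the byte-to-character table, the reversed map (utb) turns each character of the token printed for id 104307 back into a raw byte, and decoding those bytes as UTF-8 yields "天气". A self-contained round trip using the same table and the character string shown in the notebook output:

    def bytes_to_unicode():
        # Same table as the notebook cell: printable latin-1 bytes map to themselves,
        # the remaining bytes are shifted up into code points 256 and above.
        bs = (
            list(range(ord("!"), ord("~") + 1))
            + list(range(ord("¡"), ord("¬") + 1))
            + list(range(ord("®"), ord("ÿ") + 1))
        )
        cs = bs[:]
        n = 0
        for b in range(2**8):
            if b not in bs:
                bs.append(b)
                cs.append(2**8 + n)
                n += 1
        return dict(zip(bs, [chr(c) for c in cs]))

    utb = {v: k for k, v in bytes_to_unicode().items()}  # unicode char -> original byte

    token = "å¤©æ°Ķ"  # how the token renders before byte decoding (from the notebook output)
    raw = bytes(utb[ch] for ch in token)
    print(raw.decode("utf-8", errors="replace"))  # -> 天气

openset_validation.py treats a sentence as in-domain only when the energy score, -logsumexp(logits), falls below ENERGY_THRESHOLD (2 here and in inference.py; NLU_meta.json stores 1.7). A minimal sketch of that decision rule with made-up logits over the 8 classes from NLU_meta.json, so it runs without model.pt or the embedding map:

    import torch

    def energy_score(logits: torch.Tensor) -> torch.Tensor:
        # Same convention as the patch: energy is minus logsumexp over the class logits.
        return -torch.logsumexp(logits, dim=1)

    ENERGY_THRESHOLD = 1.7  # value stored in NLU_meta.json; the validation scripts use 2

    # Invented logits: one row with a single dominant class, one flat low-confidence row.
    in_domain = torch.tensor([[9.0, 0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.4]])
    out_of_domain = torch.tensor([[-4.1, -3.9, -4.0, -4.2, -3.8, -4.0, -4.1, -3.9]])

    for name, logits in (("in_domain", in_domain), ("out_of_domain", out_of_domain)):
        energy = energy_score(logits).item()
        print(f"{name}: energy={energy:.3f} accepted={energy < ENERGY_THRESHOLD}")

The dominant-class row gets a strongly negative energy and is accepted; the flat row lands around 1.9 and is rejected. That accept/reject bit is what the script compares against label 1 for data.json sentences and label 0 for noise.json sentences when it reports accuracy, precision, recall and F1.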