def draw_histogram(filename, bin_range, bin_width, dataset_name):
    """Plot a frequency histogram of the values stored in a text file.

    Args:
        filename: Path of the data file; it is loaded with ``np.loadtxt``.
        bin_range: Sequence of length 2 holding the minimum and maximum
            value covered by the histogram.
        bin_width: Width of a single bin.
        dataset_name: Label appended to the plot title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Load the data.
    data = np.loadtxt(filename)

    # Number of bins. Round instead of truncating: the float quotient can
    # land just below the intended integer (0.3 / 0.1 evaluates to
    # 2.9999999999999996), and int() alone would silently drop a bin.
    num_bins = max(1, int(round((bin_range[1] - bin_range[0]) / bin_width)))

    # Draw the histogram.
    plt.hist(data, bins=num_bins, range=bin_range)

    # Title and axis labels.
    plt.title('Quality Distribution of ' + dataset_name)
    plt.xlabel('Quality Score (cos-sim of embeddings for each pair)')
    plt.ylabel('Frequency')

    # Show the figure.
    plt.show()
# Example usage
filename = './ccmatrix/CCMatrix.en-zh.sample.sim' # Replace with your file name
bin_range = [0.2, 1] # Range of values covered by the histogram
bin_width = 0.025 # Width of each bin

draw_histogram(filename, bin_range, bin_width, "CCMatrix (ZH-EN)")
"""Score the similarity of randomly sampled line pairs from two parallel files.

Usage:
    python script.py <file_a> <file_b> <output_file> [num_samples]

Line ``i`` of ``file_a`` is paired with line ``i`` of ``file_b``. A random
sample of line numbers is drawn, both lines of every sampled pair are embedded
with the model named by ``MODEL_NAME``, and the cosine similarity of the two
embeddings is written to ``output_file``, one value per line, in ascending
line-number order.
"""
import random
import sys

from numpy.linalg import norm

MODEL_NAME = 'jinaai/jina-embeddings-v2-base-zh'
DEFAULT_NUM_SAMPLES = 100
USAGE = "Usage: python script.py <file_a> <file_b> <output_file> [num_samples]"


def cos_sim(a, b):
    """Return the cosine similarity of the vectors *a* and *b*."""
    return (a @ b.T) / (norm(a) * norm(b))


def parse_cli(argv):
    """Parse the raw argument vector (``argv[0]`` is the program name).

    Returns:
        ``(file_a_path, file_b_path, output_file_path, num_samples)``;
        ``num_samples`` falls back to ``DEFAULT_NUM_SAMPLES`` when omitted.

    Prints the usage line and exits with status 1 when the number of
    arguments is wrong.
    """
    if not 4 <= len(argv) <= 5:
        print(USAGE)
        sys.exit(1)
    num_samples = int(argv[4]) if len(argv) == 5 else DEFAULT_NUM_SAMPLES
    return argv[1], argv[2], argv[3], num_samples


def count_lines(file_path):
    """Count the lines of *file_path* by streaming it, not loading it whole."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def select_indices(total_lines, num_samples):
    """Return a sorted random sample of 0-based line numbers.

    The sample size is capped at ``total_lines`` so that asking for more
    samples than there are lines selects every line instead of making
    ``random.sample`` raise ``ValueError``.
    """
    sample_size = min(num_samples, total_lines)
    return sorted(random.sample(range(total_lines), sample_size))


def get_lines(file_path, line_numbers, progress=None):
    """Fetch the requested lines of a file in a single streaming pass.

    Args:
        file_path: Path of the UTF-8 text file to read.
        line_numbers: 0-based line numbers to fetch. They may be empty,
            unsorted or contain duplicates.
        progress: Optional callable wrapped around the line iterator
            (for example ``tqdm``) to display progress.

    Returns:
        The stripped text of each requested line, in the order given by
        ``line_numbers``. Numbers that are negative or beyond the end of
        the file are omitted.
    """
    wanted = sorted({n for n in line_numbers if n >= 0})
    if not wanted:
        return []

    found = {}
    pending = iter(wanted)
    target = next(pending)
    with open(file_path, 'r', encoding='utf-8') as f:
        numbered = enumerate(f)
        if progress is not None:
            numbered = progress(numbered)
        for current_line, line in numbered:
            if current_line < target:
                continue
            found[current_line] = line.strip()
            target = next(pending, None)
            if target is None:
                # Every requested line has been read; skip the rest.
                break
    return [found[n] for n in line_numbers if n in found]


def load_model(model_name=MODEL_NAME):
    """Load the embedding model through ``AutoModel.from_pretrained``.

    The heavy imports live here so that usage errors and input validation
    are reported before anything is imported or loaded.
    """
    # torch and AutoTokenizer are not referenced directly; they are retained
    # from the original import list.
    import torch  # noqa: F401
    from transformers import AutoModel, AutoTokenizer  # noqa: F401

    # NOTE(review): trust_remote_code=True runs Python code shipped with the
    # model repository; confirm the repository is trusted.
    return AutoModel.from_pretrained(model_name, trust_remote_code=True)


def main(argv=None):
    """Run the sampling and scoring pipeline for the given argument vector."""
    argv = sys.argv if argv is None else argv
    file_a_path, file_b_path, output_file_path, num_samples = parse_cli(argv)

    # Both files must have the same number of lines to be paired up.
    total_lines_a = count_lines(file_a_path)
    total_lines_b = count_lines(file_b_path)
    if total_lines_a != total_lines_b:
        print("Files must have the same number of lines.")
        sys.exit(1)

    from tqdm import tqdm

    selected_indices = select_indices(total_lines_a, num_samples)
    lines_a = get_lines(file_a_path, selected_indices, progress=tqdm)
    lines_b = get_lines(file_b_path, selected_indices, progress=tqdm)

    model = load_model()

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line_a, line_b in tqdm(zip(lines_a, lines_b), total=len(lines_a)):
            embeddings = model.encode([line_a, line_b])
            similarity = cos_sim(embeddings[0], embeddings[1])

            # One similarity value per sampled pair.
            output_file.write(f"{similarity}\n")

    print(f"Similarity calculation completed. Results saved to {output_file_path}")


if __name__ == "__main__":
    main()
"""Score every aligned line pair of two parallel text files.

Line ``i`` of the first file is paired with line ``i`` of the second. Both
lines are embedded with the model named by ``MODEL_NAME`` and the cosine
similarity of the two embeddings is written to the output file, one value per
line. ``--resume N`` skips the pairs before 1-based line ``N`` and appends to
the output file instead of overwriting it.
"""
import argparse
import sys
from itertools import zip_longest

from numpy.linalg import norm

MODEL_NAME = 'jinaai/jina-embeddings-v2-base-zh'


def cos_sim(a, b):
    """Return the cosine similarity of the vectors *a* and *b*."""
    return (a @ b.T) / (norm(a) * norm(b))


def parse_args(argv=None):
    """Parse the command line; ``argv=None`` reads ``sys.argv[1:]``."""
    parser = argparse.ArgumentParser(
        description="Compute the embedding cosine similarity of every "
                    "aligned line pair of two files."
    )

    parser.add_argument("file_a", type=str, help="File No.1")
    parser.add_argument("file_b", type=str, help="File No.2")
    parser.add_argument("output", type=str, help="Output file")
    parser.add_argument(
        "--resume",
        type=int,
        default=-1,
        help="Resume from specified line (1-based); results are appended",
    )
    return parser.parse_args(argv)


def iter_pairs(lines_a, lines_b, resume_from=-1, progress=None):
    """Yield ``(line_no, text_a, text_b)`` for aligned lines of two iterables.

    Args:
        lines_a: Iterable of lines of the first input.
        lines_b: Iterable of lines of the second input.
        resume_from: Pairs whose 1-based line number is below this value are
            skipped; values of 1 or less skip nothing.
        progress: Optional callable wrapped around the pair iterator (for
            example ``tqdm``) to display progress.

    Both texts are stripped of surrounding whitespace. If one input runs out
    before the other, a warning is printed to stderr and iteration stops;
    a plain ``zip`` would hide the mismatch.
    """
    pairs = zip_longest(lines_a, lines_b)
    if progress is not None:
        pairs = progress(pairs)
    for line_no, (line_a, line_b) in enumerate(pairs, start=1):
        if line_a is None or line_b is None:
            print(
                "Warning: input files have different numbers of lines; "
                f"stopped at line {line_no}.",
                file=sys.stderr,
            )
            return
        if line_no < resume_from:
            continue
        yield line_no, line_a.strip(), line_b.strip()


def load_model(model_name=MODEL_NAME):
    """Load the embedding model through ``AutoModel.from_pretrained``.

    The import lives here so that ``--help`` and argument errors are
    reported without importing transformers.
    """
    from transformers import AutoModel

    # NOTE(review): trust_remote_code=True runs Python code shipped with the
    # model repository; confirm the repository is trusted.
    return AutoModel.from_pretrained(model_name, trust_remote_code=True)


def main(argv=None):
    """Score all line pairs named on the command line."""
    args = parse_args(argv)

    file_a_path = args.file_a
    file_b_path = args.file_b
    output_file_path = args.output

    # Any non-negative --resume value switches the output to append mode.
    resume = args.resume >= 0
    output_file_mode = 'a' if resume else 'w'

    from tqdm import tqdm

    model = load_model()

    with open(file_a_path, 'r', encoding='utf-8') as file_a, \
            open(file_b_path, 'r', encoding='utf-8') as file_b, \
            open(output_file_path, output_file_mode, encoding='utf-8') as output_file:
        for _, line_a, line_b in iter_pairs(file_a, file_b, args.resume, progress=tqdm):
            embeddings = model.encode([line_a, line_b])
            similarity = cos_sim(embeddings[0], embeddings[1])

            # One similarity value per line pair.
            output_file.write(f"{similarity}\n")

    print(f"Similarity calculation completed. Results saved to {output_file_path}")


if __name__ == "__main__":
    main()
"""Score the similarity of randomly sampled translation pairs from a JSON-lines file.

Usage:
    python script.py <input_file> <output_file> [num_samples]

Every line of ``input_file`` is parsed as a JSON object whose ``"chinese"``
and ``"english"`` values form one pair. A random sample of lines is drawn,
both texts of each sampled pair are embedded with the model named by
``MODEL_NAME``, and the cosine similarity of the two embeddings is written to
``output_file``, one value per line, in ascending line-number order. The
sampled 0-based line numbers are additionally written to ``INDEX_LOG_PATH``.
"""
import json
import random
import sys

from numpy.linalg import norm

MODEL_NAME = 'jinaai/jina-embeddings-v2-base-zh'
DEFAULT_NUM_SAMPLES = 100
# File in the current working directory that receives the sampled line numbers.
INDEX_LOG_PATH = "1.txt"
USAGE = "Usage: python script.py <input_file> <output_file> [num_samples]"


def cos_sim(a, b):
    """Return the cosine similarity of the vectors *a* and *b*."""
    return (a @ b.T) / (norm(a) * norm(b))


def parse_cli(argv):
    """Parse the raw argument vector (``argv[0]`` is the program name).

    Returns:
        ``(file_path, output_file_path, num_samples)``; ``num_samples``
        falls back to ``DEFAULT_NUM_SAMPLES`` when omitted.

    Prints the usage line and exits with status 1 when the number of
    arguments is wrong. The script takes two required arguments and one
    optional one, so ``argv`` must hold 3 or 4 entries.
    """
    if not 3 <= len(argv) <= 4:
        print(USAGE)
        sys.exit(1)
    num_samples = int(argv[3]) if len(argv) == 4 else DEFAULT_NUM_SAMPLES
    return argv[1], argv[2], num_samples


def count_lines(file_path):
    """Count the lines of *file_path* by streaming it, not loading it whole."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def select_indices(total_lines, num_samples):
    """Return a sorted random sample of 0-based line numbers.

    The sample size is capped at ``total_lines`` so that asking for more
    samples than there are lines selects every line instead of making
    ``random.sample`` raise ``ValueError``.
    """
    sample_size = min(num_samples, total_lines)
    return sorted(random.sample(range(total_lines), sample_size))


def get_lines(file_path, line_numbers, progress=None):
    """Fetch the requested lines of a file in a single streaming pass.

    Args:
        file_path: Path of the UTF-8 text file to read.
        line_numbers: 0-based line numbers to fetch. They may be empty,
            unsorted or contain duplicates.
        progress: Optional callable wrapped around the line iterator
            (for example ``tqdm``) to display progress.

    Returns:
        The stripped text of each requested line, in the order given by
        ``line_numbers``. Numbers that are negative or beyond the end of
        the file are omitted.
    """
    wanted = sorted({n for n in line_numbers if n >= 0})
    if not wanted:
        return []

    found = {}
    pending = iter(wanted)
    target = next(pending)
    with open(file_path, 'r', encoding='utf-8') as f:
        numbered = enumerate(f)
        if progress is not None:
            numbered = progress(numbered)
        for current_line, line in numbered:
            if current_line < target:
                continue
            found[current_line] = line.strip()
            target = next(pending, None)
            if target is None:
                # Every requested line has been read; skip the rest.
                break
    return [found[n] for n in line_numbers if n in found]


def parse_pair(line):
    """Return ``(chinese, english)`` from one JSON-encoded line.

    Raises:
        json.JSONDecodeError: If *line* is not valid JSON.
        KeyError: If the ``"chinese"`` or ``"english"`` key is missing.
    """
    data = json.loads(line)
    return data["chinese"], data["english"]


def load_model(model_name=MODEL_NAME):
    """Load the embedding model through ``AutoModel.from_pretrained``.

    The import lives here so that usage errors are reported before
    transformers is imported or the model is loaded.
    """
    from transformers import AutoModel

    # NOTE(review): trust_remote_code=True runs Python code shipped with the
    # model repository; confirm the repository is trusted.
    return AutoModel.from_pretrained(model_name, trust_remote_code=True)


def main(argv=None):
    """Run the sampling and scoring pipeline for the given argument vector."""
    argv = sys.argv if argv is None else argv
    file_path, output_file_path, num_samples = parse_cli(argv)

    from tqdm import tqdm

    total_lines = count_lines(file_path)
    selected_indices = select_indices(total_lines, num_samples)
    lines = get_lines(file_path, selected_indices, progress=tqdm)

    model = load_model()

    with open(output_file_path, 'w', encoding='utf-8') as output_file, \
            open(INDEX_LOG_PATH, 'w', encoding='utf-8') as index_log:
        for idx, line in tqdm(zip(selected_indices, lines), total=len(lines)):
            chn, eng = parse_pair(line)
            # Record which line each similarity value belongs to.
            index_log.write(str(idx) + '\n')

            embeddings = model.encode([chn, eng])
            similarity = cos_sim(embeddings[0], embeddings[1])

            # One similarity value per sampled pair.
            output_file.write(f"{similarity}\n")

    print(f"Similarity calculation completed. Results saved to {output_file_path}")


if __name__ == "__main__":
    main()