add: dataset quality check
commit 66cf093177 (parent 237d2f5c96)
126 translate/analytics/Distribution.ipynb (new file)
File diff suppressed because one or more lines are too long
79 translate/analytics/ccmatrix/check_sim.py (new file)
@ -0,0 +1,79 @@
from transformers import AutoModel
from numpy.linalg import norm
import sys
import random
from tqdm import tqdm

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Check that the correct number of command-line arguments is provided
# (validate before loading the model, so usage errors fail fast)
if len(sys.argv) < 4 or len(sys.argv) > 5:
    print("Usage: python script.py <file_a_path> <file_b_path> <output_file_path> [num_samples]")
    sys.exit(1)

# File paths from command-line arguments
file_a_path = sys.argv[1]
file_b_path = sys.argv[2]
output_file_path = sys.argv[3]

# Number of line pairs to randomly sample (default 100)
num_samples = int(sys.argv[4]) if len(sys.argv) == 5 else 100

# Load the embedding model (the Jina model tokenizes internally)
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Count the lines in a file without loading it fully into memory
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

total_lines_a = count_lines(file_a_path)
total_lines_b = count_lines(file_b_path)

# Ensure both files have the same number of lines
if total_lines_a != total_lines_b:
    print("Files must have the same number of lines.")
    sys.exit(1)

# Select random sample indices without loading the entire files
selected_indices = sorted(random.sample(range(total_lines_a), num_samples))

# Stream the file once and collect the lines at the (sorted) sampled indices
def get_lines(file_path, line_numbers):
    result = []
    j = 0
    next_i = line_numbers[j]
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            j += 1
            if j >= len(line_numbers):
                return result
            next_i = line_numbers[j]
    return result

lines_a = get_lines(file_a_path, selected_indices)
lines_b = get_lines(file_b_path, selected_indices)

# Score each sampled pair and write one similarity per line
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for i, idx in tqdm(enumerate(selected_indices), total=num_samples):
        # Get the corresponding lines from both files
        line_a = lines_a[i]
        line_b = lines_b[i]

        embeddings = model.encode([line_a, line_b])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

print(f"Similarity calculation completed. Results saved to {output_file_path}")
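For intuition, the index-tracking loop in get_lines does the same job as this minimal sketch, which keeps the sampled indices in a set and does one membership test per line (get_lines_simple is illustrative, not part of the commit; like the original, it assumes line_numbers is non-empty and fits in memory):

# Minimal equivalent of get_lines: one pass, set membership per line.
def get_lines_simple(file_path, line_numbers):
    wanted = set(line_numbers)
    last = max(wanted)
    result = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i in wanted:
                result.append(line.strip())
            if i >= last:  # stop early once the last sampled line is read
                break
    return result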
59 translate/analytics/filter.py (new file)
@ -0,0 +1,59 @@
from transformers import AutoModel
from numpy.linalg import norm
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser(
    description="Compute a per-line embedding similarity for two parallel files."
)
parser.add_argument("file_a", type=str, help="First parallel file")
parser.add_argument("file_b", type=str, help="Second parallel file")
parser.add_argument("output", type=str, help="Output file (one similarity per line)")
parser.add_argument(
    "--resume",
    type=int,
    default=-1,
    help="Resume from the specified (1-based) line",
)
args = parser.parse_args()

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Load the embedding model
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# File paths from command-line arguments
file_a_path = args.file_a
file_b_path = args.file_b
output_file_path = args.output

resume_from = args.resume
resume = resume_from >= 0
# Append when resuming so previously written scores are kept
output_file_mode = 'a' if resume else 'w'

# Open files
with open(file_a_path, 'r', encoding='utf-8') as file_a, \
        open(file_b_path, 'r', encoding='utf-8') as file_b, \
        open(output_file_path, output_file_mode, encoding='utf-8') as output_file:
    i = 1
    # Read file A and file B line by line
    for line_a, line_b in tqdm(zip(file_a, file_b)):
        # When resuming, skip the lines that were already processed
        if resume and i < resume_from:
            i += 1
            continue
        # Remove trailing newline characters
        line_a = line_a.strip()
        line_b = line_b.strip()

        embeddings = model.encode([line_a, line_b])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

        i += 1

print(f"Similarity calculation completed. Results saved to {output_file_path}")
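filter.py calls model.encode once per line pair, so the model forward pass dominates runtime. One possible speedup is batching several pairs per call. Below is a hedged sketch under the assumption that model.encode accepts any list of strings and returns one embedding per entry (the two-element calls above already rely on this); score_pairs is a hypothetical helper, not part of the commit:

from numpy.linalg import norm

cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

def score_pairs(model, pairs):
    """Encode a batch of (line_a, line_b) pairs in a single call and
    return one cosine similarity per pair. Assumes model.encode maps a
    list of strings to a same-length sequence of embedding vectors."""
    flat = [text for pair in pairs for text in pair]  # a1, b1, a2, b2, ...
    embeddings = model.encode(flat)
    return [cos_sim(embeddings[k], embeddings[k + 1])
            for k in range(0, len(embeddings), 2)]

Usage would be to accumulate, say, 32 (line_a, line_b) pairs inside the read loop, call score_pairs(model, batch) once, and write the returned scores in order.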
74 translate/analytics/translation2019/check_sim.py (new file)
@ -0,0 +1,74 @@
from transformers import AutoModel
from numpy.linalg import norm
import sys
import random
import json
from tqdm import tqdm

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Check that the correct number of command-line arguments is provided
# (this script takes two required arguments, so argv has 3 or 4 entries)
if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: python script.py <file_path> <output_file_path> [num_samples]")
    sys.exit(1)

# File paths from command-line arguments
file_path = sys.argv[1]
output_file_path = sys.argv[2]

# Number of records to randomly sample (default 100)
num_samples = int(sys.argv[3]) if len(sys.argv) == 4 else 100

# Load the embedding model
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Count the lines in a file without loading it fully into memory
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

total_lines = count_lines(file_path)

# Select random sample indices without loading the entire file
selected_indices = sorted(random.sample(range(total_lines), num_samples))

# Stream the file once and collect the lines at the (sorted) sampled indices
def get_lines(file_path, line_numbers):
    result = []
    j = 0
    next_i = line_numbers[j]
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            j += 1
            if j >= len(line_numbers):
                return result
            next_i = line_numbers[j]
    return result

lines = get_lines(file_path, selected_indices)

# Open the output file, plus a side log ("1.txt") of the sampled indices
with open(output_file_path, 'w', encoding='utf-8') as output_file, open("1.txt", 'w', encoding='utf-8') as lf:
    for i, idx in tqdm(enumerate(selected_indices), total=num_samples):
        # Parse the sampled JSON record and pull out the sentence pair
        line = lines[i]
        data = json.loads(line)
        chn = data["chinese"]
        eng = data["english"]
        lf.write(str(idx) + '\n')

        embeddings = model.encode([chn, eng])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

print(f"Similarity calculation completed. Results saved to {output_file_path}")
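The scores these scripts emit (one float per line) feed the analysis in translate/analytics/Distribution.ipynb, whose diff is suppressed above. As a standalone sketch of that kind of inspection, the snippet below summarizes a score file and prints percentiles that could guide a filtering threshold; the file name and percentile choices are illustrative assumptions, not values from the commit:

import numpy as np

# "similarities.txt" is a placeholder for whichever score file a script wrote
with open("similarities.txt", encoding="utf-8") as f:
    scores = np.array([float(line) for line in f if line.strip()])

print(f"n={scores.size}  mean={scores.mean():.3f}  median={np.median(scores):.3f}")
for p in (5, 25, 50, 75, 95):
    print(f"p{p:02d} = {np.percentile(scores, p):.3f}")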