add: dataset quality check

2024-09-20 00:53:51 +08:00 · 2024-09-20 00:53:51 +08:00 · 66cf093177
commit 66cf093177
parent 237d2f5c96
4 changed files with 338 additions and 0 deletions
--- a/translate/analytics/Distribution.ipynb
+++ b/translate/analytics/Distribution.ipynb
--- a/translate/analytics/ccmatrix/check_sim.py
+++ b/translate/analytics/ccmatrix/check_sim.py
@ -0,0 +1,79 @@
 import torch
 from transformers import AutoModel, AutoTokenizer
 from numpy.linalg import norm
 import sys
 import random
 from tqdm import tqdm
 def cos_sim(a, b):
    """Return the cosine similarity of *a* and *b*.

    Computes ``(a @ b.T) / (norm(a) * norm(b))``. The call sites in this
    script pass single rows of the ``model.encode`` output, so the inputs
    are presumably 1-D embedding vectors (assumption - confirm if reused).
    No guard is applied for zero-norm inputs.
    """
    return (a @ b.T) / (norm(a) * norm(b))
 # Validate the command line before anything expensive happens, so a wrong
 # invocation prints the usage immediately instead of after the model load.
 if len(sys.argv) < 4 or len(sys.argv) > 5:
    print("Usage: python script.py <file_a_path> <file_b_path> <output_file_path> [num_samples]")
    sys.exit(1)
 # Positional arguments: the two parallel input files and the output file
 file_a_path = sys.argv[1]
 file_b_path = sys.argv[2]
 output_file_path = sys.argv[3]
 # Optional fourth argument: number of line pairs to sample (defaults to 100)
 num_samples = int(sys.argv[4]) if len(sys.argv) == 5 else 100
 # Load the embedding model (only a model is loaded here, no separate tokenizer).
 # NOTE(review): trust_remote_code=True is passed through to from_pretrained;
 # confirm that running the model repository's custom code is acceptable.
 model_name = 'jinaai/jina-embeddings-v2-base-zh'
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 # Get the total number of lines in the files without loading them fully
 def count_lines(file_path):
    """Return the number of lines in the UTF-8 text file at *file_path*.

    The file is iterated line by line, so it is never read into memory
    as a whole.
    """
    total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _ in handle:
            total += 1
    return total
 total_lines_a = count_lines(file_a_path)
 total_lines_b = count_lines(file_b_path)
 # The files are paired line by line, so they must have the same length
 if total_lines_a != total_lines_b:
    print("Files must have the same number of lines.")
    sys.exit(1)
 # random.sample raises ValueError for a negative or too-large sample size;
 # reject an unusable num_samples here with a readable message instead.
 if num_samples < 1 or num_samples > total_lines_a:
    print(f"num_samples must be between 1 and {total_lines_a}, got {num_samples}.")
    sys.exit(1)
 # Draw distinct zero-based line indices and sort them so each file can be
 # read in a single forward pass.
 selected_indices = sorted(random.sample(range(total_lines_a), num_samples))
 # Function to get all sampled lines from the file
 def get_lines(file_path, line_numbers):
    """Return the stripped lines of *file_path* at the requested indices.

    Args:
        file_path: Path of a UTF-8 text file.
        line_numbers: Zero-based line indices, sorted ascending and without
            duplicates. The file is read once front to back, so any other
            ordering does not yield the requested lines.

    Returns:
        A list of stripped lines in the order of *line_numbers*. Indices
        beyond the end of the file contribute nothing. An empty
        *line_numbers* yields an empty list.
    """
    # Nothing requested: return early instead of failing on an empty sequence.
    if len(line_numbers) == 0:
        return []
    result = []
    pending = iter(line_numbers)
    next_i = next(pending)
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            next_i = next(pending, None)
            if next_i is None:
                # Last requested line collected; skip the rest of the file.
                break
    return result
 # Fetch the sampled lines from each file; both lists follow selected_indices
 lines_a = get_lines(file_a_path, selected_indices)
 lines_b = get_lines(file_b_path, selected_indices)
 # Score every sampled pair and write one similarity value per output line
 with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for position, _ in tqdm(enumerate(selected_indices)):
        # Embed both sentences in one call, then compare the two vectors
        pair = [lines_a[position], lines_b[position]]
        vectors = model.encode(pair)
        score = cos_sim(vectors[0], vectors[1])
        output_file.write(f"{score}\n")
 print(f"Similarity calculation completed. Results saved to {output_file_path}")
--- a/translate/analytics/filter.py
+++ b/translate/analytics/filter.py
@ -0,0 +1,59 @@
 from transformers import AutoModel
 from numpy.linalg import norm
 import argparse
 from tqdm import tqdm
 # Command-line interface: two input files, one output file and an optional
 # --resume line number (default -1).
 parser = argparse.ArgumentParser(
    description="Usage: python filter.py <file_a_path> <file_b_path> <output_file_path>"
 )
 for positional_name, positional_help in (
    ("file_a", "File No.1"),
    ("file_b", "File No.2"),
    ("output", "Output file"),
 ):
    parser.add_argument(positional_name, type=str, help=positional_help)
 parser.add_argument("--resume", type=int, default=-1, help="Resume from specified line")
 args = parser.parse_args()
 def cos_sim(a, b):
    """Return the cosine similarity of *a* and *b*.

    Computes ``(a @ b.T) / (norm(a) * norm(b))``. The call site in this
    script passes single rows of the ``model.encode`` output, so the inputs
    are presumably 1-D embedding vectors (assumption - confirm if reused).
    No guard is applied for zero-norm inputs.
    """
    return (a @ b.T) / (norm(a) * norm(b))
 # Pull the parsed command-line values into module-level names
 file_a_path, file_b_path = args.file_a, args.file_b
 output_file_path = args.output
 resume_from = args.resume
 # A non-negative --resume value switches on resume mode; the output file is
 # then opened for appending instead of being overwritten.
 resume = resume_from >= 0
 if resume:
    output_file_mode = 'a'
 else:
    output_file_mode = 'w'
 # Load the embedding model (only a model is loaded here, no separate tokenizer)
 model_name = 'jinaai/jina-embeddings-v2-base-zh'
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 # Walk both inputs in lockstep and write one similarity score per line pair.
 # zip stops at the end of the shorter file.
 with open(file_a_path, 'r', encoding='utf-8') as file_a, \
     open(file_b_path, 'r', encoding='utf-8') as file_b, \
     open(output_file_path, output_file_mode, encoding='utf-8') as output_file:
    # line_no is the 1-based number of the current line pair
    for line_no, (raw_a, raw_b) in enumerate(tqdm(zip(file_a, file_b)), start=1):
        # In resume mode, skip every pair before the requested line
        if resume and line_no < resume_from:
            continue
        # Strip surrounding whitespace (including the trailing newline)
        sentences = [raw_a.strip(), raw_b.strip()]
        vectors = model.encode(sentences)
        score = cos_sim(vectors[0], vectors[1])
        output_file.write(f"{score}\n")
 print(f"Similarity calculation completed. Results saved to {output_file_path}")
--- a/translate/analytics/translation2019/check_sim.py
+++ b/translate/analytics/translation2019/check_sim.py
@ -0,0 +1,74 @@
 from transformers import AutoModel
 from numpy.linalg import norm
 import sys
 import random
 import json
 from tqdm import tqdm
 def cos_sim(a, b):
    """Return the cosine similarity of *a* and *b*.

    Computes ``(a @ b.T) / (norm(a) * norm(b))``. The call site in this
    script passes single rows of the ``model.encode`` output, so the inputs
    are presumably 1-D embedding vectors (assumption - confirm if reused).
    No guard is applied for zero-norm inputs.
    """
    return (a @ b.T) / (norm(a) * norm(b))
 # Validate the command line before anything expensive happens, so a wrong
 # invocation prints the usage immediately instead of after the model load.
 # sys.argv holds the script name, two required arguments and one optional
 # argument, i.e. 3 or 4 entries.
 if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: python script.py <file_path> <output_file_path> [num_samples]")
    sys.exit(1)
 # Positional arguments: the JSON-lines input file and the output file
 file_path = sys.argv[1]
 output_file_path = sys.argv[2]
 # Optional third argument: number of records to sample (defaults to 100)
 num_samples = int(sys.argv[3]) if len(sys.argv) == 4 else 100
 # Load the embedding model (only a model is loaded here, no separate tokenizer).
 # NOTE(review): trust_remote_code=True is passed through to from_pretrained;
 # confirm that running the model repository's custom code is acceptable.
 model_name = 'jinaai/jina-embeddings-v2-base-zh'
 model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
 # Get the total number of lines in the files without loading them fully
 def count_lines(file_path):
    """Return the number of lines in the UTF-8 text file at *file_path*.

    The file is iterated line by line, so it is never read into memory
    as a whole.
    """
    total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _ in handle:
            total += 1
    return total
 total_lines = count_lines(file_path)
 # random.sample raises ValueError for a negative or too-large sample size;
 # reject an unusable num_samples here with a readable message instead.
 if num_samples < 1 or num_samples > total_lines:
    print(f"num_samples must be between 1 and {total_lines}, got {num_samples}.")
    sys.exit(1)
 # Draw distinct zero-based line indices and sort them so the file can be
 # read in a single forward pass.
 selected_indices = sorted(random.sample(range(total_lines), num_samples))
 # Function to get all sampled lines from the file
 def get_lines(file_path, line_numbers):
    """Return the stripped lines of *file_path* at the requested indices.

    Args:
        file_path: Path of a UTF-8 text file.
        line_numbers: Zero-based line indices, sorted ascending and without
            duplicates. The file is read once front to back, so any other
            ordering does not yield the requested lines.

    Returns:
        A list of stripped lines in the order of *line_numbers*. Indices
        beyond the end of the file contribute nothing. An empty
        *line_numbers* yields an empty list.
    """
    # Nothing requested: return early instead of failing on an empty sequence.
    if len(line_numbers) == 0:
        return []
    result = []
    pending = iter(line_numbers)
    next_i = next(pending)
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            next_i = next(pending, None)
            if next_i is None:
                # Last requested line collected; skip the rest of the file.
                break
    return result
 # Fetch the sampled JSON records; the list follows the order of selected_indices
 lines = get_lines(file_path, selected_indices)
 # Score each sampled record and write one similarity value per output line.
 # NOTE: the sampled line indices are also logged to the hard-coded path
 # "1.txt" in the current working directory.
 with open(output_file_path, 'w', encoding='utf-8') as output_file, open("1.txt", 'w', encoding='utf-8') as lf:
    for position, line_index in tqdm(enumerate(selected_indices)):
        # Each sampled line is a JSON object with "chinese" and "english" keys
        record = json.loads(lines[position])
        chn = record["chinese"]
        eng = record["english"]
        lf.write(f"{line_index}\n")
        # Embed both sentences in one call, then compare the two vectors
        vectors = model.encode([chn, eng])
        score = cos_sim(vectors[0], vectors[1])
        output_file.write(f"{score}\n")
 print(f"Similarity calculation completed. Results saved to {output_file_path}")