add: dataset quality check
commit 66cf093177 (parent 237d2f5c96)
126 translate/analytics/Distribution.ipynb (new file)
File diff suppressed because one or more lines are too long
79 translate/analytics/ccmatrix/check_sim.py (new file)
@ -0,0 +1,79 @@
from transformers import AutoModel
from numpy.linalg import norm
import sys
import random
from tqdm import tqdm

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Check that the correct number of command-line arguments is provided
# (validate before loading the model, so usage errors fail fast)
if len(sys.argv) < 4 or len(sys.argv) > 5:
    print("Usage: python script.py <file_a_path> <file_b_path> <output_file_path> [num_samples]")
    sys.exit(1)

# File paths from command-line arguments
file_a_path = sys.argv[1]
file_b_path = sys.argv[2]
output_file_path = sys.argv[3]

# Number of line pairs to randomly sample (default 100)
num_samples = int(sys.argv[4]) if len(sys.argv) == 5 else 100

# Load the embedding model (the Jina model tokenizes internally)
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Count the lines in a file without loading it fully into memory
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

total_lines_a = count_lines(file_a_path)
total_lines_b = count_lines(file_b_path)

# Ensure both files have the same number of lines
if total_lines_a != total_lines_b:
    print("Files must have the same number of lines.")
    sys.exit(1)

# Select random sample indices without loading the entire files
selected_indices = sorted(random.sample(range(total_lines_a), num_samples))

# Stream the file once and collect the lines at the (sorted) sampled indices
def get_lines(file_path, line_numbers):
    result = []
    j = 0
    next_i = line_numbers[j]
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            j += 1
            if j >= len(line_numbers):
                return result
            next_i = line_numbers[j]
    return result

lines_a = get_lines(file_a_path, selected_indices)
lines_b = get_lines(file_b_path, selected_indices)

# Score each sampled pair and write one similarity per line
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for i, idx in tqdm(enumerate(selected_indices), total=num_samples):
        # Get the corresponding lines from both files
        line_a = lines_a[i]
        line_b = lines_b[i]

        embeddings = model.encode([line_a, line_b])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

print(f"Similarity calculation completed. Results saved to {output_file_path}")
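For intuition, the index-tracking loop in get_lines does the same job as this minimal sketch, which keeps the sampled indices in a set and does one membership test per line (get_lines_simple is illustrative, not part of the commit; like the original, it assumes line_numbers is non-empty and fits in memory):

# Minimal equivalent of get_lines: one pass, set membership per line.
def get_lines_simple(file_path, line_numbers):
    wanted = set(line_numbers)
    last = max(wanted)
    result = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i in wanted:
                result.append(line.strip())
            if i >= last:  # stop early once the last sampled line is read
                break
    return result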
59 translate/analytics/filter.py (new file)
@ -0,0 +1,59 @@
from transformers import AutoModel
from numpy.linalg import norm
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser(
    description="Compute a per-line embedding similarity for two parallel files."
)
parser.add_argument("file_a", type=str, help="First parallel file")
parser.add_argument("file_b", type=str, help="Second parallel file")
parser.add_argument("output", type=str, help="Output file (one similarity per line)")
parser.add_argument(
    "--resume",
    type=int,
    default=-1,
    help="Resume from the specified (1-based) line",
)
args = parser.parse_args()

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Load the embedding model
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# File paths from command-line arguments
file_a_path = args.file_a
file_b_path = args.file_b
output_file_path = args.output

resume_from = args.resume
resume = resume_from >= 0
# Append when resuming so previously written scores are kept
output_file_mode = 'a' if resume else 'w'

# Open files
with open(file_a_path, 'r', encoding='utf-8') as file_a, \
        open(file_b_path, 'r', encoding='utf-8') as file_b, \
        open(output_file_path, output_file_mode, encoding='utf-8') as output_file:
    i = 1
    # Read file A and file B line by line
    for line_a, line_b in tqdm(zip(file_a, file_b)):
        # When resuming, skip the lines that were already processed
        if resume and i < resume_from:
            i += 1
            continue
        # Remove trailing newline characters
        line_a = line_a.strip()
        line_b = line_b.strip()

        embeddings = model.encode([line_a, line_b])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

        i += 1

print(f"Similarity calculation completed. Results saved to {output_file_path}")
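filter.py calls model.encode once per line pair, so the model forward pass dominates runtime. One possible speedup is batching several pairs per call. Below is a hedged sketch under the assumption that model.encode accepts any list of strings and returns one embedding per entry (the two-element calls above already rely on this); score_pairs is a hypothetical helper, not part of the commit:

from numpy.linalg import norm

cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

def score_pairs(model, pairs):
    """Encode a batch of (line_a, line_b) pairs in a single call and
    return one cosine similarity per pair. Assumes model.encode maps a
    list of strings to a same-length sequence of embedding vectors."""
    flat = [text for pair in pairs for text in pair]  # a1, b1, a2, b2, ...
    embeddings = model.encode(flat)
    return [cos_sim(embeddings[k], embeddings[k + 1])
            for k in range(0, len(embeddings), 2)]

Usage would be to accumulate, say, 32 (line_a, line_b) pairs inside the read loop, call score_pairs(model, batch) once, and write the returned scores in order.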
74 translate/analytics/translation2019/check_sim.py (new file)
@ -0,0 +1,74 @@
from transformers import AutoModel
from numpy.linalg import norm
import sys
import random
import json
from tqdm import tqdm

# Define the cosine similarity function
cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

# Check that the correct number of command-line arguments is provided
# (this script takes two required arguments, so argv has 3 or 4 entries)
if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: python script.py <file_path> <output_file_path> [num_samples]")
    sys.exit(1)

# File paths from command-line arguments
file_path = sys.argv[1]
output_file_path = sys.argv[2]

# Number of records to randomly sample (default 100)
num_samples = int(sys.argv[3]) if len(sys.argv) == 4 else 100

# Load the embedding model
model_name = 'jinaai/jina-embeddings-v2-base-zh'
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Count the lines in a file without loading it fully into memory
def count_lines(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)

total_lines = count_lines(file_path)

# Select random sample indices without loading the entire file
selected_indices = sorted(random.sample(range(total_lines), num_samples))

# Stream the file once and collect the lines at the (sorted) sampled indices
def get_lines(file_path, line_numbers):
    result = []
    j = 0
    next_i = line_numbers[j]
    with open(file_path, 'r', encoding='utf-8') as f:
        for current_line, line in tqdm(enumerate(f)):
            if current_line < next_i:
                continue
            result.append(line.strip())
            j += 1
            if j >= len(line_numbers):
                return result
            next_i = line_numbers[j]
    return result

lines = get_lines(file_path, selected_indices)

# Open the output file, plus a side log ("1.txt") of the sampled indices
with open(output_file_path, 'w', encoding='utf-8') as output_file, open("1.txt", 'w', encoding='utf-8') as lf:
    for i, idx in tqdm(enumerate(selected_indices), total=num_samples):
        # Parse the sampled JSON record and pull out the sentence pair
        line = lines[i]
        data = json.loads(line)
        chn = data["chinese"]
        eng = data["english"]
        lf.write(str(idx) + '\n')

        embeddings = model.encode([chn, eng])
        similarity = cos_sim(embeddings[0], embeddings[1])

        # Write the similarity to the output file
        output_file.write(f"{similarity}\n")

print(f"Similarity calculation completed. Results saved to {output_file_path}")
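The scores these scripts emit (one float per line) feed the analysis in translate/analytics/Distribution.ipynb, whose diff is suppressed above. As a standalone sketch of that kind of inspection, the snippet below summarizes a score file and prints percentiles that could guide a filtering threshold; the file name and percentile choices are illustrative assumptions, not values from the commit:

import numpy as np

# "similarities.txt" is a placeholder for whichever score file a script wrote
with open("similarities.txt", encoding="utf-8") as f:
    scores = np.array([float(line) for line in f if line.strip()])

print(f"n={scores.size}  mean={scores.mean():.3f}  median={np.median(scores):.3f}")
for p in (5, 25, 50, 75, 95):
    print(f"p{p:02d} = {np.percentile(scores, p):.3f}")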