cvsa/filter/clean_dataset.py

35 lines
1.1 KiB
Python

import random
# File paths
input_file = 'data/filter/test.jsonl'
output_file = 'data/filter/test_filtered.jsonl'
removed_lines_file = 'data/filter/removed_lines.jsonl'
# Read all lines from the input file
with open(input_file, 'r') as file:
lines = file.readlines()
# Identify lines that match `"label": 0`
matching_lines = [line for line in lines if '"label": 0' in line]
LINES = 50
# Randomly select 200 lines to remove
if len(matching_lines) >= LINES:
lines_to_remove = random.sample(matching_lines, LINES)
else:
lines_to_remove = matching_lines # If fewer than 200 lines are available, remove all
# Remove the selected lines from the original list
filtered_lines = [line for line in lines if line not in lines_to_remove]
# Write the filtered lines back to the original file
with open(output_file, 'w') as file:
file.writelines(filtered_lines)
# Save the removed lines to another file
with open(removed_lines_file, 'w') as file:
file.writelines(lines_to_remove)
print(f"Removed {len(lines_to_remove)} lines and saved them to {removed_lines_file}")