35 lines
1.1 KiB
Python
35 lines
1.1 KiB
Python
import random
|
|
|
|
# File paths
input_file = 'data/filter/test.jsonl'
output_file = 'data/filter/test_filtered.jsonl'
removed_lines_file = 'data/filter/removed_lines.jsonl'

# Substring that marks a line as a candidate for removal
LABEL_MARKER = '"label": 0'

# Maximum number of matching lines to remove
LINES = 50


def split_lines(lines, marker=LABEL_MARKER, count=LINES, rng=random):
    """Split *lines* into (kept, removed) by randomly dropping marked lines.

    A line is a removal candidate when it contains *marker* as a plain
    substring (no JSON parsing is done, so e.g. ``"label": 0.5`` also
    matches the default marker).

    Args:
        lines: Sequence of text lines, normally as returned by ``readlines()``.
        marker: Substring identifying candidate lines.
        count: Maximum number of candidate lines to remove. If there are no
            more than *count* candidates, all of them are removed.
        rng: Object providing ``sample(population, k)``; defaults to the
            ``random`` module. Pass ``random.Random(seed)`` for repeatable runs.

    Returns:
        A ``(kept, removed)`` tuple of lists. Both preserve the original file
        order, and ``len(kept) + len(removed) == len(lines)``.
    """
    candidate_indices = [i for i, line in enumerate(lines) if marker in line]

    if len(candidate_indices) > count:
        chosen = set(rng.sample(candidate_indices, count))
    else:
        chosen = set(candidate_indices)

    # Select by position rather than by text: filtering with
    # `line not in removed` would also drop every duplicate of a selected
    # line, removing more lines than were sampled and reported.
    kept = []
    removed = []
    for i, line in enumerate(lines):
        (removed if i in chosen else kept).append(line)
    return kept, removed


def main():
    """Filter ``input_file`` and write the kept and removed lines to disk."""
    # JSON Lines files are UTF-8; do not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    filtered_lines, lines_to_remove = split_lines(lines, LABEL_MARKER, LINES)

    # Write the kept lines to the output file (the input file is not modified)
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(filtered_lines)

    # Save the removed lines to another file
    with open(removed_lines_file, 'w', encoding='utf-8') as file:
        file.writelines(lines_to_remove)

    print(f"Removed {len(lines_to_remove)} lines and saved them to {removed_lines_file}")


if __name__ == "__main__":
    main()
|