sparkastML/text-difficulty/grammar/data_deriving.py

82 lines
2.6 KiB
Python

import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
load_dotenv()
client = OpenAI(
api_key=os.getenv("API_KEY"),
base_url=os.getenv("BASE_URL"),
)
def get_AI_response(text, client, model_name, temp):
messages = [
{"role": "user", "content": text},
]
response = client.chat.completions.create(
model=model_name,
messages=messages,
temperature=temp,
)
return response.choices[0].message.content
def get_Examples(df, row, client, model_name, temp):
exp = df["Example"][row]
cds = df["Can-do statement"][row]
gdw = df["guideword"][row]
lvl = df["Level"][row]
cat = df["SuperCategory"][row] + '/' + df["SubCategory"][row]
prompt = \
f'''Generate 10 example sentences based on the following instructions.
Pay close attention to the 'Can-do Statement' and ensure all generated sentences adhere strictly to it.
Provide only the sentences without any additional formatting or markdown.
Output the sentences in plain text, one sentence per line, and do not contain empty line.
INSTRUCTION
Level: {lvl}
Guideword: {gdw}
Can-do Statement: {cds}
Category: {cat}
Example Sentences:
{exp}
'''
return get_AI_response(prompt, client, model_name, temp)
def process_chunk(df, chunk, client, model, temp):
results = []
for row in chunk:
exps = get_Examples(df, row, client, model, temp)
results.append(exps)
return results
input_file = './EGP.csv'
df = pd.read_csv(input_file)
newdf = df.copy()
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))
chunk_size = 64
total_rows = len(df.index)
num_chunks = (total_rows + chunk_size - 1) // chunk_size # Ceiling division
with tqdm(total=total_rows) as pbar:
for chunk_idx in range(num_chunks):
start = chunk_idx * chunk_size
end = min(start + chunk_size, total_rows)
chunk = range(start, end)
with ThreadPoolExecutor(max_workers=len(chunk)) as executor:
futures = {executor.submit(get_Examples, df, row, client, model, temp): row for row in chunk} # 将 row 与 future 绑定
for future in as_completed(futures):
row = futures[future] # 获取对应的行号
result = future.result() # 获取 AI 返回的结果
newdf.at[row, "Example"] = result # 更新到正确的行
pbar.update(len(chunk))
newdf.to_csv("output.csv", index=False)
newdf.to_csv("EGP_Derivied.csv", index=False)