-
Notifications
You must be signed in to change notification settings - Fork 5
/
data_utils.py
89 lines (70 loc) · 3.34 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import json
from datasets import load_dataset, DatasetDict
from utils import make_chat_template_prompt, INSTRUCTION_PREFIX
def transform_conala(output_dir="datasets"):
    """Convert the CoNaLa dataset to chat-message format and save it under output_dir/conala."""
    ds = load_dataset("neulab/docprompting-conala", trust_remote_code=True)
    prefix = INSTRUCTION_PREFIX["conala"]

    def to_messages(example, split_name):
        # Hide the gold command on the test split so test prompts stay unanswered.
        answer = example["cmd"] if split_name != "test" else None
        return {"messages": make_chat_template_prompt(example["nl"], answer, prefix)}

    for split_name in ds:
        ds[split_name] = ds[split_name].map(lambda ex: to_messages(ex, split_name), num_proc=8)
    ds.save_to_disk(f"{output_dir}/conala")
def transform_mbpp(output_dir="datasets"):
    """Convert the MBPP dataset to chat-message format and save it under output_dir/mbpp."""
    ds = load_dataset("google-research-datasets/mbpp", trust_remote_code=True)
    prefix = INSTRUCTION_PREFIX["mbpp"]

    def to_messages(example, split_name):
        # The prompt embeds the assert statements the generated code must satisfy,
        # each on its own line after the instruction sentence.
        tests_block = "".join(f"\n{t}" for t in example["test_list"])
        prompt = f"{example['text']} Your code should pass these tests:{tests_block}"
        # Hide the gold solution on the test split so test prompts stay unanswered.
        answer = example["code"] if split_name != "test" else None
        return {"messages": make_chat_template_prompt(prompt, answer, prefix)}

    for split_name in ds:
        ds[split_name] = ds[split_name].map(lambda ex: to_messages(ex, split_name), num_proc=8)
    ds.save_to_disk(f"{output_dir}/mbpp")
def transform_apps(output_dir="datasets"):
    """Convert the APPS dataset to chat-message format and save it under output_dir/apps.

    Carves a 500-example validation split out of a shuffled (seed 42) train split.
    Preprocessing follows the format used in the original APPS paper:
    https://github.com/hendrycks/apps/blob/main/train/dataset_apps/APPSBaseDataset.py
    https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/example_script.py
    """
    dataset = load_dataset("codeparrot/apps", trust_remote_code=True)
    instruction_prefix = INSTRUCTION_PREFIX["apps"]
    def process_example(e, split):
        starter_code = None if len(e["starter_code"]) == 0 else e["starter_code"]
        # fn_name present => the problem is call-based; absent => stdin/stdout based.
        try:
            input_output = json.loads(e["input_output"])
            fn_name = None if not input_output.get("fn_name") else input_output["fn_name"]
        except ValueError:
            fn_name = None
        try:
            solutions = json.loads(e["solutions"])
        except ValueError:
            # Some test examples have no parseable solutions; fall back to one empty answer.
            solutions = [""]
        user_content = e["question"]
        if starter_code:
            user_content += starter_code
        # BUG FIX: these two branches were inverted. Per the APPS preprocessing
        # referenced above, a present fn_name means "Call-Based format" and an
        # absent fn_name means "Standard Input format".
        if fn_name:
            user_content += "\nUse Call-Based format\n"
        else:
            user_content += "\nUse Standard Input format\n"
        # Hide the gold solution on the test split so test prompts stay unanswered.
        assistant_content = None if split == "test" else solutions[0]
        messages = make_chat_template_prompt(user_content, assistant_content, instruction_prefix)
        return {"messages": messages}
    # Create a held-out validation set from the shuffled training data.
    train_set = dataset["train"].shuffle(42)
    validation_set = train_set.select(range(500))
    train_set = train_set.select(range(500, len(train_set)))
    dataset = DatasetDict({
        "train": train_set,
        "validation": validation_set,
        "test": dataset["test"]
    })
    for split in dataset.keys():
        dataset[split] = dataset[split].map(lambda e: process_example(e, split), num_proc=8)
    dataset.save_to_disk(f"{output_dir}/apps")
if __name__ == "__main__":
    # Build all three chat-formatted datasets when run as a script.
    for transform in (transform_conala, transform_mbpp, transform_apps):
        transform()