Skip to content

Commit

Permalink
Adding download code for ARCT, MCTest, MCTACO, MuTual, and QuAIL (#1258)
Browse files Browse the repository at this point in the history
* add download code for ARCT and MCTACO

* add mutual, mutual_plus, and quail download code

* update supported tasks data

* add download code for mctest160 and mctest500

* update jiant task name for mctest

* Update jiant/tasks/lib/quail.py

Co-authored-by: Clara Vania <cv50@log-1.nyu.cluster>
Co-authored-by: jeswan <57466294+jeswan@users.noreply.github.com>
  • Loading branch information
3 people committed Jan 7, 2021
1 parent 9a45712 commit e9d6c68
Show file tree
Hide file tree
Showing 9 changed files with 399 additions and 48 deletions.
12 changes: 6 additions & 6 deletions guides/tasks/supported_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@

| Name | `task_name` | `jiant` | Downloader | `jiant_task_name` | Misc |
|---|---|:---:|:---:|---|---|
| MCTACO | mctaco || | mctaco | |
| MCTest | mctest160 or mctest500 || | mctest | |
| [Argument Reasoning Comprehension](https://arxiv.org/abs/1708.01425) | arct || | arct | [Github](https://github.com/UKPLab/argument-reasoning-comprehension-task) |
| [Argument Reasoning Comprehension](https://arxiv.org/abs/1708.01425) | arct ||| arct | [Github](https://github.com/UKPLab/argument-reasoning-comprehension-task) |
| Abductive NLI | abductive_nli ||| abductive_nli | |
| SuperGLUE Winogender Diagnostic | superglue_axg ||| superglue_axg | SuperGLUE |
| Acceptability Definiteness | acceptability_definiteness || | acceptability_definiteness | Function Words |
Expand All @@ -29,20 +27,22 @@
| GLUE Diagnostic | glue_diagnostics ||| glue_diagnostics | GLUE |
| HellaSwag | hellaswag ||| hellaswag | |
| [MCScript2.0](https://arxiv.org/pdf/1905.09531.pdf) | mcscript || | mcscript | [data](https://my.hidrive.com/share/wdnind8pp5#$/) |
| MCTACO | mctaco ||| mctaco | |
| MCTest | mctest160 or mctest500 ||| mctest160 or mctest500 | [data](https://mattr1.github.io/mctest/data.html) |
| MLM | * || * | mlm_simple | See task-specific notes. |
| MLQA | `mlqa_{lang1}_{lang2}` ||| mlqa | XTREME, multi-lang |
| MNLI | mnli ||| mnli | GLUE, MNLI-matched |
| MNLI-mismatched | mnli_mismatched ||| mnli_mismatched | GLUE |
| MRPC | mrpc ||| mrpc | GLUE |
| MultiRC | multirc ||| multirc | SuperGLUE |
| Mutual (standard version) | mutual || | mutual | [site](https://github.com/Nealcly/MuTual) |
| Mutual ("challenge" version) | mutual_plus || | mutual_plus | [site](https://github.com/Nealcly/MuTual) |
| Mutual (standard version) | mutual || | mutual | [site](https://github.com/Nealcly/MuTual) |
| Mutual ("challenge" version) | mutual_plus || | mutual_plus | [site](https://github.com/Nealcly/MuTual) |
| Natural Questions | mrqa_natural_questions ||| mrqa_natural_questions | [MRQA](https://mrqa.github.io/) version of task |
| NewsQA | newsqa ||| newsqa | |
| PIQA | piqa ||| piqa | [PIQA](https://yonatanbisk.com/piqa/) |
| QAMR | qamr ||| qamr | |
| QA-SRL | qasrl ||| qasrl | |
| Quail | quail || | quail | [site](http://text-machine.cs.uml.edu/lab2/projects/quail/) |
| QuAIL | quail || | quail | [site](http://text-machine.cs.uml.edu/lab2/projects/quail/) |
| Quoref | quoref ||| quoref | |
| EP-NER | ner || | ner | Edge-Probing |
| PAWS-X | `pawsx_{lang}` ||| pawsx | XTREME, multi-lang |
Expand Down
7 changes: 7 additions & 0 deletions jiant/scripts/download_data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@

OTHER_DOWNLOAD_TASKS = {
"abductive_nli",
"arct",
"fever_nli",
"swag",
"qamr",
"qasrl",
"newsqa",
"mctaco",
"mctest160",
"mctest500",
"mrqa_natural_questions",
"mutual",
"mutual_plus",
"piqa",
"winogrande",
"ropes",
Expand All @@ -39,4 +45,5 @@
"race",
"race_middle",
"race_high",
"quail",
}
238 changes: 238 additions & 0 deletions jiant/scripts/download_data/dl_datasets/files_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ def download_task_data_and_write_config(task_name: str, task_data_path: str, tas
download_abductive_nli_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "arct":
download_arct_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "fever_nli":
download_fever_nli_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
Expand All @@ -49,10 +53,30 @@ def download_task_data_and_write_config(task_name: str, task_data_path: str, tas
download_newsqa_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mctaco":
download_mctaco_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mctest160":
download_mctest160_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mctest500":
download_mctest500_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mrqa_natural_questions":
download_mrqa_natural_questions_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mutual":
download_mutual_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "mutual_plus":
download_mutual_plus_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
)
elif task_name == "piqa":
download_piqa_data_and_write_config(
task_name=task_name, task_data_path=task_data_path, task_config_path=task_config_path
Expand Down Expand Up @@ -170,6 +194,220 @@ def download_abductive_nli_data_and_write_config(
)


def download_arct_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    """Download the ARCT TSV files and write the jiant task config.

    Fetches the train/dev/test splits (plus the "doubled" and "w-swap" train
    variants) from the UKPLab argument-reasoning-comprehension-task GitHub
    repository into ``task_data_path``, then writes a JSON config to
    ``task_config_path`` that maps each phase name to its local file.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)
    file_name_list = [
        "train-doubled.tsv",
        "train-w-swap-doubled.tsv",
        "train-w-swap.tsv",
        "train.tsv",
        "dev.tsv",
        "test.tsv",
    ]
    for file_name in file_name_list:
        download_utils.download_file(
            "https://raw.githubusercontent.com/UKPLab/argument-reasoning-comprehension-task/"
            + f"master/experiments/src/main/python/data/{file_name}",
            os.path.join(task_data_path, file_name),
        )
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                # The validation split is downloaded as "dev.tsv"; no "val.tsv"
                # is ever fetched, so the config must point at the dev file.
                "val": os.path.join(task_data_path, "dev.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "train_doubled": os.path.join(task_data_path, "train-doubled.tsv"),
                "train_w_swap": os.path.join(task_data_path, "train-w-swap.tsv"),
                "train_w_swap_doubled": os.path.join(task_data_path, "train-w-swap-doubled.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )


def download_mctaco_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download the MC-TACO dev/test TSV files and write the jiant task config.

    Only "val" and "test" phases are written to the config; no training file
    is downloaded by this function.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)

    # (config phase name, upstream file name), in download order.
    phase_files = (("val", "dev_3783.tsv"), ("test", "test_9442.tsv"))
    phase_paths = {}
    for phase, tsv_name in phase_files:
        local_path = os.path.join(task_data_path, tsv_name)
        download_utils.download_file(
            f"https://raw.githubusercontent.com/CogComp/MCTACO/master/dataset/{tsv_name}",
            local_path,
        )
        phase_paths[phase] = local_path

    config = {"task": task_name, "paths": phase_paths, "name": task_name}
    py_io.write_json(data=config, path=task_config_path)


def download_mctest160_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download MCTest, keep the mc160 files, and write the jiant task config.

    Both MCTest archives are unpacked into ``task_data_path``. The mc160
    ``.tsv``/``.ans`` files for train/dev/test are moved to the top level of
    ``task_data_path`` and the extracted archive directories are removed.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)
    for archive_url in (
        "https://mattr1.github.io/mctest/data/MCTest.zip",
        "https://mattr1.github.io/mctest/data/MCTestAnswers.zip",
    ):
        download_utils.download_and_unzip(archive_url, task_data_path)

    main_dir = os.path.join(task_data_path, "MCTest")
    answers_dir = os.path.join(task_data_path, "MCTestAnswers")

    # The test answers are taken from the MCTestAnswers archive; move them next
    # to the other files so every phase can be handled uniformly below.
    test_answers = "mc160.test.ans"
    os.rename(os.path.join(answers_dir, test_answers), os.path.join(main_dir, test_answers))
    shutil.rmtree(answers_dir)

    # (upstream phase name, config phase name); "dev" is exposed as "val".
    phase_names = (("train", "train"), ("dev", "val"), ("test", "test"))
    # (file extension, suffix appended to the config key).
    file_kinds = (("tsv", ""), ("ans", "_ans"))

    config_paths = {}
    for source_phase, config_phase in phase_names:
        for extension, key_suffix in file_kinds:
            base_name = f"mc160.{source_phase}.{extension}"
            destination = os.path.join(task_data_path, base_name)
            os.rename(os.path.join(main_dir, base_name), destination)
            config_paths[config_phase + key_suffix] = destination
    shutil.rmtree(main_dir)

    py_io.write_json(
        data={"task": task_name, "paths": config_paths, "name": task_name},
        path=task_config_path,
    )


def download_mctest500_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download MCTest, keep the mc500 files, and write the jiant task config.

    Both MCTest archives are unpacked into ``task_data_path``. The mc500
    ``.tsv``/``.ans`` files for train/dev/test are moved to the top level of
    ``task_data_path`` and the extracted archive directories are removed.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)
    for archive_url in (
        "https://mattr1.github.io/mctest/data/MCTest.zip",
        "https://mattr1.github.io/mctest/data/MCTestAnswers.zip",
    ):
        download_utils.download_and_unzip(archive_url, task_data_path)

    main_dir = os.path.join(task_data_path, "MCTest")
    answers_dir = os.path.join(task_data_path, "MCTestAnswers")

    # The test answers are taken from the MCTestAnswers archive; move them next
    # to the other files so every phase can be handled uniformly below.
    test_answers = "mc500.test.ans"
    os.rename(os.path.join(answers_dir, test_answers), os.path.join(main_dir, test_answers))
    shutil.rmtree(answers_dir)

    # (upstream phase name, config phase name); "dev" is exposed as "val".
    phase_names = (("train", "train"), ("dev", "val"), ("test", "test"))
    # (file extension, suffix appended to the config key).
    file_kinds = (("tsv", ""), ("ans", "_ans"))

    config_paths = {}
    for source_phase, config_phase in phase_names:
        for extension, key_suffix in file_kinds:
            base_name = f"mc500.{source_phase}.{extension}"
            destination = os.path.join(task_data_path, base_name)
            os.rename(os.path.join(main_dir, base_name), destination)
            config_paths[config_phase + key_suffix] = destination
    shutil.rmtree(main_dir)

    py_io.write_json(
        data={"task": task_name, "paths": config_paths, "name": task_name},
        path=task_config_path,
    )


def download_mutual_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download the MuTual (standard) data and write the jiant task config.

    Each example is fetched as its own ``{phase}_{i}.txt`` file from the MuTual
    GitHub repository into a temporary per-phase directory. The lines of all
    files of a phase are collected into ``{phase}.jsonl`` and the temporary
    directory is then deleted.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)
    # Files are numbered from 1 up to these counts.
    # NOTE(review): presumably the number of example files per split upstream — confirm.
    num_files = {"train": 7088, "dev": 886, "test": 886}
    for phase, file_count in num_files.items():
        # Staging directory for the raw per-example files of this phase; built
        # with os.path.join like every other path in this function.
        phase_dir = os.path.join(task_data_path, phase)
        os.makedirs(phase_dir, exist_ok=True)
        examples = []
        for file_index in range(1, file_count + 1):
            file_name = f"{phase}_{file_index}.txt"
            local_path = os.path.join(phase_dir, file_name)
            download_utils.download_file(
                "https://raw.githubusercontent.com/Nealcly/MuTual/"
                + f"master/data/mutual/{phase}/{file_name}",
                local_path,
            )
            examples.extend(py_io.read_file_lines(local_path))
        py_io.write_jsonl(examples, os.path.join(task_data_path, phase + ".jsonl"))
        # The raw files are no longer needed once merged into the .jsonl file.
        shutil.rmtree(phase_dir)

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )


def download_mutual_plus_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    """Download the MuTual-plus data and write the jiant task config.

    Each example is fetched as its own ``{phase}_{i}.txt`` file from the
    ``mutual_plus`` directory of the MuTual GitHub repository into a temporary
    per-phase directory. The lines of all files of a phase are collected into
    ``{phase}.jsonl`` and the temporary directory is then deleted.

    Args:
        task_name: Task name recorded under both "task" and "name" in the config.
        task_data_path: Directory to download into (created if missing).
        task_config_path: Destination path of the JSON task config.
    """
    os.makedirs(task_data_path, exist_ok=True)
    # Files are numbered from 1 up to these counts.
    # NOTE(review): presumably the number of example files per split upstream — confirm.
    num_files = {"train": 7088, "dev": 886, "test": 886}
    for phase, file_count in num_files.items():
        # Staging directory for the raw per-example files of this phase; built
        # with os.path.join like every other path in this function.
        phase_dir = os.path.join(task_data_path, phase)
        os.makedirs(phase_dir, exist_ok=True)
        examples = []
        for file_index in range(1, file_count + 1):
            file_name = f"{phase}_{file_index}.txt"
            local_path = os.path.join(phase_dir, file_name)
            download_utils.download_file(
                "https://raw.githubusercontent.com/Nealcly/MuTual/"
                + f"master/data/mutual_plus/{phase}/{file_name}",
                local_path,
            )
            examples.extend(py_io.read_file_lines(local_path))
        py_io.write_jsonl(examples, os.path.join(task_data_path, phase + ".jsonl"))
        # The raw files are no longer needed once merged into the .jsonl file.
        shutil.rmtree(phase_dir)

    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )


def download_fever_nli_data_and_write_config(
task_name: str, task_data_path: str, task_config_path: str
):
Expand Down
6 changes: 6 additions & 0 deletions jiant/scripts/download_data/dl_datasets/hf_datasets_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@
"phase_list": ["train", "val", "test"],
"jiant_task_name": "race",
},
"quail": {
"path": "quail",
"phase_list": ["train", "val", "test"],
"jiant_task_name": "quail",
"phase_map": {"validation": "val", "challenge": "test"},
},
}

# HF-Datasets uses "validation", we use "val"
Expand Down
2 changes: 1 addition & 1 deletion jiant/tasks/lib/quail.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def _create_examples(cls, lines, set_type):
guid="%s-%s" % (set_type, i),
prompt=line["context"] + " " + line["question"],
choice_list=[d for d in line["answers"]],
label=line["label"],
label=line["correct_answer_id"],
)
)
return examples
9 changes: 7 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@
extras = {}
extras["testing"] = ["pytest", "pytest-cov", "pre-commit"]
extras["docs"] = ["sphinx"]
extras["quality"] = ["black == 19.10b0", "flake8-docstrings == 1.5.0", "flake8 >= 3.7.9", "mypy == 0.770"]
extras["quality"] = [
"black == 19.10b0",
"flake8-docstrings == 1.5.0",
"flake8 >= 3.7.9",
"mypy == 0.770",
]
extras["dev"] = extras["testing"] + extras["quality"]

setup(
Expand Down Expand Up @@ -80,7 +85,7 @@
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
'License :: OSI Approved :: MIT License',
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
Expand Down
Loading

0 comments on commit e9d6c68

Please sign in to comment.