-
Notifications
You must be signed in to change notification settings - Fork 2
/
prepare.sh
73 lines (56 loc) · 1.99 KB
/
prepare.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
# Prepare mBART and the WMT16 En-Ro training data (parallel corpus plus tagged
# back-translations), apply the mBART sentencepiece model, and binarize the
# result with fairseq-preprocess.
# Adapted from https://github.com/rsennrich/wmt16-scripts/blob/master/sample/download_files.sh
#
# Run from the directory that contains scripts/ (addtag.py, tospm.py) and
# devtest/ (dev.en, dev.ro, test.tag.en, test.ro).
# Needs on PATH: wget, tar, unzip, gzip, python3, fairseq-preprocess.

# Abort on the first failing command, on unset variables, and on pipeline
# failures, so a broken download cannot silently produce empty training files.
set -euo pipefail

ROOT=./
# Resolve ROOT to an absolute path: the script later changes into $ROOT/data,
# and every path derived from a relative ROOT would then point inside data/.
ROOT=$(cd -- "$ROOT" && pwd)
SCRIPTS="$ROOT/scripts"
DEVTEST="$ROOT/devtest"

cd -- "$ROOT"

# prepare mBART
wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz
tar -xf mbart.cc25.v2.tar.gz
PATHTOMBART="$ROOT/mbart.cc25.v2"
SPM_MODEL="$PATHTOMBART/sentence.bpe.model"

# prepare original WMT data
mkdir -p -- "$ROOT/data"
cd -- "$ROOT/data"

wget http://www.statmt.org/europarl/v7/ro-en.tgz
# The URL is quoted because '?' is a glob character in the shell.
wget 'http://opus.lingfil.uu.se/download.php?f=SETIMES2/en-ro.txt.zip' -O SETIMES2.en-ro.txt.zip
tar -xf ro-en.tgz
unzip SETIMES2.en-ro.txt.zip

cat europarl-v7.ro-en.en SETIMES.en-ro.en > corpus.en
cat europarl-v7.ro-en.ro SETIMES.en-ro.ro > corpus.ro

# prepare BT data
for lang in ro en; do
  wget "http://data.statmt.org/rsennrich/wmt16_backtranslations/en-ro/corpus.bt.en-ro.${lang}.gz"
  # -f: overwrite a previously decompressed file instead of failing on re-run.
  gzip -df "corpus.bt.en-ro.${lang}.gz"
done

# add tag to BT data (only the English side is passed through addtag.py)
python3 "$SCRIPTS/addtag.py" < corpus.bt.en-ro.en > tag.bt.en

# concatenate the parallel corpus with the back-translated data
cat corpus.en tag.bt.en > train.en
cat corpus.ro corpus.bt.en-ro.ro > train.ro

# preprocess raw data
SRC=en_XX
TGT=ro_RO

# apply sentencepiece
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < train.en > "train.${SRC}"
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < train.ro > "train.${TGT}"
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < "$DEVTEST/dev.en" > "dev.${SRC}"
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < "$DEVTEST/dev.ro" > "dev.${TGT}"
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < "$DEVTEST/test.tag.en" > "test.${SRC}"
python3 "$SCRIPTS/tospm.py" "$SPM_MODEL" < "$DEVTEST/test.ro" > "test.${TGT}"

# build data bin for fairseq; source and target share the mBART dictionary
SRCDICT="$PATHTOMBART/dict.txt"
TGTDICT="$PATHTOMBART/dict.txt"
fairseq-preprocess \
  --source-lang "${SRC}" \
  --target-lang "${TGT}" \
  --trainpref train \
  --validpref dev \
  --testpref test \
  --destdir "$ROOT/tagbt-data-bin" \
  --thresholdtgt 0 \
  --thresholdsrc 0 \
  --srcdict "${SRCDICT}" \
  --tgtdict "${TGTDICT}" \
  --workers 40