-
Notifications
You must be signed in to change notification settings - Fork 0
/
modelling.py
122 lines (86 loc) · 3.26 KB
/
modelling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import os
import pandas as pd
import numpy as np
from datasets import load_metric
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
class TweetDataset(Dataset):
    """Torch dataset pairing pre-tokenized encodings with integer labels.

    `encodings` is the mapping returned by a Hugging Face tokenizer
    (e.g. {"input_ids": [...], "attention_mask": [...]}); `labels` is a
    sequence of class ids aligned with it.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per tokenizer field, plus the label under the key
        # the Trainer expects ("labels").
        sample = {name: torch.tensor(values[idx])
                  for name, values in self.encodings.items()}
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample
def load_dataset(tokenizer, data_path):
    """Read the tweet CSV, split 80/20, tokenize, and wrap as datasets.

    Parameters
    ----------
    tokenizer : a Hugging Face tokenizer callable.
    data_path : str, path to a CSV with 'tweet' and 'lable' columns.

    Returns
    -------
    (TweetDataset, TweetDataset) : train and validation datasets.
    """
    print('Loading data ...')
    frame = pd.read_csv(data_path)
    texts = frame["tweet"].tolist()
    # NOTE(review): the column really is spelled "lable" here — confirm this
    # matches the CSV schema rather than a typo for "label".
    labels = frame["lable"].tolist()
    # Random (unseeded) 80/20 split — a different split on every run.
    split = train_test_split(texts, labels, test_size=0.2, shuffle=True)
    train_texts, val_texts, train_labels, val_labels = split
    encoded_train = tokenizer(train_texts, truncation=True, padding=True)
    encoded_val = tokenizer(val_texts, truncation=True, padding=True)
    return (TweetDataset(encoded_train, train_labels),
            TweetDataset(encoded_val, val_labels))
def compute_metrics(eval_pred):
    """Compute accuracy for a transformers.Trainer evaluation step.

    Parameters
    ----------
    eval_pred : tuple
        (logits, labels) numpy arrays as supplied by Trainer; logits has
        shape (n_examples, n_classes), labels has shape (n_examples,).

    Returns
    -------
    dict
        {"accuracy": float} — same shape as the old
        datasets.load_metric("accuracy").compute(...) result.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # datasets.load_metric is deprecated (removed in datasets>=3.0) and was
    # being re-loaded on every evaluation; compute accuracy directly with
    # numpy while preserving the original return format.
    return {"accuracy": float(np.mean(predictions == labels))}
def get_tokenizer(model_name):
    """Download/load and return the pretrained tokenizer for *model_name*."""
    print('Loading tokenizer ...')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return tokenizer
def get_model(model_name):
    """Download/load and return the pretrained classification model *model_name*."""
    print('Loading model ...')
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return model
def train(tokenizer, model, train_dataset, val_dataset, model_path):
    """Fine-tune *model* with the HF Trainer, then save tokenizer and model.

    Evaluates and checkpoints every 2000 steps, keeps at most two
    checkpoints, and reloads the best checkpoint at the end before the
    final save to *model_path*.
    """
    print('Start training ...')
    # NOTE(review): '/model/results' is an absolute path — confirm this is the
    # intended checkpoint directory (vs. something like './model/results').
    args = TrainingArguments(
        output_dir='/model/results',
        logging_dir='./logs',
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=2000,
        save_strategy="steps",
        save_steps=2000,
        save_total_limit=2,
        load_best_model_at_end=True,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    print(trainer.train())
    tokenizer.save_pretrained(model_path)
    model.save_pretrained(model_path)
def run(model_name, data_path='data/train.csv', model_dir='models'):
    """Fine-tune a sentiment model and save it under *model_dir*.

    Parameters
    ----------
    model_name : str
        Either 'DistilBert' or 'mBERT' (alias for a HF checkpoint).
    data_path : str
        CSV of training tweets (default 'data/train.csv').
    model_dir : str
        Directory under which the fine-tuned model is saved.

    Raises
    ------
    TypeError
        If *model_name* is not one of the two supported aliases.

    Bug fix: the original called get_tokenizer/get_model with the alias
    ('DistilBert' / 'mBERT') *before* mapping it to a real Hugging Face
    checkpoint id, so the load used an invalid model name and the mapped id
    was never used. Resolve the checkpoint and save path first, then load.
    This also validates the alias before any expensive work.
    """
    if model_name == 'DistilBert':
        checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
        model_path = os.path.join(model_dir, 'DistilBert')
    elif model_name == 'mBERT':
        checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
        model_path = os.path.join(model_dir, 'mBERT')
    else:
        raise TypeError('Choose model_name either "DistilBert" or "mBERT"')
    tokenizer, model = get_tokenizer(checkpoint), get_model(checkpoint)
    train_dataset, val_dataset = load_dataset(tokenizer, data_path)
    train(tokenizer, model, train_dataset, val_dataset, model_path)
if __name__ == '__main__':
    # Entry point intentionally disabled; invoke run('DistilBert') or
    # run('mBERT') manually to start training.
    pass