import os
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
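
# The NLTK features used below (sentence/word tokenization, POS tagging, the
# stopwords corpus, and the WordNet lemmatizer) depend on downloadable NLTK
# data. A minimal setup sketch, assuming the standard resource names; these
# calls are quiet no-ops once the data is already installed.
for resource in ("punkt", "stopwords", "wordnet", "averaged_perceptron_tagger"):
    nltk.download(resource, quiet=True)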


class ReadWriteData():
    """
    Read in the data - read each text file that we are processing into a
    pandas dataframe.
    Write out data to a specified CSV file.
    """

    def __init__(self, loc):
        self.datapath = loc
        self.file_names = [file for file in os.listdir(loc)
                           if file.endswith('.story')]
        self.df = pd.DataFrame(columns=['file', 'text', 'summary'])

    def read_in_files(self, filesToRead):
        """
        Read in files from the directory defined by self.datapath
        @filesToRead = The number of files to read in from the directory
        """
        files_to_read = self.file_names[:filesToRead]
        print(len(files_to_read))
        for file in files_to_read:
            with open(self.datapath + "/" + file, encoding="utf8") as f:
                data = f.read()
            # split once on the first @highlight: 0 - body, 1 - summary
            data_split = re.split("@highlight", data, maxsplit=1)
            # appending to df (DataFrame.append was removed in pandas 2.0,
            # so build a one-row frame and concat instead)
            row = pd.DataFrame([{'file': file,
                                 'text': str(data_split[0]),
                                 'summary': str(data_split[1])}])
            self.df = pd.concat([self.df, row], ignore_index=True)
        print("read in files")
        print(self.df.head())

    def get_df(self):
        """
        Return the pandas dataframe in its current state
        """
        return self.df

    def df_to_csv(self, csv_name):
        """
        Convert a pandas dataframe to a csv with the name @csv_name
        """
        self.df.to_csv(csv_name, index=True)


class CleanData():
    """
    Use the data that is read in, as a pandas df, and clean it - remove any
    whitespace and empty columns.
    Shape the data as necessary.
    Removal of stop words - could this cause an issue when creating
    grammatically correct summaries?
    Lemmatization - reducing words down to their stem forms.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def sent_pos_clean(self, doc):
        """
        Cleaning data when sentPos=True. We pass in an array of tokenized
        sentences that need to be cleaned and have <eos> tokens appended to
        the end.
        """
        doc = [re.sub(r'\(CNN\)|(Daily\sMail)|--|[^\w\s\.]', '', x) for x in doc]
        doc = [re.sub(r'(\.(?=[\s\r\n]|$))', '', x) for x in doc]
        doc = [re.sub(r'\n', ' ', x) for x in doc]
        doc = [re.sub(r'\.', '', x) for x in doc]
        # add an <eos> token to each sentence so that we can split the
        # document into sentences easier in the sent_position method
        doc = [x + ' <eos>' for x in doc]
        doc = "".join(doc)
        return doc
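
    # For illustration, a hedged sketch of what sent_pos_clean produces on a
    # small hypothetical input (not part of the original file):
    #
    #     CleanData(df).sent_pos_clean(["The cat sat.", "It purred."])
    #     # -> 'The cat sat <eos>It purred <eos>'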

    def clean_data(self, textRank, wordFreq, sentPos):
        """
        Clean data by removing punctuation and words relating to the source
        of the article.
        @textRank = "True" if TextRank is being run. In this case the summary
        separator @highlight is not removed.
        @wordFreq = "True" if word frequency is being run. In this case the
        summary separator @highlight is not removed.
        @sentPos = "True" if sentence position is being run. In this case
        sent_pos_clean is called and <eos> tokens are added to the end of
        each sentence.
        Note: the flags arrive as strings, hence the comparisons to "True".
        """
        # dropping duplicates
        self.df.drop_duplicates(subset=['file'], inplace=True)
        self.df.dropna(axis=0, inplace=True)  # dropping na
        # clean texts
        if sentPos == "True":
            # add in <eos> tokens
            self.df['text'] = self.df['text'].apply(
                lambda x: nltk.sent_tokenize(x, language='english')).apply(
                lambda x: self.sent_pos_clean(x))
        else:
            self.df['text'] = (self.df['text']
                               .apply(lambda x: re.sub(r'\(CNN\)|(Daily\sMail)|--|[^\w\s\.]', '', x))
                               .apply(lambda x: re.sub(r'(\.(?=[\s\r\n]|$))', '', x))
                               .apply(lambda x: re.sub(r'\n', ' ', x))
                               .apply(lambda x: re.sub(r'\.', '', x)))
        # separate the summaries using a '.'
        if (textRank == "True") or (wordFreq == "True"):
            self.df['summary'] = self.df['summary'].apply(
                lambda x: re.sub(r'\n|[^\w\s\.\@]', '', x))
        else:
            self.df['summary'] = (self.df['summary']
                                  .apply(lambda x: re.sub(r'\n|[^\w\s\.\@]', '', x))
                                  .apply(lambda x: re.sub(r'@highlight', ' ', x)))
        print("cleaned data")
        print(self.df.head())

    def remove_stop_words(self):
        """
        Remove stop words from the text and summaries using nltk stopwords
        """
        stop_words = set(stopwords.words('english'))
        self.df['text'] = self.df['text'].apply(lambda x: nltk.word_tokenize(x)).apply(
            lambda x: " ".join([word for word in x if word.lower() not in stop_words]))
        self.df['summary'] = self.df['summary'].apply(lambda x: nltk.word_tokenize(x)).apply(
            lambda x: " ".join([word for word in x if word.lower() not in stop_words]))
        print(self.df.head())
        print("removed stop words")

    def get_pos(self, word):
        """
        Get the nltk part-of-speech tag for a token and translate it to a
        WordNet part-of-speech symbol.
        Note: WordNet POS does not recognise "I" - preposition, "M" - modal,
        "C" - conjunction, "P" - pronoun.
        """
        pos = nltk.pos_tag([word])[0][1][0]
        wordnet_conv = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
        if pos in wordnet_conv:
            return wordnet_conv.get(pos)
        return ""

    def lemmatization(self, pos):
        """
        Lemmatization of articles using the WordNet Lemmatizer. This reduces
        tokens down to their stem form.
        @pos = Value can be "True" or "False". Used to indicate whether or
        not to use POS tags whilst lemmatizing.
        """
        # initialise wordnet lemmatizer
        lemmatizer = WordNetLemmatizer()
        text_tokenized = self.df['text'].apply(lambda x: nltk.word_tokenize(x))
        if pos == "True":
            print("lemmatize with pos")
            # iterate over the index labels rather than a positional range,
            # since earlier drop_duplicates/dropna calls can leave gaps
            for i in text_tokenized.index:
                text_lemmatized = []
                # loop through each word, get its pos, and then lemmatize it
                for word in text_tokenized[i]:
                    word_pos = self.get_pos(word)
                    if word_pos != "":
                        lemma = lemmatizer.lemmatize(word, word_pos)
                        text_lemmatized.append(lemma)
                    else:
                        # if it has no usable pos tag, simply lemmatize it
                        # without one (the lemmatizer defaults to noun)
                        text_lemmatized.append(lemmatizer.lemmatize(word))
                # replace original text with the lemmatized form
                self.df.at[i, 'text'] = ' '.join(text_lemmatized)
        else:
            print("lemmatize w/o POS")
            self.df['text'] = text_tokenized.apply(
                lambda x: [lemmatizer.lemmatize(w) for w in x])
            self.df['text'] = self.df['text'].apply(lambda x: ' '.join(x))

    def drop_null_rows(self):
        """
        Check for rows with null values in them, and copy these into a new
        dataframe (df1). Drop any rows in df1 from df to ensure no NaN-valued
        rows are present.
        *Note: simply using dropna(how='any') does not seem to drop any of
        the rows.*
        """
        print(self.df.isnull().values.any())
        print(self.df.shape)
        df1 = self.df[self.df.isna().any(axis=1)]
        print(df1.shape)
        self.df.drop(df1.index, axis=0, inplace=True)
        print(self.df.shape)
        print(self.df.isnull().values.any())
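

# A minimal usage sketch (not part of the original file): the data directory,
# file count, and output name below are hypothetical placeholders, and the
# flags are passed as the strings the methods compare against. CleanData
# mutates the dataframe in place, so both objects see the cleaned data.
if __name__ == "__main__":
    reader = ReadWriteData("data/cnn_stories")  # hypothetical path
    reader.read_in_files(100)
    cleaner = CleanData(reader.get_df())
    cleaner.clean_data(textRank="False", wordFreq="False", sentPos="False")
    cleaner.remove_stop_words()
    cleaner.lemmatization(pos="False")
    cleaner.drop_null_rows()
    reader.df_to_csv("processed_stories.csv")  # hypothetical output name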