forked from shaheen-syed/Twitter-Sentiment-Analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
8_plot_results.py
422 lines (303 loc) · 13.3 KB
/
8_plot_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
# -*- coding: utf-8 -*-
"""
Created by: Shaheen Syed
Date: July 2018
Step 8 - Plot Results
----------------------
Create various plots
- Donutplot showing sentiments per mode of research (int./trans./mult.)
- Sentiment over time shown as a stacked bar chart per week
- Sentiment by occupation, showing the 25 most frequent occupations ordered by their share of positive tweets
- Bar chart showing the frequency of @, URLs, and emojis for each sentiment and each mode of research
There are 4 switches that can be turned on or off (by setting their value to True or False). Each switch will create the corresponding plot.
* create_donot_plot = [True|False]
* create_time_stacked_bar_plot = [True|False]
* create_sentiment_by_occupation = [True|False]
* create_twitter_tokens_bar_plot = [True|False]
How to run:
python 8_plot_results.py
"""
# packages and modules
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FormatStrFormatter
from helper_functions import *
from database import MongoDatabase
from sklearn.externals import joblib
# Turn on/off what needs to be executed
# Each switch enables one plotting section further down; set it to False to skip that plot.
# donut plots of sentiment per mode of research -> files/plots/donutplot.pdf
create_donot_plot = True
# weekly stacked bars of sentiment over time -> files/plots/sentiment-over-time.pdf
create_time_stacked_bar_plot = True
# positive/negative share per occupation -> files/plots/sentiment-by-occupation.pdf
create_sentiment_by_occupation = True
# average number of @, URL and emoji per tweet -> files/plots/frequency-emoji-url-at.pdf
create_twitter_tokens_bar_plot = True
"""
Script starts here
"""
if __name__ == "__main__":
    # Shared set-up used by every plotting section of this script.

    # route log records to the console and announce the script
    set_logger()
    logging.info('Start: {} '.format(__file__))

    # sentiment names and their plot colours, both ordered positive, neutral, negative
    label_types = ['positive', 'neutral', 'negative']
    colors = ['#52bf80', '#088bdc', '#fe6300']

    # inferred labels; this script reads column 0 as the tweet id, column 1 as
    # the sentiment label and column 2 as the tweet type code
    labels = joblib.load(os.path.join('files', 'labels', 'labels.pkl'))

    # lookup table: tweet id -> sentiment label
    tweet_id_to_label = {row[0]: row[1] for row in labels}

    # folder that receives the PDF plots (created when it does not exist yet)
    plot_location = os.path.join('files', 'plots')
    create_directory(plot_location)

    # handle used to read the tweet collections
    db = MongoDatabase()
if __name__ == "__main__" and create_donot_plot:
    # Donut plots: one per mode of research (interdisciplinary, transdisciplinary,
    # multidisciplinary), each showing the percentage of positive, neutral and
    # negative tweets. The figure is saved to files/plots/donutplot.pdf.
    # The __main__ test keeps this section from running when the file is imported.

    # three donuts next to each other
    fig, axs = plt.subplots(1, 3, figsize=(21, 6))

    # legend prefix paired with the label code it counts, in the same order as `colors`
    legend_parts = [('positive n = ', 2), ('neutral n = ', 1), ('negative n = ', 0)]

    for tweet_type, ax in enumerate(axs.ravel()):

        # rows that belong to this tweet type only
        type_rows = labels[labels[:, 2] == tweet_type]

        # title such as '<tweet type> n = <number of tweets>'
        ax.set_title(get_tweet_type_from_code(tweet_type) + ' n = ' + str(len(type_rows)))

        # number of tweets per sentiment
        sizes = [len(type_rows[type_rows[:, 1] == code]) for _, code in legend_parts]

        # legend entries, for example 'positive n = 123'
        names = [prefix + str(size) for (prefix, _), size in zip(legend_parts, sizes)]

        # the pie itself; thick white wedge borders separate the segments
        ax.pie(
            sizes,
            autopct='%.0f%%',
            colors=colors,
            startangle=90,
            counterclock=False,
            wedgeprops={'linewidth': 5, 'edgecolor': 'white'}
        )

        # legend sits in the middle of the donut
        ax.legend(names, loc="center", frameon=False)

        # equal aspect keeps the pie round
        ax.axis('equal')

        # a white disc on top of the pie turns it into a donut
        ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))

    # no spacing between the subplots
    plt.subplots_adjust(wspace=0.0, hspace=0.0)

    # trim surrounding white space
    plt.tight_layout()

    # write the figure to disk
    fig.savefig(os.path.join(plot_location, 'donutplot.pdf'))

    # close the plot so the next section starts with a clean state
    plt.close()
if __name__ == "__main__" and create_time_stacked_bar_plot:
    # Stacked bar charts of sentiment over time: one panel per tweet type, one
    # bar per week, each bar split into positive, neutral and negative tweets.
    # The figure is saved to files/plots/sentiment-over-time.pdf.
    # The __main__ test keeps this section from running when the file is imported.

    # map every target tweet to its week, written as 'YYYY-WW'.
    # Both parts come from the ISO calendar: combining the calendar year with
    # the ISO week number would file e.g. 2018-12-31 (ISO week 1 of 2019)
    # under '2018-01'.
    week_of_tweet = {}
    for doc in db.read_collection(collection='target_tweets'):
        iso_year, iso_week = doc['tweet_date'].isocalendar()[:2]
        week_of_tweet[doc['tweet_id']] = '{}-{:02d}'.format(iso_year, iso_week)

    # all weeks that occur, in chronological order
    weeks = sorted(set(week_of_tweet.values()))

    # NOTE: the original author mentions that a partial week ('2017-31') can show
    # up because the API looks 7 days back; drop it from `weeks` here if only
    # full weeks are wanted.

    # one panel per tweet type, stacked vertically
    fig, axs = plt.subplots(3, 1, figsize=(30, 19))
    axs = axs.ravel()

    for tweet_type in range(3):

        ax = axs[tweet_type]

        # ids of the labelled tweets of this type; a set gives O(1) membership tests
        subset_ids = set(labels[labels[:, 2] == tweet_type][:, 0].tolist())

        # count the sentiment labels per week in a single pass over the tweets
        counts_per_week = dict((week, Counter()) for week in weeks)
        for tweet_id, week in week_of_tweet.items():
            if tweet_id in subset_ids:
                counts_per_week[week][tweet_id_to_label[tweet_id]] += 1

        # rows = weeks, columns = positive / neutral / negative (label codes 2, 1, 0)
        df = pd.DataFrame(
            [[counts_per_week[week][code] for code in (2, 1, 0)] for week in weeks],
            index=weeks,
            columns=label_types
        )

        # plot the stacked bar chart
        df.plot(kind='bar', stacked=True, color=colors, fontsize=24, rot=90,
                width=0.8, linewidth=0, alpha=1., ax=ax)

        # only the bottom panel shows the week labels on the x axis
        if tweet_type != 2:
            ax.set_xticklabels([])

        # y label and panel title
        ax.set_ylabel('Number of tweets', fontsize=24)
        ax.set_title(get_tweet_type_from_code(tweet_type), fontsize=28)

        # reverse the legend so its order matches the stacking order from top to bottom
        handles, legend_labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], legend_labels[::-1], loc="upper left", frameon=False, fontsize=24)

    # trim surrounding white space
    plt.tight_layout()

    # write the figure to disk
    fig.savefig(os.path.join(plot_location, 'sentiment-over-time.pdf'))

    # close this figure explicitly so the next section starts with a clean state
    plt.close(fig)
if __name__ == "__main__" and create_sentiment_by_occupation:
    # Stacked bar charts of sentiment by occupation: one panel per mode of
    # research, showing the share of positive and negative tweets for the 25
    # most frequent occupations, ordered by positive share.
    # The figure is saved to files/plots/sentiment-by-occupation.pdf.
    # The __main__ test keeps this section from running when the file is imported.

    # matches that merely refer to an academic setting and are not an occupation
    not_occupations = frozenset([
        'research', 'science', 'university', 'education', 'college', 'studies', 'sciences',
        'scientific', 'faculty', 'academy', 'academics', 'mathematics', 'health science',
        'higher education', 'doctoral', 'researching', 'researchers', 'undergraduate',
        'graduate', 'neuroscience', 'scientists', 'humanities', 'anthropology',
        'health professional', 'business school'
    ])

    # spelling variants that are merged into a single occupation
    occupation_aliases = {
        'phd': 'phd candidate',
        'phd student': 'phd candidate',
        'phd candidate': 'phd candidate',
        'ph.d.': 'phd candidate',
        'prof': 'professor',
        'prof.': 'professor',
        'post-doc': 'postdoc',
        'post doc': 'postdoc',
    }

    # sentiment counts, nested as tweet type -> occupation -> sentiment -> count
    dic_counts = {}

    for doc in db.read_collection(collection='target_tweets'):

        # inferred label (0 = negative, 1 = neutral, 2 = positive) as text, e.g. 'positive'
        label_text = get_sentiment_label(doc['label'])

        # counts for the tweet type (int./trans./mult.) of this tweet
        type_counts = dic_counts.setdefault(doc['tweet_type'], {})

        for match in doc['matches']:

            # normalise once, so the same key is used for storing and for counting
            occupation = match.strip()

            if occupation in not_occupations:
                continue

            occupation = occupation_aliases.get(occupation, occupation)

            if occupation not in type_counts:
                type_counts[occupation] = {'negative': 0, 'neutral': 0, 'positive': 0}

            type_counts[occupation][label_text] += 1

    # one panel per mode of research, stacked vertically
    fig, axs = plt.subplots(3, 1, figsize=(15, 15))
    axs = axs.ravel()

    sentiment_names = ['negative', 'neutral', 'positive']

    for ax, key in zip(axs, ['interdisciplinary', 'transdisciplinary', 'multidisciplinary']):

        ax.set_title(key, fontsize=16)

        # a mode of research without any occupation match keeps an empty panel
        occupation_counts = list(dic_counts.get(key, {}).items())
        if not occupation_counts:
            logging.warning('No occupation matches found for {}'.format(key))
            continue

        # rows = 'occupation (n=total)', columns = counts per sentiment
        df = pd.DataFrame(
            [[counts[name] for name in sentiment_names] for _, counts in occupation_counts],
            index=['{} (n={})'.format(occupation, sum(counts.values()))
                   for occupation, counts in occupation_counts],
            columns=sentiment_names
        )

        # total number of tweets per occupation
        df['total'] = df.sum(axis=1)

        # positive and negative tweets as a percentage of all tweets of the occupation
        df['positive-percentage'] = df['positive'] / df['total'] * 100.
        df['negative-percentage'] = df['negative'] / df['total'] * 100.

        # keep the 25 most frequent occupations, most positive first
        most_frequent = df.sort_values(by=['total'], ascending=False).head(25)
        df_subset = most_frequent[['positive-percentage', 'negative-percentage']].sort_values(
            by=['positive-percentage'], ascending=False)

        # plot the percentages
        df_subset.plot(kind='bar', stacked=True, color=['#52bf80', '#fe6300'], fontsize=14,
                       rot=45, width=0.8, linewidth=0, ax=ax)

        # anchor the rotated x labels at their right end
        for xtick in ax.get_xticklabels():
            xtick.set_ha('right')

        # fixed y range, so the three panels are directly comparable
        # NOTE(review): bars whose positive + negative share exceeds 75% are cut off
        ax.set_ylim(0, 75)
        ax.set_ylabel('% tweets', fontsize=14)

        # reverse the legend so its order matches the stacking order from top to bottom
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(handles[::-1], ['negative', 'positive'], loc="upper right", frameon=False, fontsize=14)

    # trim surrounding white space
    plt.tight_layout()

    # write the figure to disk
    fig.savefig(os.path.join(plot_location, 'sentiment-by-occupation.pdf'))

    # close this figure explicitly so the next section starts with a clean state
    plt.close(fig)
if __name__ == "__main__" and create_twitter_tokens_bar_plot:
    # Grouped bar charts with the average number of @, URL and emoji per tweet,
    # for each sentiment and each mode of research.
    # The figure is saved to files/plots/frequency-emoji-url-at.pdf.
    # The __main__ test keeps this section from running when the file is imported.

    # lookup table: tweet id -> raw tweet text
    id_to_text = dict((doc['tweet_id'], doc['raw_text'])
                      for doc in db.read_collection(collection='target_tweets'))

    # label codes run 0, 1, 2 = negative, neutral, positive, i.e. the reverse of
    # label_types / colors. Real lists are used because a reversed() iterator
    # can be consumed only once and has no len().
    sentiment_order = label_types[::-1]
    sentiment_colors = colors[::-1]

    # per tweet type: one row [@ rate, URL rate, emoji rate] per label code
    token_rates = {}

    # tweet types are encoded as 0 = interdisciplinary, 1 = transdisciplinary, 2 = multidisciplinary
    for tweet_type in range(3):

        logging.info('Processing tweet type: {}'.format(tweet_type))
        token_rates[tweet_type] = []

        for label_type in range(3):

            logging.info('Processing label: {}'.format(label_type))

            # ids of the tweets with this tweet type and this sentiment
            subset_ids = labels[(labels[:, 2] == tweet_type) & (labels[:, 1] == label_type)][:, 0]

            # texts of exactly those tweets
            subset_tweets = [id_to_text[tweet_id] for tweet_id in subset_ids]

            # running totals for @, URL and emoji
            num_at, num_url, num_emoji = 0, 0, 0

            for text in subset_tweets:
                num_at += text.count('@')
                num_url += text.count('http')
                # every emoji becomes the same placeholder, which is then counted
                text = replace_emojis(text, placeholder_pos='EMOJI', placeholder_neg='EMOJI')
                num_emoji += text.count('EMOJI')

            # average per tweet; an empty subset yields 0.0 instead of a ZeroDivisionError
            normalizer = float(len(subset_tweets)) or 1.0

            token_rates[tweet_type].append(
                [num_at / normalizer, num_url / normalizer, num_emoji / normalizer])

    # one panel per tweet type, sharing the y axis
    fig, axs = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
    axs = axs.ravel()

    for tweet_type in range(3):

        ax = axs[tweet_type]

        # rows = @ / URL / EMOJI, columns = negative / neutral / positive
        df = pd.DataFrame(
            token_rates[tweet_type],
            index=sentiment_order,
            columns=['@', 'URL', 'EMOJI']
        ).transpose()

        # plot the grouped bars
        df.plot(kind='bar', stacked=False, color=sentiment_colors, fontsize=12, rot=0,
                width=0.8, linewidth=0, alpha=1., ax=ax)

        # y label, legend and title
        ax.set_ylabel('Frequency/#tweets', fontsize=12)
        ax.legend(loc="upper right", frameon=False, fontsize=12)
        ax.set_title(get_tweet_type_from_code(tweet_type), fontsize=12)

    # trim surrounding white space
    plt.tight_layout()

    # write the figure to disk
    fig.savefig(os.path.join(plot_location, 'frequency-emoji-url-at.pdf'))

    # close this figure explicitly so no figure is left open
    plt.close(fig)