authors: [Aina Nurmagombetova](https://github.com/anurma) 🤙 [Alina Cherepanova](https://github.com/alinacherepanova) 🙋 [Anya Bataeva](https://github.com/fyzbt) 🤯 [Olya Silyutina](https://github.com/olgasilyutina) 🤩
import emopok
import emoji
import pandas as pd
import numpy as np
import string
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import word2vec
from gensim.models import Word2Vec
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import os
import glob
import json
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings('ignore')
# loading df with twitter and telegram data
df = pd.read_csv('./data/new_df_twi_tg.csv')
df['new_id'] = df.groupby(df.texts.tolist(), sort=False).ngroup() + 1
df_subset = df[['texts', 'new_id']]
unique_df = df_subset.drop_duplicates()
df[['texts', 'emoji', 'new_id']].to_csv("./data/emoji_texts_df.csv", header=['texts', 'emoji', 'index'], index = False)
unique_df.to_csv("./data/unique_emopok.csv", header=['texts', 'index'], index = False)
# text preprocessing and writing into csv file
emopok.preprocess_text(unique_df['texts'], unique_df['new_id'])
clean_data = pd.read_csv('./data/clean_text.csv')
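`preprocess_text` lives in the project's own `emopok` module. A rough sketch of the kind of cleaning it performs (the lemmatizer choice and exact regexes here are assumptions, not the module's actual code):

```python
# a hypothetical re-implementation of the cleaning step: lowercase,
# strip URLs/mentions, keep word tokens, drop stopwords, lemmatize;
# pymorphy2 is an assumption for the Russian lemmatizer
import re
import pandas as pd
import pymorphy2
from nltk.corpus import stopwords

morph = pymorphy2.MorphAnalyzer()
ru_stopwords = set(stopwords.words('russian'))

def preprocess_text_sketch(texts, ids, file_path='./data/clean_text.csv'):
    clean = []
    for text in texts:
        text = re.sub(r'https?://\S+|@\w+', ' ', str(text).lower())
        tokens = re.findall(r'[а-яёa-z]+', text)
        lemmas = [morph.parse(t)[0].normal_form for t in tokens if t not in ru_stopwords]
        clean.append(' '.join(lemmas))
    pd.DataFrame({'clean_texts': clean, 'index': list(ids)}).to_csv(file_path, index=False)
```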
William L. Hamilton, Kevin Clark, Jure Leskovec, and Dan Jurafsky. Inducing Domain-Specific Sentiment Lexicons from Unlabeled Corpora. Proceedings of EMNLP, 2016. arXiv:1606.02820.
# create list of sentences for word2vec model
w2v_sentences = [nltk.word_tokenize(str(i)) for i in tqdm(clean_data['clean_texts'])]
# creating the model and setting values for the various parameters
num_features = 100 # Word vector dimensionality
min_word_count = 5 # Minimum word count
num_workers = 4 # Number of parallel threads
context = 5 # Context window size
iterations = 20
w2v_model = emopok.train_word2vec(w2v_sentences, num_workers, num_features, min_word_count, context, iterations,
                                  file_path = './models/emopok_w2v_model')
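`train_word2vec` is a thin wrapper around gensim; a minimal sketch of what it plausibly does with the parameters above, assuming the pre-4.0 gensim API (`size`/`iter`):

```python
# a plausible equivalent of emopok.train_word2vec (gensim < 4.0 API)
from gensim.models import Word2Vec

def train_word2vec_sketch(sentences, workers, size, min_count, window, iterations, file_path):
    model = Word2Vec(sentences, workers=workers, size=size,
                     min_count=min_count, window=window, iter=iterations)
    model.save(file_path)
    print(f'Model was successfully saved with a name {file_path}')
    return model
```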
w2v_model = Word2Vec.load("./models/emopok_w2v_model")
w2v_model.similar_by_word('привет')
[('приветик', 0.7054356336593628),
('апрелька', 0.49257901310920715),
('привееет', 0.4709334373474121),
('делишки', 0.4653734862804413),
('тезка', 0.4564298689365387),
('весточка', 0.4510449171066284),
('сестричка', 0.4470579922199249),
('шмуль', 0.4408361613750458),
('приветствие', 0.4390951693058014),
('денисовна', 0.43712013959884644)]
# get stats on word counts
flat_sentences = [item for sublist in w2v_sentences for item in sublist]
words_count = Counter(flat_sentences)
words_count = pd.DataFrame.from_dict(words_count, orient='index').reset_index()
# get input file for socialsent
words_count.columns = ['lem_word', 'count']
words = []
vectors = []
for word in tqdm(words_count['lem_word']):
    try:
        vectors.append(list(w2v_model[word]))
        words.append(word)
    except KeyError:
        # words below min_word_count have no vector; skip them
        continue
# save word2vec output to txt file for socialsent
# script for sentiment analysis is here https://github.com/olgasilyutina/socialsent3/blob/master/example.ipynb
data_vect = pd.DataFrame(vectors, columns=list(range(num_features)))
data_vect.index = words
data_vect['lem_word'] = words
data_vect = data_vect.drop('lem_word', axis = 1).reset_index()
data_vect.to_csv('./data/data_emopok.txt', header = None, index = None, sep = ' ', mode = 'w')
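socialsent implements SentProp from Hamilton et al.: sentiment spreads from small positive and negative seed sets via a random walk over a nearest-neighbor graph built on the word vectors saved above. A toy version of that propagation (seed handling and hyperparameters are illustrative, not the socialsent defaults):

```python
# toy SentProp-style label propagation over word vectors
import numpy as np

def _walk(T, seed_mask, beta=0.9, n_iter=50):
    # personalized random walk with restart probability (1 - beta);
    # assumes at least one seed word is present in the vocabulary
    s = seed_mask / seed_mask.sum()
    p = s.copy()
    for _ in range(n_iter):
        p = beta * (T.T @ p) + (1 - beta) * s
    return p

def sentprop_sketch(words, vectors, pos_seeds, neg_seeds, k=10):
    V = np.asarray(vectors, dtype=float)
    V /= np.linalg.norm(V, axis=1, keepdims=True)
    sims = V @ V.T                                  # cosine similarities
    for row in sims:
        row[np.argsort(row)[:-k]] = 0.0             # keep only k nearest neighbors
    T = sims / sims.sum(axis=1, keepdims=True)      # row-stochastic transition matrix
    pos = _walk(T, np.isin(words, pos_seeds).astype(float))
    neg = _walk(T, np.isin(words, neg_seeds).astype(float))
    return dict(zip(words, pos - neg))              # polarity score per word
```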
# load results of socialsent model
sent_dfs = []
for file in glob.glob("./data/polarities/*.json"):
    with open(file) as f:
        data = json.load(f)
    sent_dfs.append(pd.DataFrame(data, index=[0]).T.reset_index())
sent_df = pd.concat(sent_dfs)
sent_df.columns = ['word', 'sent']
sent_df = sent_df.reset_index().drop('index', axis=1).drop_duplicates()
sent_df = sent_df[sent_df['sent'] != 0]
# calculate a sentiment score for every text
df_sent = pd.DataFrame({'doc': w2v_sentences})
df_sent = df_sent.reset_index()
df_sent['index'] = df_sent['index'] + 1
df_sent = df_sent.set_index(['index'])['doc'].apply(pd.Series).stack()
df_sent = df_sent.reset_index()
df_sent = df_sent.drop('level_1', axis=1)
df_sent.columns = ['index', 'word']
df_sent = df_sent.merge(sent_df, on=['word'], how='left')
df_sent = pd.DataFrame(df_sent.groupby('index').sent.sum()).reset_index()
df_sent.columns = ['new_id', 'sent']
df_sent_texts = df_sent.merge(unique_df, on = 'new_id')
df_sent_texts.to_csv('./data/sentiments_emopok.csv', header=['index', 'sent', 'texts'], index = False)
# get text features
textfeatures_df = []
for text, index in tqdm(zip(unique_df['texts'], unique_df['new_id'])):
    textfeatures_df.append(emopok.textfeatures(text, index))
textfeatures_df = pd.concat(textfeatures_df)
textfeatures_df.describe()
| | index | n_chars | n_commas | n_digits | n_exclaims | n_hashtags | n_lowers | n_mentions | n_urls | n_words | n_nonasciis | n_uppers |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 | 357077 |
| mean | 178539.000000 | 70.611608 | 0.657847 | 1.122324 | 0.183358 | 0.053823 | 50.520840 | 0.604973 | 0.179992 | 26.987843 | 45.531244 | 4.089048 |
| std | 103079.395373 | 74.893091 | 1.272483 | 5.267934 | 0.864982 | 0.339021 | 53.307267 | 1.564861 | 0.446786 | 36.221020 | 51.563273 | 8.583359 |
| min | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 25% | 89270 | 26 | 0 | 0 | 0 | 0 | 18 | 0 | 0 | 8 | 14 | 1 |
| 50% | 178539 | 49 | 0 | 0 | 0 | 0 | 34 | 0 | 0 | 20 | 29 | 2 |
| 75% | 267808 | 90 | 1 | 1 | 0 | 0 | 65 | 1 | 0 | 35 | 58 | 4 |
| max | 357077 | 4056 | 115 | 1492 | 62 | 21 | 3154 | 50 | 23 | 3474 | 3276 | 511 |
textfeatures_df.to_csv('./data/textfeatures_emopok.csv', index = False)
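`emopok.textfeatures` mirrors the column names of the R textfeatures package: plain surface counts per text. A sketch of the counters behind the columns above (the regexes are assumptions):

```python
# hypothetical re-implementation of the per-text counters
import re
import pandas as pd

def textfeatures_sketch(text, index):
    text = str(text)
    return pd.DataFrame([{
        'index': index,
        'n_chars': len(text),
        'n_commas': text.count(','),
        'n_digits': sum(c.isdigit() for c in text),
        'n_exclaims': text.count('!'),
        'n_hashtags': len(re.findall(r'#\w+', text)),
        'n_lowers': sum(c.islower() for c in text),
        'n_mentions': len(re.findall(r'@\w+', text)),
        'n_urls': len(re.findall(r'https?://\S+', text)),
        'n_words': len(text.split()),
        'n_nonasciis': sum(ord(c) > 127 for c in text),
        'n_uppers': sum(c.isupper() for c in text),
    }])
```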
Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. Proceedings of the 31st International Conference on Machine Learning, 2014.
# preprocess text doc2vec representations
emopok.preprocess_text(unique_df['texts'], unique_df['new_id'], lemmatize = True, stopwords = False,
                       russian_only = False, file_path = './data/d2v_clean_data.csv')
d2v_clean_data = pd.read_csv('./data/d2v_clean_data.csv')
# create list of sentences for doc2vec model
d2v_sentences = [nltk.word_tokenize(str(i)) for i in tqdm(d2v_clean_data['clean_texts'])]
d2v_model, all_vectors = emopok.train_doc2vec(d2v_sentences, 100, 5, 10, 5, save_model_to = './models/emopok_d2v_model')
Model was successfully saved with a name ./models/emopok_d2v_model
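`train_doc2vec` presumably wraps gensim's `Doc2Vec` over `TaggedDocument`s. A sketch under that assumption; the meaning of the positional arguments in the call above is inferred, not documented:

```python
# a plausible equivalent of emopok.train_doc2vec (gensim 3.x API)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def train_doc2vec_sketch(sentences, size, window, epochs, min_count, save_model_to):
    docs = [TaggedDocument(words, [i]) for i, words in enumerate(sentences)]
    model = Doc2Vec(docs, vector_size=size, window=window,
                    min_count=min_count, epochs=epochs, workers=4)
    model.save(save_model_to)
    all_vectors = [model.docvecs[i] for i in range(len(docs))]  # one vector per text
    return model, all_vectors
```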
all_vectors_df = pd.DataFrame(all_vectors)
all_vectors_df.columns = ['d2v_' + str(col) for col in all_vectors_df.columns]
all_vectors_df.to_csv('./data/d2v_vectors_emopok.csv', index = False)
David M. Blei, Andrew Y. Ng, and Michael I. Jordan. Latent Dirichlet Allocation. Journal of Machine Learning Research, vol. 3 (Jan), 2003, pp. 993-1022.
num_topics = 20
sentences = w2v_sentences
corpus = emopok.get_lda_model(sentences, num_topics, file_path = './models/emopok_lda_model')
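`get_lda_model` presumably builds a gensim dictionary plus bag-of-words corpus and fits `LdaModel`; a minimal sketch under that assumption (the `passes` and `random_state` values are illustrative):

```python
# a plausible equivalent of emopok.get_lda_model using gensim
from gensim import corpora
from gensim.models import LdaModel

def get_lda_model_sketch(sentences, num_topics, file_path):
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(doc) for doc in sentences]
    lda = LdaModel(corpus=corpus, id2word=dictionary,
                   num_topics=num_topics, passes=10, random_state=42)
    lda.save(file_path)
    return corpus
```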
lda_model_emo = emopok.load("./models/emopok_lda_model")
topics_df = emopok.get_topics_for_docs(corpus, lda_model_emo, 20, unique_df['texts'])
topics_df = pd.read_csv('./data/topics_df.csv')
dum_topics = pd.get_dummies(topics_df['topic'])
dum_topics.columns = ['topic_' + str(col) for col in dum_topics.columns]
dum_topics.to_csv('./data/dum_topics_emopok.csv', index = False)
emoji_texts = []
unique_emojies = list(df.emoji.unique())
for text in tqdm(unique_df['texts']):
    emoji_texts.append(emopok.get_emoji_sentences(text, unique_emojies))
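`get_emoji_sentences` turns each text into a token list in which every emoji is its own token, so the second word2vec model below can learn emoji vectors from the words around them. A hypothetical version:

```python
# hypothetical re-implementation: split on emojis, keep them as tokens;
# assumes `emojis` is a list of emoji strings
import re

def get_emoji_sentences_sketch(text, emojis):
    pattern = '(' + '|'.join(re.escape(e) for e in emojis) + ')'
    tokens = []
    for part in re.split(pattern, str(text).lower()):
        if part in emojis:
            tokens.append(part)                  # emoji as a standalone token
        else:
            tokens.extend(re.findall(r'\w+', part))
    return tokens
```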
# creating the model and setting values for the various parameters
num_features = 1000 # Word vector dimensionality
min_word_count = 50 # Minimum word count
num_workers = 4 # Number of parallel threads
context = 2 # Context window size
iterations = 20
w2v_emoji_model = emopok.train_word2vec(emoji_texts, num_workers, num_features, min_word_count, context, iterations,
                                        file_path = './models/emopok_w2v_emoji_model')
Model was successfully saved with a name ./models/emopok_w2v_emoji_model
w2v_emoji_model = Word2Vec.load("./models/emopok_w2v_emoji_model")
w2v_emoji_model.similar_by_word('💞')
[('💝', 0.8745415210723877),
('💟', 0.8650382161140442),
('💘', 0.8625384569168091),
('💓', 0.8616468906402588),
('💖', 0.830069899559021),
('❣', 0.8249319791793823),
('💗', 0.8223015069961548),
('💕', 0.8110256195068359),
('💌', 0.787800669670105),
('💋', 0.7590600252151489)]
unique_emojies = df.groupby('emoji').texts.count().reset_index().sort_values('texts')
unique_emojies = unique_emojies[unique_emojies['texts'] >= 50].emoji.tolist()
emojis_found = [e for e in unique_emojies if e in w2v_emoji_model.wv.vocab]
X = [w2v_emoji_model.wv[e] for e in unique_emojies if e in w2v_emoji_model.wv.vocab]
emopok.search_for_kmeans(30, X)
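`search_for_kmeans` evidently scans candidate values of k; a sketch using the silhouette score imported at the top (the exact output format is a guess):

```python
# a plausible equivalent of emopok.search_for_kmeans:
# fit KMeans for each k and report the silhouette score, higher being better
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def search_for_kmeans_sketch(max_k, X):
    for k in range(2, max_k + 1):
        labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
        print(k, round(silhouette_score(X, labels), 3))
```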
emo_clusters = emopok.train_kmeans(26, X, emojis_found, save_to = './data/emopok_clusters.csv')
# re-cluster clusters 15 and 16 separately
emo_clusters = pd.read_csv('./data/emopok_clusters.csv')
emo_clusters = emo_clusters.astype(str)
cluster = '15'
unique_emojies = emo_clusters[emo_clusters.cluster_group == cluster]['index'].tolist()
emojis_found = [e for e in unique_emojies if e in w2v_emoji_model.wv.vocab]
X = [w2v_emoji_model.wv[e] for e in unique_emojies if e in w2v_emoji_model.wv.vocab]
emopok.search_for_kmeans(15, X)
emo_clusters = emopok.train_kmeans(12, X, emojis_found, save_to = f'./data/emopok_clusters_{cluster}.csv')
for i in list(emo_clusters[['cluster_group']].drop_duplicates()['cluster_group']):
    print(emo_clusters[emo_clusters.cluster_group == i])
# the same re-clustering steps are repeated with cluster = '16' to produce emopok_clusters_16.csv
emo_clusters = pd.read_csv('./data/emopok_clusters.csv')
emo_clusters = emo_clusters[~emo_clusters['cluster_group'].isin([15, 16])]
emo_clusters_15 = pd.read_csv('./data/emopok_clusters_15.csv')
emo_clusters_15['cluster_group'] = '15_' + emo_clusters_15['cluster_group'].astype(str)
emo_clusters_16 = pd.read_csv('./data/emopok_clusters_16.csv')
emo_clusters_16['cluster_group'] = '16_' + emo_clusters_16['cluster_group'].astype(str)
emo_clusters = pd.concat([emo_clusters, emo_clusters_15, emo_clusters_16])
emo_clusters.to_csv('./data/emopok_clusters.csv', index = False)
emo_clusters.groupby('cluster_group').count().reset_index().sort_values('index', ascending = False)