-
Notifications
You must be signed in to change notification settings - Fork 0
/
theARC.py
297 lines (199 loc) · 6.99 KB
/
theARC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#Jacob Pawlak
#February 25th, 2018
#Go Blue Team!
################################ IMPORTS ################################
#get the nltk library
import nltk
#Use OS and glob library to move through the computer's filesystem
import os
import glob
#To convert the ARC dictionary into a JSON object
import json
################################ GLOBAL VARIABLES ################################
#list of song titles (deduplicated, filled by clean_files)
song_titles = []
#list of album titles (one per subdirectory of Albums/, filled by clean_files)
album_titles = []
#dictionary mapping each album title to the list of its song titles
albums = {}
#whole dictionary that will get turned into a json file at the end of main()
ARC = {}
#dictionary of words (not the NLTK Token list) this list does not include the part of speech or anything like that, just the word count
list_of_words = {}
#shape:
#{
#   "word": count, "word": count, etc
#}
#dictionary of tokenized words keyed by "word/POS" (includes POS and count)
nltk_list_of_words = {}
#shape:
#{
#   "word/pos": {"word": WORD, "pos": POS, "count": count }
#}
#dictionary of hapax legomena (words that occur exactly once in the corpus)
hapax_legomena = {}
#shape:
#{
#   "word/pos": 1, "word/pos": 1, etc
#}
#longest word(s) because Why not, i'm sure it will be interesting.
#NOTE(review): a `global` statement at module scope is a no-op — it neither
#declares nor initializes anything; longest_word only ever exists as the
#return value of find_longest_word().
global longest_word
#shape:
#{
#   "word": count, "word": count, etc
#}
################################ HELPER-FUNCTIONS ################################
#Helper: attach the album->songs map to the ARC
def arc_helper_albums():
    """Store the global albums dict under the ARC key 'albums_with_songs'."""
    ARC['albums_with_songs'] = albums
#Helper: attach the plain word-count map to the ARC
def arc_helper_list_of_words():
    """Store the global list_of_words dict under the ARC key 'list_of_words'."""
    ARC['list_of_words'] = list_of_words
#Helper: attach the POS-tagged word-count map to the ARC
def arc_helper_nltk_list_of_words():
    """Store the global nltk_list_of_words dict under the ARC key 'nltk_list_of_words'."""
    ARC['nltk_list_of_words'] = nltk_list_of_words
#Helper: attach the hapax-legomena map to the ARC
def arc_helper_hapax_legomena():
    """Store the global hapax_legomena dict under the ARC key 'hapax_legomena'."""
    ARC['hapax_legomena'] = hapax_legomena
#Helper: compute the longest word(s) and attach them to the ARC
def arc_helper_longest_word():
    """Store the result of find_longest_word() under the ARC key 'longest_word'."""
    ARC['longest_word'] = find_longest_word()
################################ SUB-FUNCTIONS ################################
#Function that searches the project directory for the albums and songs that I have scraped from the ol' internet
def clean_files():
    """Walk the Albums/ directory tree and ingest every *.song file.

    For each album subdirectory, records the album title and its song
    titles, then tokenizes every non-blank lyric line with NLTK, strips
    unwanted leading punctuation from each token, and feeds the cleaned
    tokens into the global word dictionaries.

    Side effects: populates album_titles, song_titles, albums,
    list_of_words and nltk_list_of_words; restores the starting working
    directory before returning.
    """
    #characters that should never begin a token; built once instead of
    #once per lyric line as before
    bad_leading = ('(', ')', '[', ']', '{', '}', '`', '\"')
    #change the current directory to 'Albums'
    os.chdir("Albums")
    #each subdirectory of Albums is one album
    for subd in os.listdir():
        os.chdir(subd)
        if subd not in album_titles:
            album_titles.append(subd)
        #temp list of this album's songs, used in the albums dict
        songs = []
        #each song file looks like Blah.song
        for file in glob.glob("*.song"):
            stem = file[:-5]  #drop the ".song" extension (computed once)
            songs.append(stem)
            if stem not in song_titles:
                song_titles.append(stem)
            #break the songs down to tokenized words and add them to the word lists;
            #the with-block guarantees the file handle is closed even on error
            with open(file, 'r') as open_file:
                for line in open_file:
                    if line != "\n":
                        tokens = nltk.word_tokenize(line)
                        stripped = []
                        for token in tokens:
                            #peel bad characters off the front of the token one at a time
                            while token[:1] in bad_leading:
                                token = token[1:]
                            stripped.append(token)
                        #drop empty strings and the tokenizer's '' artifact in one pass
                        cleaned_tokens = [t for t in stripped if t and t != "\'\'"]
                        fill_list_of_words(cleaned_tokens)
                        fill_nltk_list_of_words(nltk.pos_tag(cleaned_tokens))
        #add this album's songs to the albums dict
        albums[subd] = songs
        #go back up to Albums/ for the next album
        os.chdir("..")
    #and back up to the project root
    os.chdir("..")
    return
#Function to fill the list_of_words dictionary (this will not split by part of speech, just the tokens)
def fill_list_of_words(tokens):
    """Increment the global list_of_words count for each token, lowercased.

    tokens: iterable of token strings (already cleaned by clean_files).
    """
    for token in tokens:
        #lowercase once instead of up to three times per token
        word = token.lower()
        #dict.get replaces the `in dict.keys()` check and does a single lookup
        list_of_words[word] = list_of_words.get(word, 0) + 1
    return
#Function to fill the nltk_list_of_words dictionary, this will tokenize and pick part of speech for each word
def fill_nltk_list_of_words(tokens):
    """Tally (word, POS) pairs into the global nltk_list_of_words dict.

    tokens: iterable of (word, pos) tuples as produced by nltk.pos_tag.
    Each distinct pair is keyed "word/pos" and maps to
    {"word": ..., "pos": ..., "count": ...}.
    """
    for word, pos in tokens:
        #NOTE: the old code built an unused temp_dict every iteration; removed
        temp_key = str(word) + "/" + str(pos)
        entry = nltk_list_of_words.get(temp_key)
        if entry is None:
            nltk_list_of_words[temp_key] = {"word": word, "pos": pos, "count": 1 }
        else:
            entry["count"] += 1
    return
#Function to look through the nltk_list_of_words and pick out the single occurrence words
def find_hapax_legomena():
    """Record every "word/pos" key whose count is exactly 1 in hapax_legomena."""
    for key, entry in nltk_list_of_words.items():
        if entry["count"] == 1:
            hapax_legomena[key] = 1
    return
#Combs through list_of_words and picks out the longest word(s)
def find_longest_word():
    """Return a dict mapping the longest word(s) in list_of_words to their counts.

    Returns an empty dict when list_of_words is empty.

    BUG FIX: the old version seeded the result with a fake {"a": 1} entry;
    that entry (with its bogus count) leaked into the returned dict whenever
    the longest real word was a single character, and was returned verbatim
    for an empty corpus.
    """
    if not list_of_words:
        return {}
    #length of the longest key, computed once
    max_len = max(len(word) for word in list_of_words)
    #every word that ties for that length, with its real count
    return {word: count for word, count in list_of_words.items() if len(word) == max_len}
#You know what it is. Fill. The. ARC.
def fill_the_arc():
    """Populate every top-level section of the ARC dictionary."""
    for helper in (arc_helper_albums,
                   arc_helper_list_of_words,
                   arc_helper_nltk_list_of_words,
                   arc_helper_hapax_legomena,
                   arc_helper_longest_word):
        helper()
    return
################################ MAIN-FUNCTION ################################
def main():
    """Build the ARC from the scraped album files, print a summary, and
    write it to the_ARC.json."""
    #scan in and clean the files
    clean_files()
    find_hapax_legomena()
    #fill the ARC, starting with albums
    fill_the_arc()
    #print out the ARC in dictionary form
    print("The ARC: \n")
    print(ARC)
    print("#Words total")
    print(len(nltk_list_of_words))
    print("#Hapax")
    print(len(hapax_legomena))
    print("Longest word")
    print(list(ARC["longest_word"].keys())[0])
    #transform the dictionary into the JSON object the_ARC.json
    #BUG FIX: the old code wrote str(ARC) — Python's repr with single quotes,
    #which is NOT valid JSON despite the filename. json.dump produces a
    #parseable file, and the with-block guarantees the handle is closed.
    with open("the_ARC.json", 'w') as json_file:
        json.dump(ARC, json_file)
    return
#script entry point: builds and dumps the ARC as soon as the module runs
#NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
#this module triggers the whole pipeline — confirm that is intended.
main()