-
Notifications
You must be signed in to change notification settings - Fork 1
/
geniusScrapping.py
54 lines (43 loc) · 1.65 KB
/
geniusScrapping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import lyricsgenius
import pandas as pd
import sys
import progressbar
def get_lyrics(token):
    """Scrape song lyrics from Genius for a fixed list of genre tags.

    For every genre tag the function walks the paginated tag listing
    (``genius.tag``) until ``next_page`` is falsy, fetches the lyrics of
    each hit by URL, and collects one row per song.  The rows are written
    to ``lyrics.csv`` in the current working directory, using ``#`` as
    the field separator, with the columns
    ``artist, title, lyrics, genre, url``.

    The CSV is rewritten after each genre and once more when the function
    exits, including when it exits because of an exception or Ctrl-C, so
    a failure part-way through the scrape keeps the songs collected so
    far.  The exception itself still propagates to the caller.

    Args:
        token: Genius API access token passed to ``lyricsgenius.Genius``.

    Returns:
        None.  The result is the ``lyrics.csv`` file.
    """
    genius = lyricsgenius.Genius(token, timeout=400)
    genius.verbose = False  # Turn off status messages
    # Remove section headers (e.g. [Chorus]) from lyrics when searching
    genius.remove_section_headers = True
    # Include hits thought to be non-songs (e.g. track lists)
    genius.skip_non_songs = False
    # Exclude songs with these words in their title
    genius.excluded_terms = ["(Live)"]

    genres = ['rap', 'hip-hop', 'country',
              'rock', 'pop', 'r-b', 'metal', 'jazz']
    columns = ['artist', 'title', 'lyrics', 'genre', 'url']
    data = []

    # The bar is sized on an estimate of 1000 songs per genre; the real
    # number of hits is only known once every page has been walked.
    expected_total = len(genres) * 1000
    widgets = ['Getting lyrics: ', progressbar.Percentage(), ' ',
               progressbar.Bar(marker='=', left='[', right=']'),
               ' ', progressbar.ETA(), ' ', progressbar.FileTransferSpeed()]
    bar = progressbar.ProgressBar(widgets=widgets, maxval=expected_total)

    def save():
        # Dump everything collected so far; overwrites any previous file.
        df = pd.DataFrame(data, columns=columns)
        df.to_csv('lyrics.csv', index=False, sep='#')

    bar.start()
    try:
        for genre in genres:
            page = 1
            while page:
                res = genius.tag(genre, page=page)
                for hit in res['hits']:
                    # Clamp to the estimate: the progress bar rejects
                    # values above maxval, which would abort the scrape.
                    bar.update(min(len(data) + 1, expected_total))
                    song_lyrics = genius.lyrics(song_url=hit['url'])
                    artists = hit['artists']
                    # A hit without any listed artist gets an empty cell
                    # instead of raising IndexError.
                    main_artist = artists[0] if artists else None
                    data.append([main_artist, hit['title'],
                                 song_lyrics, genre, hit['url']])
                page = res['next_page']
            # Checkpoint after each genre so a hard kill loses at most
            # the genre currently being scraped.
            save()
    finally:
        # Also runs on errors and KeyboardInterrupt so partial results
        # from an interrupted genre are kept.
        save()
    bar.finish()
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the Genius API token.
    if len(sys.argv) != 2:
        print("Usage: python3 geniusScrapping.py <token>")
        sys.exit(1)
    get_lyrics(sys.argv[1])
    # Exit status 0 signals success to the calling shell; a non-zero
    # status is reserved for the usage error above and for exceptions.
    sys.exit(0)