-
Notifications
You must be signed in to change notification settings - Fork 0
/
Nltk_summarization_nyt.py
86 lines (63 loc) · 2.33 KB
/
Nltk_summarization_nyt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from collections import Counter
from heapq import nlargest
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
"""
Module that contains a class that builds a frequency-based extractive summary of an article
"""
# Treat the newline character as punctuation too when filtering tokens.
punctuation += '\n'
class Text_Summarizer:
    """
    Frequency-based extractive summarizer for a single article.

    Each sentence of the article is scored by summing the normalised
    frequencies of the words it contains.  Stop words and tokens made up
    only of punctuation are ignored, and matching is case-insensitive.
    The highest-scoring sentences are joined to form the summary.

    Attributes
    ----------
    reference : str
        Abstract returned by ``obj.getAbstract()``, stripped and with
        newlines replaced by spaces.
    summary : str
        Up to ``MAX_SENTENCES`` top-scoring sentences joined by single
        spaces, highest score first.  Empty string when the text contains
        no scorable words.

    Methods
    -------
    get_Summary()
        Return summary

    Example
    -------
    To get summary:
        Text_Summarizer_Object = Text_Summarizer(obj,fullText)
        summary = Text_Summarizer_Object.get_Summary()
    """

    # Number of top-scoring sentences kept in the summary.
    MAX_SENTENCES = 4

    def __init__(self, obj, inputText=None):
        """
        Build the summary of ``inputText`` and store the article abstract.

        Parameters
        ----------
        obj : object
            Must expose ``getAbstract()`` returning a string.
        inputText : str or None
            Full text of the article.  ``None`` or an empty string yields
            an empty summary instead of raising.
        """
        text = (inputText or "").strip().replace("\n", " ")
        # A set gives O(1) membership tests for every token.
        stop_words = set(stopwords.words('english'))

        word_frequencies = self._word_frequencies(word_tokenize(text), stop_words)

        # Sentences are tokenised the same way as the whole text so that
        # words followed by punctuation ("word,") still match their
        # frequency entry.
        sentence_scores = {}
        for sent in sent_tokenize(text):
            score = self._sentence_score(word_tokenize(sent), word_frequencies)
            if score > 0:
                # A sentence that occurs more than once accumulates its score.
                sentence_scores[sent] = sentence_scores.get(sent, 0) + score

        self.reference = obj.getAbstract().strip().replace("\n", " ")
        top_sentences = nlargest(
            self.MAX_SENTENCES, sentence_scores, key=sentence_scores.get
        )
        self.summary = ' '.join(top_sentences)

    @staticmethod
    def _word_frequencies(tokens, stop_words):
        """
        Return ``{lowercase word: count / highest count}`` for ``tokens``.

        Tokens found in ``stop_words`` and tokens consisting solely of
        punctuation characters are skipped.  Returns an empty dict when
        nothing is left to count.
        """
        counts = Counter(
            word
            for word in (token.lower() for token in tokens)
            # ``strip`` leaves an empty string for all-punctuation tokens.
            if word not in stop_words and word.strip(punctuation)
        )
        if not counts:
            return {}
        max_frequency = max(counts.values())
        return {word: count / max_frequency for word, count in counts.items()}

    @staticmethod
    def _sentence_score(tokens, word_frequencies):
        """Return the sum of the frequencies of the known words in ``tokens``."""
        return sum(word_frequencies.get(token.lower(), 0) for token in tokens)

    def get_Summary(self):
        """Return the summary computed at construction time."""
        return self.summary