-
Notifications
You must be signed in to change notification settings - Fork 1
/
tf_isf.py
109 lines (95 loc) · 2.99 KB
/
tf_isf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
import math
import decimal
from collections import OrderedDict
# Module-level list, initialised empty; tf_isf() neither reads nor writes it.
# NOTE(review): presumably kept for code that imports this module -- confirm
# before removing.
tf_isf_feature = []
def _group_sentences_by_doc(m_d, total_files):
    """Split sentences into per-document lists of (sentence_id, words).

    A sentence belongs to document ``n`` when ``str(sentence_id)`` starts
    with ``"<n>."``.  The result has one entry per document number
    ``1..total_files`` (in that order); sentences inside a document keep the
    iteration order of ``m_d``.  Ids with no dot, or whose prefix is not one
    of those document numbers, are left out.
    """
    by_prefix = {}
    for sent_id, text in m_d.items():
        words = text.split()
        # Everything before the first '.' is the document number, so one
        # pass over m_d replaces a full rescan per document.
        prefix, dot, _ = str(sent_id).partition('.')
        if dot:
            by_prefix.setdefault(prefix, []).append((sent_id, words))
    return [by_prefix.get(str(number), [])
            for number in range(1, total_files + 1)]


def _normalized_tf(words):
    """Return {word: count / highest count in the sentence} for one sentence."""
    from collections import Counter

    counts = Counter(words)
    if not counts:
        return {}
    peak = float(max(counts.values()))
    return {word: count / peak for word, count in counts.items()}


def _sentence_isf(tf_dicts):
    """Return {word: log10(N / (1 + sf))} for one document.

    ``N`` is the number of sentences in the document and ``sf`` the number
    of those sentences that contain the word.  The ``1 +`` in the denominator
    means a word present in every sentence gets a slightly negative weight.
    """
    from collections import Counter

    n_sentences = len(tf_dicts)
    # Iterating a tf dict yields each distinct word once, so this counts
    # sentences containing the word, not total occurrences.
    sentence_freq = Counter(word for tf in tf_dicts for word in tf)
    return {word: math.log(n_sentences / float(1 + sf), 10)
            for word, sf in sentence_freq.items()}


def tf_isf(m_d, total_files):
    """Score every sentence by its normalised TF-ISF weight.

    Parameters:
        m_d: mapping of sentence id -> sentence text.  Text is tokenised with
            ``str.split()``.  A sentence is assigned to document ``n`` when
            the string form of its id starts with ``"<n>."`` (e.g. ``"3.7"``
            belongs to document 3).
        total_files: number of documents; documents are numbered from 1 to
            ``total_files`` inclusive.

    Returns:
        An ``OrderedDict`` mapping sentence id -> score, ordered by document
        number and, within a document, by the iteration order of ``m_d``.
        For each sentence::

            tf(w)  = count of w in the sentence / highest count in the sentence
            isf(w) = log10(sentences in the document / (1 + sentences containing w))
            score  = sum(tf(w) * isf(w)) / (number of distinct words + 1)

        The score is passed through ``round(decimal.Decimal(score), 4)``
        (a ``Decimal`` with four places on Python 3).  Sentences whose id
        matches no document in range are omitted; a sentence with no words
        scores 0.  ``m_d`` is not modified.
    """
    ans = OrderedDict()
    for doc in _group_sentences_by_doc(m_d, total_files):
        tf_dicts = [_normalized_tf(words) for _, words in doc]
        isf = _sentence_isf(tf_dicts)
        for (sent_id, _), tf in zip(doc, tf_dicts):
            total = sum((weight * isf[word] for word, weight in tf.items()),
                        0.0)
            # +1 keeps the division defined for a sentence with no words.
            score = total / (len(tf) + 1.0)
            ans[sent_id] = round(decimal.Decimal(score), 4)
    return ans  # sentence id -> normalised total tf-isf score