-
Notifications
You must be signed in to change notification settings - Fork 0
/
similarity.py
53 lines (37 loc) · 2.03 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Python script to determine Euclidean Distance between the options
from math import *
import numpy
def euclidean_distance(x,y):
return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))
from pandas import *
alternatives = ['AppEEARS','Field Campaign Explorer (FCX)','OpenAltimetry','Data Access Tool','Operation IceBridge Portal','Vertex','Earthdata Search','SOOT','Airborne Data Visualizer','Giovanni','Worldview']
data = read_csv('acw-scores-fitness_only.csv')
vectors = {}
for x in alternatives:
vectors[x] = data[x].tolist()
results = numpy.zeros((11, 11))
# Iterate over the whole things
for i,k in enumerate(vectors.keys()):
for j, alternative in enumerate(alternatives):
if k == alternative:
score = 0
else:
# ok, normally euclidean distance is the same regardless of rodering, but in the case
# where a tool has more functionality than another tool, then we want the scores to be
# different. So we take the tool we're examining and remove all '0's, or capabilities
# it doesn't have, remove those from the comparison tool, and then calculate 'distance'.
# In the case of higher overlaps, we get a smaller number. When we compute the inverse
# relationship- the tool with many capabilities and the comparison tool without them,
# there will be A) more comparisons and B) a larger resulting number, signifying that
# it's harder to converge one way vs the other.
idxlist = numpy.nonzero(vectors[k])[0]
temp_vector = [vectors[k][idx] for idx in idxlist]
temp_alternative = [vectors[alternative][idx] for idx in idxlist]
score = euclidean_distance(temp_vector,temp_alternative)
results[i][j] = score
# print(results)
df = DataFrame(results, alternatives, alternatives)
#df = DataFrame(results, columns =alternatives, rows=alternatives)
print("\nPandas DataFrame: ")
print("Find a tool by the column, and the numbers indicate how far away the column-based tool is from meeting the row-based tools functionalities. Lower means closer to meeting its needs")
print(df)