-
Notifications
You must be signed in to change notification settings - Fork 4
/
Clustering_DBSCAN.py
152 lines (141 loc) · 5.5 KB
/
Clustering_DBSCAN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
'''
Author: Haoyang Ye (Zoe)
'''
import numpy as np
from sklearn.cluster import DBSCAN
from astropy.table import Table
import matplotlib.pyplot as plt
def find_center(Data, Flux, atom_max, min_samples, eps):
    """
    Find the cluster centers for given data using DBSCAN package, print a
    summary of the result and show it as a scatter plot.

    Only the core samples of a cluster are used for its members, centre and
    deviation; points labelled -1 by DBSCAN are reported as outliers.

    Parameters
    ----------
    Data: input data, it should be a n*2 array
    Flux: array with one value per row of Data; it is split per cluster with
        the same masks as Data
    atom_max: the MCMC results determine the maximum number of the atoms that
        can be found. It is not used inside this function.
    min_samples: sets the value 'min_samples' in DBSCAN, which determines
        the minimum grouping sample number
    eps: the maximum distance between two samples for them to be considered
        as in the same neighborhood

    Returns
    -------
    atom: the number of clusters picked by dbscan
    xy: list with, for each cluster, the core samples of Data in that cluster
    flux: list with, for each cluster, the Flux values of those core samples
    noise: data that are not selected into any clusters
    labels: flags/labels that determine which data belongs to which cluster
    centers: cluster centre location (x_i,y_i) of each cluster i
        (mean of its core samples)
    widths: deviation of each cluster (np.std of its core samples, per axis)
    """
    # Pass the parameters by keyword: recent scikit-learn releases accept
    # every DBSCAN argument after eps by keyword only, so the positional
    # form DBSCAN(eps, min_samples) raises TypeError there.
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(Data)
    labels = db.labels_
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    # Label -1 marks noise and is not counted as a cluster.
    atom = len(set(labels)) - (1 if -1 in labels else 0)  # number of atoms
    noise = Data[(labels == -1) & ~core_samples_mask]

    xy = []
    flux = []
    centers = np.zeros((atom, 2))
    widths = np.zeros((atom, 2))
    for k in range(atom):
        # Core samples of cluster k (non-core members are left out).
        members = (labels == k) & core_samples_mask
        xy.append(Data[members])
        flux.append(Flux[members])
        centers[k] = np.mean(Data[members], axis=0)
        widths[k] = np.std(Data[members], axis=0)

    print('We find ', atom, ' atoms')
    if len(noise) == 0:
        print('There is no outlier.')
    else:
        print('There are ', len(noise), 'outliers.')
    print('The', atom, 'cluster centers are \n', centers)
    print('The deviation of the ', atom, 'clusters are \n', widths)

    plt.figure()
    plt.scatter(noise.transpose()[0], noise.transpose()[1], s=20, c='k', label="Outlier")
    for i in range(atom):
        plt.scatter(xy[i][:, 0], xy[i][:, 1], s=100, label="Cluster %d" % i)
    for i in range(atom):
        plt.scatter(centers[i][0], centers[i][1], s=80, marker='*', label="Centre %d" % i)
    plt.grid(True)
    plt.legend()
    plt.xlabel('Image plane x axis')
    plt.ylabel('Image plane y axis')
    plt.gca().set_aspect('equal', adjustable='box')
    plt.show()
    return atom, xy, flux, noise, labels, centers, widths
def distance(point1, point2):
    """
    Euclidean distance between cluster centre 1 and cluster centre 2 after
    both have been moved to the centre of the cell that contains them
    (each coordinate becomes floor(coordinate) + 0.5).

    NOTE: point1 and point2 are modified in place, so after the call they
    hold the cell-centre coordinates. They must support item assignment.

    Examples (both points fall in the same cell, so the distance is zero):
    distance([0.1, 0.1], [0.2, 0.1]) returns 0.0
    distance([0.2, 0.7], [3.9, 4.1]) returns 5.0
    """
    # Snap each centre to the middle of its cell, writing back into the input.
    for centre in (point1, point2):
        centre[0] = np.floor(centre[0]) + 0.5
        centre[1] = np.floor(centre[1]) + 0.5
    offset_x = point1[0] - point2[0]
    offset_y = point1[1] - point2[1]
    # The modulus of the complex number x + iy is the Euclidean length.
    return np.abs(offset_x + 1j * offset_y)
def angle_between(p1, p2):
    """
    Angle in degrees between cluster centre 1 and cluster centre 2, i.e. the
    inclination of the line through p1 and p2 with respect to the x axis.

    p1, p2: (x, y) coordinates of the two centres.

    The result lies in [-90, 90]. When both centres share the same x
    coordinate the line is vertical and the result is +90 (p1 above p2) or
    -90 (p1 below p2); for coincident centres the result is 0.
    """
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    if dx == 0:
        # Dividing by dx would raise ZeroDivisionError for Python floats and
        # produce inf/nan with a RuntimeWarning for NumPy floats.
        return np.sign(dy) * 90.0
    ang = np.arctan(dy / dx)
    return ang / np.pi * 180
if __name__ == "__main__":
    # Interactive driver: read an ASCII result table, cluster its (x, y)
    # positions with find_center and derive the separation and angle of the
    # two cluster centres when exactly two clusters are found.

    # 1. All the input configuration
    cluster_num = []
    d = []
    ang = []
    filename = str(input('Enter the result file name:\n'))
    min_samples = int(input('Enter min_samples (you can try to start with 10):\n'))
    eps = float(input('Enter eps (you can try to start with 1):\n'))

    # 2. Clustering process
    # 2.1 Read in data
    print('Processing ' + filename + '...')
    Data_c = Table.read(filename, format="ascii")
    if len(Data_c) == 0:
        # Nothing to cluster; the code below indexes the first row.
        raise SystemExit('The result file ' + filename + ' contains no rows.')
    Data_vxy = np.zeros(shape=(len(Data_c), 2))
    Data_vxy[:, 0] = Data_c['x'].data
    Data_vxy[:, 1] = Data_c['y'].data
    Flux = Data_c['F'].data

    # 2.2 Determine atom_max: the length of the longest run of consecutive
    # rows that share the same value in column 'k'.
    # NOTE(review): presumably each 'k' value identifies one MCMC sample and
    # its rows are the atoms of that sample - confirm against the file format.
    run_lengths = []
    run = 1
    k_previous = Data_c['k'][0]
    for k_current in Data_c['k'][1:]:
        if k_current == k_previous:
            run += 1
        else:
            run_lengths.append(run)
            run = 1
            k_previous = k_current
    # Record the final run as well; without it the last group is ignored and
    # a table with a single 'k' value leaves the list empty.
    run_lengths.append(run)
    atom_max = int(np.max(run_lengths))
    print('atom_max', atom_max)

    ## 2.2 Clustering process ##
    atom, xy, flux, noise, labels, centers, widths = find_center(Data_vxy, Flux, atom_max, min_samples, eps)
    if atom > atom_max:
        print('You need to increase the variable min_samples or/and increase the variable eps')
        print(atom, 'is even bigger than the possible biggest atom number', atom_max)
        again = str(input('Do you want to change variables and try again? (y/n):'))
        print(again)
        # Any answer other than 'y'/'Y' ends the retry loop.
        while again == 'Y' or again == 'y':
            min_samples = int(input('Enter min_samples:\n'))
            eps = float(input('Enter eps:\n'))
            atom, xy, flux, noise, labels, centers, widths = find_center(Data_vxy, Flux, atom_max, min_samples, eps)
            again = str(input('Do you want to change variables and try again? (y/n):'))
    print('You chose eps = ', eps, ' and min_samples = ', min_samples, '\n')

    if atom == 2:
        # distance() moves both centres to their cell centres in place, so
        # angle_between() below works on the snapped coordinates.
        d += [distance(centers[0], centers[1])]
        ang += [angle_between(centers[0], centers[1])]
    elif atom == 1:
        d += [0]
        ang += [0]
    else:
        # -100 flags "separation/angle not defined" for 0 or more than 2 clusters.
        d += [-100]
        ang += [-100]
    cluster_num += [len(centers)]