-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRnnDbscan.m
394 lines (344 loc) · 18 KB
/
RnnDbscan.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
classdef RnnDbscan < handle
    %RnnDbscan RNN DBSCAN clustering
    % RNN DBSCAN is a density-based clustering algorithm that uses reverse nearest
    % neighbor counts as an estimate of observation density. It is based upon
    % traversals of the directed k-nearest neighbor graph, and can handle clusters
    % of different densities, unlike DBSCAN.
    %
    % RnnDbscan constructor:
    %   rnnDbscan = RnnDbscan(X, k, indexNeighbors) creates an RNN DBSCAN
    %   object for input data X, with k-nearest neighbors used in
    %   constructing the knn graph. indexNeighbors is the number of
    %   neighbors used to build the k-nearest neighbors index.
    %   indexNeighbors must be >= k + 1
    %
    %   rnnDbscan = RnnDbscan(X, k, indexNeighbors, 'Method', method)
    %   creates an RNN DBSCAN object for input data X, with k nearest neighbors
    %   used in constructing the knn graph, using the specified knn method.
    %
    %   Optional parameters:
    %     'Method' - The method to use when building the knn index
    %       "knnsearch" - Use KNNSEARCH from Mathwork's Statistics and
    %                     Machine Learning Toolbox. This is the default
    %                     method.
    %       "nndescent" - Use the nearest neighbor descent algorithm to
    %                     build an approximate knn index. For large
    %                     matrices, this is much faster than KNNSEARCH, at
    %                     the cost of slightly less accuracy. This method
    %                     requires the pynndescent python package to be
    %                     installed and accessible through MATLAB's Python
    %                     external language interface
    %
    % Rows of X correspond to observations, and columns correspond to variables.
    %
    % RnnDbscan properties:
    %   K                - number of nearest neighbors
    %   Data             - input data X
    %   KnnIndex         - k-nearest neighbor index
    %   KnnGraph         - knn graph used for clustering
    %   Clusters         - Clusters{i} contains indices for points in cluster i
    %   Outliers         - array of outliers that do not belong to a cluster
    %   CorePoints       - list of core points
    %   ClusterDensities - density of each cluster
    %   Labels           - cluster labels; -1 corresponds to outliers
    %
    % RnnDbscan methods:
    %   cluster - cluster the input data
    %
    % This algorithm was presented in https://dx.doi.org/10.1109/TKDE.2017.2787640,
    % by Bryant and Cios but no implementation was given. This implementation is
    % based on the graph-based interpretation presented in the original paper.
    %
    % See also knnindex, knngraph

    % SPDX-License-Identifier: BSD-3-Clause
    % Copyright (c) 2020 Trevor Vannoy

    properties (SetAccess = public, AbortSet)
        % Number of nearest neighbors used to construct the knn graph
        K (1,1) double {mustBePositive, mustBeInteger} = 1
    end

    properties (SetAccess = private)
        % Input data; rows are observations, and columns are variables
        Data (:,:) double
        % k-nearest neighbor index
        KnnIndex (:,:) int32
        % Directed k-nearest neighbor graph
        KnnGraph (1,1) digraph
        % Cell array of clusters; Clusters{i} contains indices for all points
        % that are in cluster i
        Clusters (:,1) cell
        % Outliers that do not belong to any cluster
        Outliers (1,:) int32
        % List of core points, which are points with >= k reverse nearest
        % neighbors.
        CorePoints (1,:) int32
        % Cluster densities, defined as the max distance between
        % connected core points
        ClusterDensities (1,:) double
        % Cluster labels. Outliers have a label value of -1; all other
        % clusters have labels starting at 1
        Labels (:,1) int32 = []
    end

    methods (Access = public)
        function obj = RnnDbscan(X, k, index, options)
            %RnnDbscan Construct an RNN DBSCAN object
            %   rnnDbscan = RnnDbscan(X, k, nIndexNeighbors) creates an RNN DBSCAN
            %   object for input data X, with k-nearest neighbors used in
            %   constructing the knn graph. nIndexNeighbors is the number of
            %   neighbors used to build the k-nearest neighbors index.
            %   nIndexNeighbors must be >= k + 1
            %
            %   rnnDbscan = RnnDbscan(X, k, knnIndex) creates an RNN DBSCAN object
            %   using a precomputed knn index, knnIndex. knnIndex must have the same
            %   number of rows as X.
            %
            %   rnnDbscan = RnnDbscan(X, k, nIndexNeighbors, 'Method', method)
            %   creates an RNN DBSCAN object for input data X, with k nearest
            %   neighbors used in constructing the knn graph, using the specified
            %   knn method.
            %
            %   Optional parameters:
            %     'Method' - The method to use when building the knn index
            %       "knnsearch" - Use KNNSEARCH from Mathwork's Statistics and
            %                     Machine Learning Toolbox. This is the default
            %                     method.
            %       "nndescent" - Use the nearest neighbor descent algorithm to
            %                     build an approximate knn index. For large
            %                     matrices, this is much faster than KNNSEARCH, at
            %                     the cost of slightly less accuracy. This method
            %                     requires the pynndescent python package to be
            %                     installed and accessible through MATLAB's Python
            %                     external language interface

            % TODO: I could add another constructor that doesn't take
            % indexNeighbors and uses k instead
            arguments
                X (:,:) double
                k (1,1) {mustBePositive, mustBeInteger}
                index
                options.Method (1,1) string {mustBeMember(options.Method, ["knnsearch", "nndescent"])} = "knnsearch"
            end

            obj.Data = X;

            if numel(index) == 1
                % scalar third argument: the number of neighbors to use when
                % building the knn index from scratch
                indexNeighbors = index;
                if indexNeighbors < k + 1
                    error("indexNeighbors must be >= k + 1")
                end
                obj.KnnIndex = knnindex(obj.Data, indexNeighbors, ...
                    'Method', options.Method);
            elseif size(index, 1) == size(X, 1)
                % matrix third argument: a precomputed knn index
                if size(index, 2) < k + 1
                    error("knnIndex must have # columns >= k + 1")
                end
                obj.KnnIndex = index;
            else
                error("argument 3 must be either a scalar number of index " + ...
                    "neighbors or a precomputed knn index with the same " + ...
                    "number of rows as X")
            end

            obj.Labels = zeros(size(X, 1), 1, 'int32');
            obj.K = k;

            % the setter for k won't execute for k = 1 because 1 is the
            % default value for k, so AbortSet will stop set.K from running
            if k == 1
                obj.KnnGraph = knngraph(obj.KnnIndex, obj.K, ...
                    'Precomputed', true);
            end
        end

        function cluster(obj)
            %CLUSTER Cluster the input data used to create the RnnDbscan object
            %   The resulting cluster assignments and labels are saved to the
            %   RnnDbscan object
            obj.findInitialClusters();
            obj.computeClusterDensities();
            obj.addDirectlyDensityReachablePoints();
            obj.createExtendedClusters();
        end
    end

    methods
        function obj = set.K(obj, k)
            % this method only gets called if k != obj.K, due to AbortSet
            if k >= size(obj.KnnIndex, 2)
                error("Error setting property 'K' of class 'RnnDbscan':\n" + ...
                    "'K' must be less than knn index size %d", ...
                    size(obj.KnnIndex, 2));
            end

            obj.K = k;

            % XXX: Mathwork's says I shouldn't set other properties in a setter
            % method, which I do conceptually agree with, but I want to be able
            % to "reset" the object to cluster using a different k without
            % having to make a new object and recompute the knn index
            obj.KnnGraph = knngraph(obj.KnnIndex, obj.K, ...
                'Precomputed', true);

            % reset all clustering state; Labels is built as a column vector
            % to match the (:,1) property declaration and the constructor
            obj.Labels = zeros(size(obj.Data, 1), 1, 'int32');
            obj.Outliers = int32([]);
            obj.CorePoints = int32([]);
            obj.Clusters = {};
            obj.ClusterDensities = [];
        end
    end

    methods (Access = private)
        function findInitialClusters(obj)
            % Find initial clusters from the core point subgraph
            %
            % In the graph-based interpretation of the RNN DBSCAN algorithm,
            % an initial clustering of core points is defined by the weakly
            % connected components in the core point subgraph, which is a subgraph
            % of the knn graph consisting of only core points. Core points are
            % defined as points with a reverse nearest neighbor count >= k, which
            % corresponds to a node having an indegree >= k in the knn graph
            obj.CorePoints = int32(find(indegree(obj.KnnGraph) >= obj.K));

            % creating a subgraph renumbers the node IDs. This is problematic
            % because I am expecting the node IDs to correspond to the data
            % points' indices in the data matrix. To remedy this, the subgraph's
            % node IDs have to be converted back to the original node IDs.
            corePointSubgraph = subgraph(obj.KnnGraph, obj.CorePoints);

            % get the initial clusters from the connected components.
            % 'OutputForm' = 'cell' returns a cell array where the i-th cell
            % contains all of the node IDs belonging to component i; this
            % results in a cell array where each cell is a cluster.
            clusters = conncomp(corePointSubgraph, 'Type', 'weak', ...
                'OutputForm', 'cell');

            % convert the subgraph's node IDs back to the original node IDs that
            % correspond to the data points' indices.
            obj.Clusters = cellfun(@(c) obj.CorePoints(c), clusters, ...
                'UniformOutput', false);

            for i = 1:length(obj.Clusters)
                obj.Labels(obj.Clusters{i}) = int32(i);
            end
        end

        function addDirectlyDensityReachablePoints(obj)
            % Add directly density-reachable border points to clusters
            %
            % Directly density-reachable points are border points that are in the
            % k-neighborhood of a core point.
            %
            % This function expands the clusters to include some border points,
            % but the resulting clustering is still considered to be the initial
            % set of clusters in the RNN DBSCAN paper, rather than the extended
            % clusters. This would be part of the "ExpandCluster" algorithm in the
            % paper, but I am leveraging the graph interpretation rather than the
            % algorithm pseudocode
            for i = 1:length(obj.Clusters)
                borderPoints = [];
                % find k-nearest neighbors of every point in cluster i
                for j = 1:length(obj.Clusters{i})
                    % k-nearest neighbors of a point j are the successors of
                    % node j in the knn graph, i.e. there is an edge originating
                    % from node j
                    neighbors = int32(successors(obj.KnnGraph, obj.Clusters{i}(j)));

                    % only add points to a cluster if they do not already belong
                    % to a cluster
                    unclusteredPoints = neighbors(obj.Labels(neighbors) == 0);

                    % points in a cluster are likely to share neighbors, but we
                    % don't want these neighbors to be added to the cluster
                    % more than once, so we take the union of all
                    % previously-found neighbors and the neighbors of point j
                    borderPoints = union(borderPoints, unclusteredPoints);
                end

                % force borderPoints to be a row vector, because the
                % clusters are also row vectors
                borderPoints = borderPoints(:)';

                % add neighbors of core points in cluster i to cluster i
                obj.Clusters{i} = horzcat(obj.Clusters{i}, borderPoints);
                obj.Labels(borderPoints) = int32(i);
            end
        end

        function createExtendedClusters(obj)
            % Cluster remaining points that satisfy the extended cluster definition
            %
            % After creating the initial clusters, which comprise core points and
            % their directly density-reachable border points, remaining points are
            % assigned to a cluster C if they have a core point in their
            % k-neighborhood that is < den(C) away. Such points are added to the
            % cluster of their closest core point. Points that do not satisfy the
            % extended cluster definition are labeled as outliers.
            clusteredPoints = [obj.Clusters{:}];
            pointIndices = int32(1:size(obj.Data, 1));
            unclusteredPoints = setdiff(pointIndices, clusteredPoints);

            for unclusteredPoint = unclusteredPoints
                pointIsNoise = true;

                % find k-nearest neighbors of the unclustered point
                neighbors = int32(successors(obj.KnnGraph, unclusteredPoint));

                % determine which neighbors are core points; we do not want to
                % cluster the unclustered point if it is not connected to any
                % core points
                corePointNeighbors = intersect(neighbors, obj.CorePoints);

                if ~isempty(corePointNeighbors)
                    corePointNeighborsData = obj.Data(corePointNeighbors, :);
                    unclusteredPointData = obj.Data(unclusteredPoint, :);

                    % sort core point neighbors by distance from the
                    % unclustered point
                    distances = pairwiseDist(unclusteredPointData, ...
                        corePointNeighborsData);
                    [distances, sortIdx] = sort(distances);
                    corePointNeighbors = corePointNeighbors(sortIdx);

                    for i = 1:length(corePointNeighbors)
                        % find which cluster the core point belongs to
                        clusterIdx = obj.Labels(corePointNeighbors(i));

                        if distances(i) <= obj.ClusterDensities(clusterIdx)
                            % add unclustered point to cluster
                            obj.Clusters{clusterIdx} = horzcat(obj.Clusters{clusterIdx}, unclusteredPoint);
                            obj.Labels(unclusteredPoint) = int32(clusterIdx);
                            pointIsNoise = false;
                            % point can't be assigned to more than one cluster,
                            % so stop once it is assigned
                            break;
                        end
                    end
                end

                if pointIsNoise
                    % the unclustered point does not have any core point
                    % neighbors close enough, so it is considered an outlier
                    obj.Outliers = horzcat(obj.Outliers, unclusteredPoint);
                    obj.Labels(unclusteredPoint) = int32(-1);
                end
            end
        end

        function computeClusterDensities(obj)
            % Compute the densities of all the clusters.
            %
            % den(C) = max(dist(x,y)) s.t. x,y are both core points in cluster C,
            % and y is directly density-reachable from x (i.e. y is a k-nearest
            % neighbor of x); equivalently, den(C) is the maximum distance between
            % points that have an edge between them in the core points subgraph.
            %
            % Since the cluster density is defined as the maximum distance between
            % nearest neighbor core points, the cluster density does not change as
            % non-core points are added to the cluster. Thus we can calculate the
            % cluster densities after finding the initial clusters of core points,
            % which are defined by the weakly connected components of the core
            % points subgraph.
            obj.ClusterDensities = zeros(length(obj.Clusters), 1);

            for i = 1:length(obj.Clusters)
                % find all edges between core points in the cluster
                clusterCorePoints = intersect(obj.Clusters{i}, obj.CorePoints);
                G = subgraph(obj.KnnGraph, clusterCorePoints);
                [sourceNodesTmp, targetNodesTmp] = findedge(G);

                % If a cluster only has one core point, we can't compute its
                % density; consequently, we must check if there is an edge in
                % the cluster before trying to compute its density
                if ~isempty(sourceNodesTmp)
                    % findedge returns a source and target node for each edge;
                    % to reduce the number of iterations in the following for
                    % loop, we transform the source and target nodes such that
                    % there is one cell of target nodes for each source node
                    sourceNodes = unique(sourceNodesTmp);
                    targetNodes = mat2cell(targetNodesTmp, ...
                        histcounts(sourceNodesTmp, [sourceNodes; inf]));

                    for nodeId = 1:length(sourceNodes)
                        % convert node IDs back to their original indices.
                        % NOTE: the subgraph node ID is sourceNodes(nodeId),
                        % not the loop counter: a core point whose out-edges
                        % all leave the subgraph does not appear in
                        % sourceNodes, so the two only coincide when every
                        % node has an out-edge within the subgraph.
                        origNodeId = clusterCorePoints(sourceNodes(nodeId));
                        origNeighborIds = clusterCorePoints(targetNodes{nodeId});
                        distances = pairwiseDist(obj.Data(origNodeId, :), ...
                            obj.Data(origNeighborIds, :));
                        maxDist = max(distances);
                        if maxDist > obj.ClusterDensities(i)
                            obj.ClusterDensities(i) = maxDist;
                        end
                    end
                else
                    obj.ClusterDensities(i) = nan;
                end
            end
        end
    end
end
function d = pairwiseDist(X, Y)
    %PAIRWISEDIST Pairwise Euclidean distances between rows of X and rows of Y
    %   d(i,j) is the Euclidean distance between X(i,:) and Y(j,:), computed
    %   via the expansion |x - y|^2 = |x|^2 + |y|^2 - 2*x*y'.
    %
    %   Floating-point cancellation can make the expanded squared distance
    %   slightly negative for identical or near-identical points, in which
    %   case sqrt would return a complex number; clamp at zero to guarantee
    %   real, non-negative distances. Implicit expansion (R2016b+) replaces
    %   the original bsxfun call; the file already requires R2019b+ for its
    %   arguments blocks.
    squaredDist = sum(X.^2, 2) + sum(Y.^2, 2)' - 2*(X * Y');
    d = sqrt(max(squaredDist, 0));
end