diff options
Diffstat (limited to 'src/main/java/org/apache/commons/math3/ml/clustering/DBSCANClusterer.java')
-rw-r--r-- | src/main/java/org/apache/commons/math3/ml/clustering/DBSCANClusterer.java | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/math3/ml/clustering/DBSCANClusterer.java b/src/main/java/org/apache/commons/math3/ml/clustering/DBSCANClusterer.java new file mode 100644 index 0000000..ce3d5cd --- /dev/null +++ b/src/main/java/org/apache/commons/math3/ml/clustering/DBSCANClusterer.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.math3.ml.clustering; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.math3.exception.NotPositiveException; +import org.apache.commons.math3.exception.NullArgumentException; +import org.apache.commons.math3.ml.distance.DistanceMeasure; +import org.apache.commons.math3.ml.distance.EuclideanDistance; +import org.apache.commons.math3.util.MathUtils; + +/** + * DBSCAN (density-based spatial clustering of applications with noise) algorithm. + * <p> + * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. + * a point p is density connected to another point q, if there exists a chain of + * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, + * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. + * A point q is directly density-reachable from point p if it is in the ε-neighborhood + * of this point. + * <p> + * Any point that is not density-reachable from a formed cluster is treated as noise, and + * will thus not be present in the result. + * <p> + * The algorithm requires two parameters: + * <ul> + * <li>eps: the distance that defines the ε-neighborhood of a point + * <li>minPoints: the minimum number of density-connected points required to form a cluster + * </ul> + * + * @param <T> type of the points to cluster + * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> + * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> + * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> + * @since 3.2 + */ +public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> { + + /** Maximum radius of the neighborhood to be considered. */ + private final double eps; + + /** Minimum number of points needed for a cluster. */ + private final int minPts; + + /** Status of a point during the clustering process. */ + private enum PointStatus { + /** The point has is considered to be noise. */ + NOISE, + /** The point is already part of a cluster. */ + PART_OF_CLUSTER + } + + /** + * Creates a new instance of a DBSCANClusterer. + * <p> + * The euclidean distance will be used as default distance measure. + * + * @param eps maximum radius of the neighborhood to be considered + * @param minPts minimum number of points needed for a cluster + * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} + */ + public DBSCANClusterer(final double eps, final int minPts) + throws NotPositiveException { + this(eps, minPts, new EuclideanDistance()); + } + + /** + * Creates a new instance of a DBSCANClusterer. + * + * @param eps maximum radius of the neighborhood to be considered + * @param minPts minimum number of points needed for a cluster + * @param measure the distance measure to use + * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} + */ + public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure) + throws NotPositiveException { + super(measure); + + if (eps < 0.0d) { + throw new NotPositiveException(eps); + } + if (minPts < 0) { + throw new NotPositiveException(minPts); + } + this.eps = eps; + this.minPts = minPts; + } + + /** + * Returns the maximum radius of the neighborhood to be considered. + * @return maximum radius of the neighborhood + */ + public double getEps() { + return eps; + } + + /** + * Returns the minimum number of points needed for a cluster. + * @return minimum number of points needed for a cluster + */ + public int getMinPts() { + return minPts; + } + + /** + * Performs DBSCAN cluster analysis. + * + * @param points the points to cluster + * @return the list of clusters + * @throws NullArgumentException if the data points are null + */ + @Override + public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException { + + // sanity checks + MathUtils.checkNotNull(points); + + final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>(); + final Map<Clusterable, PointStatus> visited = new HashMap<Clusterable, PointStatus>(); + + for (final T point : points) { + if (visited.get(point) != null) { + continue; + } + final List<T> neighbors = getNeighbors(point, points); + if (neighbors.size() >= minPts) { + // DBSCAN does not care about center points + final Cluster<T> cluster = new Cluster<T>(); + clusters.add(expandCluster(cluster, point, neighbors, points, visited)); + } else { + visited.put(point, PointStatus.NOISE); + } + } + + return clusters; + } + + /** + * Expands the cluster to include density-reachable items. + * + * @param cluster Cluster to expand + * @param point Point to add to cluster + * @param neighbors List of neighbors + * @param points the data set + * @param visited the set of already visited points + * @return the expanded cluster + */ + private Cluster<T> expandCluster(final Cluster<T> cluster, + final T point, + final List<T> neighbors, + final Collection<T> points, + final Map<Clusterable, PointStatus> visited) { + cluster.addPoint(point); + visited.put(point, PointStatus.PART_OF_CLUSTER); + + List<T> seeds = new ArrayList<T>(neighbors); + int index = 0; + while (index < seeds.size()) { + final T current = seeds.get(index); + PointStatus pStatus = visited.get(current); + // only check non-visited points + if (pStatus == null) { + final List<T> currentNeighbors = getNeighbors(current, points); + if (currentNeighbors.size() >= minPts) { + seeds = merge(seeds, currentNeighbors); + } + } + + if (pStatus != PointStatus.PART_OF_CLUSTER) { + visited.put(current, PointStatus.PART_OF_CLUSTER); + cluster.addPoint(current); + } + + index++; + } + return cluster; + } + + /** + * Returns a list of density-reachable neighbors of a {@code point}. + * + * @param point the point to look for + * @param points possible neighbors + * @return the List of neighbors + */ + private List<T> getNeighbors(final T point, final Collection<T> points) { + final List<T> neighbors = new ArrayList<T>(); + for (final T neighbor : points) { + if (point != neighbor && distance(neighbor, point) <= eps) { + neighbors.add(neighbor); + } + } + return neighbors; + } + + /** + * Merges two lists together. + * + * @param one first list + * @param two second list + * @return merged lists + */ + private List<T> merge(final List<T> one, final List<T> two) { + final Set<T> oneSet = new HashSet<T>(one); + for (T item : two) { + if (!oneSet.contains(item)) { + one.add(item); + } + } + return one; + } +} |