#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
# Rebind ``range`` to ``xrange`` when ``__builtins__`` exposes that attribute
# (Python 2); otherwise the existing ``range`` is kept as the fallback.
# NOTE(review): ``__builtins__`` can be a plain dict rather than a module in
# imported modules (CPython detail); ``getattr`` would then always fall back
# to ``range`` -- confirm whether that matters for Python 2 users.
range = getattr(__builtins__, 'xrange', range)
# end of py2 compatibility boilerplate
from scipy.cluster.hierarchy import linkage, inconsistent, fcluster
from scipy.cluster.hierarchy import cophenet
from matrixprofile import core
from matrixprofile.algorithms.pairwise_dist import pairwise_dist
def hierarchical_clusters(X, window_size, t, threshold=0.05, method='single',
                          depth=2, criterion='distance', n_jobs=1,
                          monocrit=None):
    """
    Cluster M time series into hierarchical clusters using an agglomerative
    approach.

    This function is more or less a convenience wrapper around SciPy's
    scipy.cluster.hierarchy functions, but uses the MPDist algorithm to
    compute distances between each pair of time series.

    Note
    ----
    Memory usage could potentially be high depending on the length of your
    time series and how many distances are computed!

    Parameters
    ----------
    X : array_like
        An M x N matrix where M is the time series and N is the observations
        at a given time.
    window_size : int
        The window size used to compute the MPDist.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit', this is the
        threshold to apply when forming flat clusters.
        For 'maxclust' criteria, this would be max number of clusters
        requested. Any real number is accepted, including NumPy scalars.
    threshold : float, Default 0.05
        The percentile in which the MPDist is taken from. By default it is
        set to 0.05 based on empirical research results from the paper.
        Generally, you should not change this unless you know what you are
        doing! This value must be a float greater than 0 and less than 1.
    method : str, Default single
        The linkage algorithm to use (case-insensitive).
        Options: {single, complete, average, weighted}
    depth : int, Default 2
        A positive integer specifying the number of levels below a
        non-singleton cluster to allow.
    criterion : str, Default distance
        The criterion to use in forming flat clusters (case-insensitive).
        Options: {inconsistent, distance, maxclust, monocrit}

        ``inconsistent`` :
            If a cluster node and all its descendants have an inconsistent
            value less than or equal to `t`, then all its leaf descendants
            belong to the same flat cluster. When no non-singleton cluster
            meets this criterion, every node is assigned to its own cluster.

        ``distance`` :
            Forms flat clusters so that the original observations in each
            flat cluster have no greater a cophenetic distance than `t`.
            (Default)

        ``maxclust`` :
            Finds a minimum threshold ``r`` so that the cophenetic distance
            between any two original observations in the same flat cluster
            is no more than ``r`` and no more than `t` flat clusters are
            formed.

        ``monocrit`` :
            Forms a flat cluster from a cluster node c with index i when
            ``monocrit[j] <= t``. Requires the `monocrit` argument.

            For example, to threshold on the maximum mean distance as
            computed in the inconsistency matrix R with a threshold of 0.8,
            SciPy users would do::

                MR = maxRstat(Z, R, 3)
                fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

    n_jobs : int, Default 1
        The number of cpu cores used to compute the MPDist.
    monocrit : array_like, optional
        Statistic vector forwarded unchanged to
        ``scipy.cluster.hierarchy.fcluster``; required when `criterion` is
        'monocrit' and ignored by the other criteria offered here. The
        linkage matrix is built inside this function, so a suitable vector
        presumably has to be derived from the ``linkage_matrix`` /
        ``inconsistency_statistics`` of an earlier call on the same data.

    Returns
    -------
    clusters : dict
        Clustering statistics, distances and labels, keyed as follows:

        - ``pairwise_distances``: MPDist between pairs of time series as
          np.ndarray
        - ``linkage_matrix``: clustering linkage matrix as np.ndarray
        - ``inconsistency_statistics``: inconsistency stats as np.ndarray
        - ``assignments``: cluster label associated with input X location
          as np.ndarray
        - ``cophenet``: float, the cophenet statistic
        - ``cophenet_distances``: cophenet distances between pairs of time
          series as np.ndarray
        - ``class``: the string ``'hclusters'``

    Raises
    ------
    ValueError
        If any argument fails the validation described above, or if
        `criterion` is 'monocrit' and no `monocrit` vector was supplied.
    """
    # Local import so this block does not depend on edits elsewhere in the
    # file; ``numbers`` lets NumPy scalar types pass the checks below.
    import numbers

    # valid SciPy clustering options to work with custom distance metric
    valid_methods = ('single', 'complete', 'average', 'weighted')
    valid_criterions = ('inconsistent', 'distance', 'monocrit', 'maxclust')

    method = method.lower()
    criterion = criterion.lower()

    # error handling -- cheap option checks run before X is inspected and
    # before any distance is computed
    if not isinstance(t, numbers.Real):
        raise ValueError('t must be a scalar (int or float)')

    # The chained comparison is also False for NaN, so NaN is rejected.
    if not (isinstance(threshold, float) and 0 < threshold < 1):
        raise ValueError('threshold must be a float greater than 0 and less'
                         ' than 1')

    if not isinstance(depth, numbers.Integral) or depth < 1:
        raise ValueError('depth must be an integer greater than 0')

    if method not in valid_methods:
        # sorted() keeps the message identical from run to run
        opts_str = ', '.join(sorted(valid_methods))
        raise ValueError('method may only be one of: ' + opts_str)

    if criterion not in valid_criterions:
        opts_str = ', '.join(sorted(valid_criterions))
        raise ValueError('criterion may only be one of: ' + opts_str)

    # fcluster cannot apply the 'monocrit' criterion without the statistic
    # vector, so fail early with an explicit message.
    if criterion == 'monocrit' and monocrit is None:
        raise ValueError("monocrit must be provided when criterion is"
                         " 'monocrit'")

    if not core.is_array_like(X):
        raise ValueError('X must be array like!')

    # MPDist between every pair of series, then the SciPy pipeline: linkage
    # -> inconsistency statistics -> cophenetic correlation -> flat clusters.
    Y = pairwise_dist(X, window_size, threshold=threshold, n_jobs=n_jobs)
    Z = linkage(Y, method=method)
    R = inconsistent(Z, d=depth)
    c, coph_dists = cophenet(Z, Y)
    T = fcluster(Z, t=t, criterion=criterion, depth=depth, R=R,
                 monocrit=monocrit)

    return {
        'pairwise_distances': Y,
        'linkage_matrix': Z,
        'inconsistency_statistics': R,
        'assignments': T,
        'cophenet': c,
        'cophenet_distances': coph_dists,
        'class': 'hclusters'
    }