Source code for matrixprofile.algorithms.top_k_discords

# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

range = getattr(__builtins__, 'xrange', range)
# end of py2 compatability boilerplate


import numpy as np

from matrixprofile import core


def pmp_top_k_discords(profile, exclusion_zone=None, k=3):
    """
    Computes the top K discords for the given Pan-MatrixProfile. The return
    values is a list of row by col indices.

    Notes
    -----
    This algorithm is written to work with Euclidean distance. If you submit
    a PMP of Pearson metrics, then it is first converted to Euclidean.

    Parameters
    ----------
    profile : dict
        Data structure from a PMP algorithm.
    exclusion_zone : int, Default window / 2
        The zone to exclude around the found discords to reduce trivial
        findings. By default we use the row-wise window / 2.
    k : int
        Maximum number of discords to find.

    Returns
    -------
    dict : profile
        A 2D array of indices. The first column corresponds to the row index
        and the second column corresponds to the column index of the 
        submitted PMP. It is placed back on the original object passed in as
        'discords' key.

    """
    if not core.is_pmp_obj(profile):
        raise ValueError('Expecting PMP data structure!')

    # this function requires euclidean distance
    # convert if the metric is pearson
    metric = profile.get('metric', None)
    pmp = profile.get('pmp', None)
    windows = profile.get('windows', None)
    
    tmp = None
    if metric == 'pearson':
        tmp = core.pearson_to_euclidean(pmp, windows)
    else:
        tmp = np.copy(pmp).astype('d')        
    
    # replace nan and infs with -infinity
    # for whatever reason numpy argmax finds infinity as max so
    # this is a way to get around it by converting to -infinity
    tmp[core.nan_inf_indices(tmp)] = -np.inf
            
    # iterate finding the max value k times or until negative
    # infinity is obtained
    found = []
    
    for _ in range(k):
        max_idx = np.unravel_index(np.argmax(tmp), tmp.shape)
        window = windows[max_idx[0]]
        
        if tmp[max_idx] == -np.inf:
            break
        
        found.append(max_idx)
        
        # apply exclusion zone
        # the exclusion zone is based on 1/2 of the window size
        # used to compute that specific matrix profile
        n = tmp[max_idx[0]].shape[0]
        if exclusion_zone is None:
            exclusion_zone = int(np.floor(window / 2))

        ez_start = np.max([0, max_idx[1] - exclusion_zone])
        ez_stop = np.min([n, max_idx[1] + exclusion_zone])
        tmp[max_idx[0]][ez_start:ez_stop] = -np.inf
    
    profile['discords'] = np.array(found)

    return profile


def mp_top_k_discords(profile, exclusion_zone=None, k=3):
    """
    Find the top K number of discords (anomalies) given a matrix profile,
    exclusion zone and the desired number of discords. The exclusion zone
    nullifies entries on the left and right side of the first and subsequent
    discords to remove non-trivial matches. More specifically, a discord found
    at location X will more than likely have additional discords to the left or
    right of it.

    Parameters
    ----------
    profile : dict
        The MatrixProfile data structure.
    exclusion_zone : int, Default mp algorithm ez
        Desired number of values to exclude on both sides of the anomaly.
    k : int
        Desired number of discords to find.

    Returns
    -------
    dict : profile
        The original input profile with an additional "discords" key containing
        the a np.ndarray of discord indices.

    """
    if not core.is_mp_obj(profile):
        raise ValueError('Expecting MP data structure!')

    found = []
    tmp = np.copy(profile.get('mp', None)).astype('d')
    n = len(tmp)

    # TODO: this is based on STOMP standards when this motif finding algorithm
    # originally came out. Should we default this to 4.0 instead? That seems
    # to be the common value now per new research.
    window_size = profile.get('w', None)
    if exclusion_zone is None:
        exclusion_zone = profile.get('ez', None)
    
    # obtain indices in ascending order
    indices = np.argsort(tmp)
    
    # created flipped view for discords
    indices = indices[::-1]

    for idx in indices:
        if not np.isinf(tmp[idx]):
            found.append(idx)

            # apply exclusion zone
            if exclusion_zone > 0:
                exclusion_zone_start = np.max([0, idx - exclusion_zone])
                exclusion_zone_end = np.min([n, idx + exclusion_zone])
                tmp[exclusion_zone_start:exclusion_zone_end] = np.inf

        if len(found) >= k:
            break


    profile['discords'] = np.array(found, dtype='int')

    return profile


def top_k_discords(profile, exclusion_zone=None, k=3):
    """
    Find the top K number of discords (anomalies) given a mp or pmp,
    exclusion zone and the desired number of discords. The exclusion zone
    nullifies entries on the left and right side of the first and subsequent
    discords to remove non-trivial matches. More specifically, a discord found
    at location X will more than likely have additional discords to the left or
    right of it.

    Parameters
    ----------
    profile : dict
        A MatrixProfile or Pan-MatrixProfile structure.
    exclusion_zone : int, Default mp algorithm ez
        Desired number of values to exclude on both sides of the anomaly.
    k : int
        Desired number of discords to find.

    Returns
    -------
    dict : profile
        The original profile object with an additional 'discords' key. Take
        note that a MatrixProfile discord contains a single value while the
        Pan-MatrixProfile contains a row and column index.

    """
    if not core.is_mp_or_pmp_obj(profile):
        raise ValueError('Expecting MP or PMP data structure!')

    cls = profile.get('class', None)
    func = None

    if cls == 'MatrixProfile':
        func = mp_top_k_discords
    elif cls == 'PMP':
        func = pmp_top_k_discords
    else:
        raise ValueError('Unsupported data structure!')

    return func(
        profile,
        exclusion_zone=exclusion_zone,
        k=k,
    )