# Source code for matrixprofile.algorithms.scrimp

# -*- coding: utf-8 -*-

"""
This module consists of all code to implement the SCRIMP++ algorithm. SCRIMP++
is an anytime algorithm that computes the matrix profile for a given time
series (ts) over a given window size (m).

This algorithm was originally created at the University of California
Riverside. For further academic understanding, please review this paper:

Matrix Proﬁle XI: SCRIMP++: Time Series Motif Discovery at Interactive
Speed. Yan Zhu, Chin-Chia Michael Yeh, Zachary Zimmerman, Kaveh Kamgar
Eamonn Keogh, ICDM 2018.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

range = getattr(__builtins__, 'xrange', range)
# end of py2 compatability boilerplate

import math

import warnings

import numpy as np

from matrixprofile import core

def calc_distance_profile(X, y, n, m, meanx, sigmax):
"""
Computes the distance profile.

Parameters
----------
X : array_like
The FFT transformed time series.
y : array_like
The query.
n : int
The length of the time series.
m : int
The window size.
meanx : array_like
The moving mean of the time series.
sigmax : array_like
The moving standard deviation of the time series.

Returns
-------
array_like :
The distance profile.

"""
# reverse the query
y = np.flip(y, 0)

# make y same size as ts with zero fill
y = np.concatenate([y, np.zeros(n-m)])

# main trick of getting dot product in O(n log n) time
Y = np.fft.fft(y)
Z = X * Y
z = np.fft.ifft(Z)

# compute y stats in O(n)
sumy = np.sum(y)
sumy2 = np.sum(np.power(y, 2))
meany = sumy / m
sigmay2 = sumy2 / m - meany ** 2
sigmay = np.sqrt(sigmay2)

dist = (z[m - 1:n] - m * meanx * meany) / (sigmax * sigmay)
dist = m - dist
dist = np.real(2 * dist)

return np.sqrt(np.absolute(dist))

def calc_dotproduct_idx(dotproduct, m, mp, idx, sigmax, idx_nn, meanx):
dotproduct[idx] = (m - mp[idx] ** 2 / 2) * \
sigmax[idx] * sigmax[idx_nn] + m * meanx[idx] * meanx[idx_nn]

return dotproduct

def calc_dotproduct_end_idx(ts, dp, idx, m, endidx, idx_nn, idx_diff):
tmp_a = ts[idx+m:endidx+m]
tmp_b = ts[idx_nn+m:endidx+m+idx_diff]
tmp_c = ts[idx:endidx]
tmp_d = ts[idx_nn:endidx+idx_diff]
tmp_f = tmp_a * tmp_b - tmp_c * tmp_d

dp[idx+1:endidx+1] = dp[idx] + np.cumsum(tmp_f)

return dp

def calc_refine_distance_end_idx(refine_distance, dp, idx, endidx, meanx,
sigmax, idx_nn, idx_diff, m):
tmp_a = dp[idx+1:endidx+1]
tmp_b = meanx[idx+1:endidx+1]
tmp_c = meanx[idx_nn+1:endidx+idx_diff+1]
tmp_d = sigmax[idx+1:endidx+1]
tmp_e = sigmax[idx_nn+1:endidx+idx_diff+1]
tmp_f = tmp_b * tmp_c
tmp_g = tmp_d * tmp_e
tmp_h = (m-(tmp_a - m * tmp_f) / (tmp_g))
refine_distance[idx+1:endidx+1] = np.sqrt(np.abs(2 * tmp_h))

return refine_distance

def calc_dotproduct_begin_idx(ts, dp, beginidx, idx, idx_diff, m,
idx_nn):
indices = list(range(idx - 1, beginidx - 1, -1))

if not indices:
return dp

tmp_a = ts[indices]
indices_b = list(range(idx_nn - 1, beginidx + idx_diff - 1, -1))
tmp_b = ts[indices_b]
indices_c = list(range(idx + m - 1, beginidx + m - 1, -1))
tmp_c = ts[indices_c]
indices_d = list(range(idx_nn - 1 + m, beginidx + idx_diff + m - 1, -1))
tmp_d = ts[indices_d]

dp[indices] = dp[idx] + \
np.cumsum((tmp_a * tmp_b) - (tmp_c * tmp_d))

return dp

def calc_refine_distance_begin_idx(refine_distance, dp, beginidx, idx,
idx_diff, idx_nn, sigmax, meanx, m):
if not (beginidx < idx):
return refine_distance

tmp_a = dp[beginidx:idx]
tmp_b = meanx[beginidx:idx]
tmp_c = meanx[beginidx+idx_diff:idx_nn]
tmp_d = sigmax[beginidx:idx]
tmp_e = sigmax[beginidx+idx_diff:idx_nn]
tmp_f = tmp_b * tmp_c
tmp_g = tmp_d * tmp_e
tmp_h = (m-(tmp_a - m * tmp_f) / (tmp_g))

refine_distance[beginidx:idx] = np.sqrt(np.abs(2 * tmp_h))

return refine_distance

def apply_update_positions(matrix_profile, mp_index, refine_distance, beginidx,
endidx, orig_index, idx_diff):
tmp_a = refine_distance[beginidx:endidx+1]
tmp_b = matrix_profile[beginidx:endidx+1]
update_pos1 = np.argwhere(tmp_a < tmp_b).flatten()

if len(update_pos1) > 0:
update_pos1 = update_pos1 + beginidx
matrix_profile[update_pos1] = refine_distance[update_pos1]
mp_index[update_pos1] = orig_index[update_pos1] + idx_diff

tmp_a = refine_distance[beginidx:endidx + 1]
tmp_b = matrix_profile[beginidx + idx_diff:endidx + idx_diff + 1]
update_pos2 = np.argwhere(tmp_a < tmp_b).flatten()

if len(update_pos2) > 0:
update_pos2 = update_pos2 + beginidx
matrix_profile[update_pos2 + idx_diff] = refine_distance[update_pos2]
mp_index[update_pos2 + idx_diff] = orig_index[update_pos2]

return (matrix_profile, mp_index)

def compute_indices(profile_len, step_size, sample_pct):
"""
Computes the indices used for profile index iteration based on the
number of samples and step size.

Parameters
----------
profile_len : int
The length of the profile to be computed.
step_size : float
step_size : float, default 0.25
The sampling interval for the window. The paper suggest 0.25 is the
most practical. It should be a float value between 0 and 1.
sample_pct : float, default = 0.1 (10%)
Number of samples to compute distances for in the MP.

Returns
-------
array_like :
The indices to compute.

"""
compute_order = np.arange(0, profile_len, step=step_size)
sample_size = int(np.ceil(len(compute_order) * sample_pct))
samples = np.random.choice(compute_order, size=sample_size, replace=False)

return samples

[docs]def prescrimp(ts, window_size, query=None, step_size=0.25, sample_pct=0.1,
random_state=None, n_jobs=1):
"""
This is the PreScrimp algorithm from the SCRIMP++ paper. It is primarly
used to compute the approximate matrix profile. In this case we use
a sample percentage to mock "the anytime/approximate nature".

Parameters
----------
ts : np.ndarray
The time series to compute the matrix profile for.
window_size : int
The window size.
query : array_like
Optionally, a query can be provided to perform a similarity join.
step_size : float, default 0.25
The sampling interval for the window. The paper suggest 0.25 is the
most practical. It should be a float value between 0 and 1.
sample_pct : float, default = 0.1 (10%)
Number of samples to compute distances for in the MP.
random_state : int, default None
Set the random seed generator for reproducible results.
n_jobs : int, Default = 1
Number of cpu cores to use.

Note
----
The matrix profiles computed from prescrimp will always be the approximate
solution.

Returns
-------
dict : profile
A MatrixProfile data structure.

>>> {
>>>    'mp': The matrix profile,
>>>    'pi': The matrix profile 1NN indices,
>>>    'rmp': The right matrix profile,
>>>    'rpi': The right matrix profile 1NN indices,
>>>    'lmp': The left matrix profile,
>>>    'lpi': The left matrix profile 1NN indices,
>>>    'metric': The distance metric computed for the mp,
>>>    'w': The window size used to compute the matrix profile,
>>>    'ez': The exclusion zone used,
>>>    'join': Flag indicating if a similarity join was computed,
>>>    'sample_pct': Percentage of samples used in computing the MP,
>>>    'data': {
>>>        'ts': Time series data,
>>>        'query': Query data if supplied
>>>    }
>>>    'class': "MatrixProfile"
>>>    'algorithm': "prescrimp"
>>>}

Raises
------
ValueError
If window_size < 4.
If window_size > query length / 2.
If ts is not a list or np.array.
If query is not a list or np.array.
If ts or query is not one dimensional.
If sample_pct is not between 0 and 1.

"""
is_join = core.is_similarity_join(ts, query)
if not is_join:
query = ts

# data conversion to np.array
ts = core.to_np_array(ts)
query = core.to_np_array(query)

# validate step_size
if not isinstance(step_size, float) or step_size > 1 or step_size < 0:
raise ValueError('step_size should be a float between 0 and 1.')

# validate sample_pct
if not isinstance(sample_pct, float) or sample_pct > 1 or sample_pct < 0:
raise ValueError('sample_pct should be a float between 0 and 1.')

# validate random_state
if random_state is not None:
try:
np.random.seed(random_state)
except:
raise ValueError('Invalid random_state value given.')

if window_size < 4:
error = "window size must be at least 4."
raise ValueError(error)

if window_size > len(query) / 2:
error = "Time series is too short relative to desired window size"
raise ValueError(error)

# precompute some common values - profile length, query length etc.
step_size = int(math.floor(window_size * step_size))
profile_length = core.get_profile_length(ts, query, window_size)
data_length = len(ts)
exclusion_zone = int(np.ceil(window_size / 4.0))

matrix_profile = np.zeros(profile_length)
mp_index = np.zeros(profile_length, dtype='int')

X = np.fft.fft(ts)
mux, sigx = core.moving_avg_std(ts, window_size)

dotproduct = np.zeros(profile_length)
refine_distance = np.full(profile_length, np.inf)
orig_index = np.arange(profile_length)

# iterate over sampled indices and update the matrix profile
# compute_order = compute_indices(profile_length, step_size, sample_pct)
compute_order = np.arange(0, profile_length, step=step_size)

for iteration, idx in enumerate(compute_order):
subsequence = ts[idx:idx + window_size]

# compute distance profile
distance_profile = calc_distance_profile(X, subsequence, data_length,
window_size, mux, sigx)

# apply exclusion zone
distance_profile = core.apply_exclusion_zone(exclusion_zone, is_join,
window_size, data_length, idx, distance_profile)

# find and store nearest neighbor
if iteration == 0:
matrix_profile = distance_profile
mp_index[:] = idx
else:
update_pos = distance_profile < matrix_profile
mp_index[update_pos] = idx
matrix_profile[update_pos] = distance_profile[update_pos]

idx_min = np.argmin(distance_profile)
matrix_profile[idx] = distance_profile[idx_min]
mp_index[idx] = idx_min
idx_nn = mp_index[idx]

# compute the target indices
idx_diff = idx_nn - idx
endidx = np.min([
profile_length - 1,
idx + step_size - 1,
profile_length - idx_diff - 1
])
beginidx = np.max([0, idx - step_size + 1, 2 - idx_diff])

# compute dot product and refine distance for the idx, begin idx
# and end idx
dotproduct = calc_dotproduct_idx(dotproduct, window_size,
matrix_profile, idx, sigx, idx_nn, mux)

dotproduct = calc_dotproduct_end_idx(ts, dotproduct, idx, window_size,
endidx, idx_nn, idx_diff)

refine_distance = calc_refine_distance_end_idx(
refine_distance, dotproduct, idx, endidx, mux, sigx, idx_nn,
idx_diff, window_size)

dotproduct = calc_dotproduct_begin_idx(
ts, dotproduct, beginidx, idx, idx_diff, window_size, idx_nn)

refine_distance = calc_refine_distance_begin_idx(
refine_distance, dotproduct, beginidx, idx, idx_diff, idx_nn,
sigx, mux, window_size)

matrix_profile, mp_index = apply_update_positions(matrix_profile,
mp_index,
refine_distance,
beginidx,
endidx,
orig_index, idx_diff)

return {
'mp': matrix_profile,
'pi': mp_index,
'rmp': None,
'rpi': None,
'lmp': None,
'lpi': None,
'w': window_size,
'ez': exclusion_zone,
'join': is_join,
'sample_pct': sample_pct,
'metric': 'euclidean',
'data': {
'ts': ts,
'query': query if is_join else None
},
'class': 'MatrixProfile',
'algorithm': 'prescrimp',
}

[docs]def scrimp_plus_plus(ts, window_size, query=None, step_size=0.25, sample_pct=0.1,
random_state=None, n_jobs=1):
"""SCRIMP++ is an anytime algorithm that computes the matrix profile for a
given time series (ts) over a given window size (m). Essentially, it allows
for an approximate solution to be provided for quicker analysis. In the
case of this implementation, sample percentage is used. An approximate
solution is given based a sample percentage from 0 to 1. The default sample
percentage is currently 10%.

This algorithm was created at the University of California Riverside. For
further academic understanding, please review this paper:

Matrix Proﬁle XI: SCRIMP++: Time Series Motif Discovery at Interactive
Speed. Yan Zhu, Chin-Chia Michael Yeh, Zachary Zimmerman, Kaveh Kamgar
Eamonn Keogh, ICDM 2018.

Parameters
----------
ts : np.ndarray
The time series to compute the matrix profile for.
window_size : int
The window size.
query : array_like
Optionally, a query can be provided to perform a similarity join.
step_size : float, default 0.25
The sampling interval for the window. The paper suggest 0.25 is the
most practical. It should be a float value between 0 and 1.
sample_pct : float, default = 0.1 (10%)
Number of samples to compute distances for in the MP.
random_state : int, default None
Set the random seed generator for reproducible results.
n_jobs : int, Default = 1
Number of cpu cores to use.

Returns
-------
dict : profile
A MatrixProfile data structure.

>>> {
>>>    'mp': The matrix profile,
>>>    'pi': The matrix profile 1NN indices,
>>>    'rmp': The right matrix profile,
>>>    'rpi': The right matrix profile 1NN indices,
>>>    'lmp': The left matrix profile,
>>>    'lpi': The left matrix profile 1NN indices,
>>>    'metric': The distance metric computed for the mp,
>>>    'w': The window size used to compute the matrix profile,
>>>    'ez': The exclusion zone used,
>>>    'join': Flag indicating if a similarity join was computed,
>>>    'sample_pct': Percentage of samples used in computing the MP,
>>>    'data': {
>>>        'ts': Time series data,
>>>        'query': Query data if supplied
>>>    }
>>>    'class': "MatrixProfile"
>>>    'algorithm': "scrimp++"
>>> }

Raises
------
ValueError
If window_size < 4.
If window_size > query length / 2.
If ts is not a list or np.array.
If query is not a list or np.array.
If ts or query is not one dimensional.
If sample_pct is not between 0 and 1.

"""
# validate random_state
if random_state is not None:
try:
np.random.seed(random_state)
except:
raise ValueError('Invalid random_state value given.')

###########################
# PreSCRIMP
###########################
profile = prescrimp(ts, window_size, query=query, step_size=step_size,
sample_pct=sample_pct, random_state=random_state, n_jobs=n_jobs)

# data conversion to np.array
ts = profile['data']['ts']
query = profile['data']['query']
if isinstance(query, type(None)):
query = ts

# precompute some common values - profile length, query length etc.
step_size = int(math.floor(window_size * step_size))
profile_length = core.get_profile_length(ts, query, window_size)
data_length = len(ts)
exclusion_zone = profile['ez']
window_size = profile['w']

# precompute some statistics on ts
data_mu, data_sig = core.moving_avg_std(ts, window_size)

###########################
# SCRIMP
###########################

# randomly sort indices for compute order
orig_index = np.arange(profile_length)
compute_order = np.copy(orig_index[orig_index > exclusion_zone])
#np.random.shuffle(compute_order)

# Only refine to provided sample_pct
sample_size = int(np.ceil(len(compute_order) * sample_pct))
compute_order = np.random.choice(compute_order, size=sample_size,
replace=False)

# initialize some values
curlastz = np.zeros(profile_length)
curdistance = np.zeros(profile_length)
dist1 = np.full(profile_length, np.inf)
dist2 = np.full(profile_length, np.inf)

for idx in compute_order:
# compute last z
curlastz[idx] = np.sum(ts[0:window_size] * ts[idx:idx + window_size])
curlastz[idx+1:] = curlastz[idx] + np.cumsum(
(ts[window_size:data_length - idx] * ts[idx + window_size:data_length]) -\
(ts[0:profile_length - idx - 1] * ts[idx:profile_length - 1])
)

# compute distances
curdistance[idx:] = np.sqrt(np.abs(
2 * (window_size - (curlastz[idx:profile_length + 1] -\
window_size * (data_mu[idx:] * data_mu[0:profile_length - idx])) /\
(data_sig[idx:] * data_sig[0:profile_length - idx]))
))

dist1[0:idx - 1] = np.inf
dist1[idx:] = curdistance[idx:]

dist2[0:profile_length - idx] = curdistance[idx:]
dist2[profile_length - idx + 2:] = np.inf

loc1 = dist1 < profile['mp']
if loc1.any():
profile['mp'][loc1] = dist1[loc1]
profile['pi'][loc1] = orig_index[loc1] - idx

loc2 = dist2 < profile['mp']
if loc2.any():
profile['mp'][loc2] = dist2[loc2]
profile['pi'][loc2] = orig_index[loc2] + idx

profile['algorithm'] = 'scrimp++'
profile['sample_pct'] = sample_pct

return profile