from scipy import stats, special
import numpy as np
import matplotlib.pyplot as plt
# two independent samples of 100 draws from Uniform(0, 1)
sample_1 = stats.uniform.rvs(size=100)
sample_2 = stats.uniform.rvs(size=100)
# plot both samples against their index
fig, ax = plt.subplots()
ax.plot(range(100), sample_1)
ax.plot(range(100), sample_2)
plt.show()
With p = 2, the Minkowski distance is the Euclidean distance. The lower the distance, the higher the similarity between the two samples.
# Euclidean distance
from scipy.spatial import distance
distance.euclidean(sample_1, sample_2)
# or
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'euclidean')
# or
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'minkowski', p=2.)
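As a sanity check, the Minkowski formula (sum_i |u_i - v_i|^p)^(1/p) with p = 2 can be evaluated directly; this minimal numpy sketch should match the cdist results above:
# Euclidean distance computed from the Minkowski definition with p = 2
np.sum(np.abs(sample_1 - sample_2) ** 2) ** 0.5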
# Standardized Euclidean distance: each dimension is scaled by its variance;
# V=None estimates the variances from the inputs
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'seuclidean', V=None)
array([[14.14213562]])
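With V=None, cdist estimates the per-dimension variance from the two stacked rows (sample variance with ddof=1 in recent scipy versions); a minimal sketch with an explicit variance vector should reproduce the number above:
# estimate the per-dimension variance over both samples, then standardize
V = np.var(np.vstack([sample_1, sample_2]), axis=0, ddof=1)
np.sqrt(np.sum((sample_1 - sample_2) ** 2 / V))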
# Manhattan distance
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'minkowski', p=1.)
# or
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'cityblock')
array([[34.0490069]])
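The same value falls out of the definition directly, as a one-line check:
# Manhattan distance is the sum of absolute coordinate differences
np.sum(np.abs(sample_1 - sample_2))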
# Cosine distance (1 - cosine similarity)
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'cosine')
array([[0.2620535]])
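Cosine distance is one minus the cosine similarity; a minimal numpy check:
# 1 - (u . v) / (||u|| ||v||)
1 - np.dot(sample_1, sample_2) / (np.linalg.norm(sample_1) * np.linalg.norm(sample_2))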
# Correlation distance (1 - Pearson correlation)
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'correlation')
array([[1.04351762]])
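Since correlation distance is one minus the Pearson correlation, np.corrcoef should agree:
# 1 - Pearson r between the two samples
1 - np.corrcoef(sample_1, sample_2)[0, 1]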
# Chebyshev distance (maximum coordinate-wise difference)
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'chebyshev')
array([[0.97902329]])
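Chebyshev distance is just the largest coordinate-wise difference:
# max |u_i - v_i|
np.max(np.abs(sample_1 - sample_2))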
# Mahalanobis distance: VI=None estimates the inverse covariance from the data,
# which needs more observations than dimensions (here, more than 100 rows), so a
# single pair of rows raises a ValueError; pass VI explicitly instead
# (the identity matrix reduces it to plain Euclidean distance)
distance.cdist(sample_1.reshape(1, -1), sample_2.reshape(1, -1), 'mahalanobis', VI=np.eye(100))
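A minimal sketch of the intended usage, with hypothetical data where the covariance is estimable (500 observations of 3 features):
# estimate the inverse covariance from many low-dimensional observations,
# then compare two individual points
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))                  # 500 observations, 3 dimensions
VI = np.linalg.inv(np.cov(X, rowvar=False))    # inverse covariance matrix
distance.cdist(X[:1], X[1:2], 'mahalanobis', VI=VI)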
# two-sample Kolmogorov-Smirnov test of whether the samples come from the same distribution
stats.kstest(sample_1, sample_2)
KstestResult(statistic=0.1, pvalue=0.7020569828664881)
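Passing a second array to stats.kstest runs the two-sample test; stats.ks_2samp is the explicit form and should return the same result:
# explicit two-sample KS test
stats.ks_2samp(sample_1, sample_2)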
from sklearn import metrics
# mutual_info_score expects discrete labels; with continuous inputs every value
# is unique, so the result degenerates to log(n) = ln(100) regardless of any
# actual dependence between the samples
metrics.mutual_info_score(sample_1, sample_2)
UserWarning: Clustering metrics expects discrete values but received continuous values for label, and continuous values for target
4.605170185988092
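One reasonable fix (a sketch assuming 10 equal-width bins on [0, 1]) is to discretize the samples before computing mutual information:
# bin the continuous values into discrete labels, then score
edges = np.linspace(0, 1, 11)
metrics.mutual_info_score(np.digitize(sample_1, edges), np.digitize(sample_2, edges))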
from scipy.stats import wasserstein_distance
# Wasserstein (earth mover's) distance between the two empirical distributions
wasserstein_distance(sample_1, sample_2)
0.02437976189308293
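For two equal-sized samples with uniform weights, the 1-D Wasserstein distance reduces to the mean absolute difference between the sorted samples, so this one-liner should give the same number:
# area between the two empirical CDFs, via order statistics
np.mean(np.abs(np.sort(sample_1) - np.sort(sample_2)))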