import plotly
# Prepare Plotly for use inside the notebook before any figure is shown.
# NOTE(review): presumably this loads plotly.js into the notebook front end -- confirm against the Plotly docs.
plotly.offline.init_notebook_mode()
import pandas as pd
# Load the weather records, keep only the rows for the Albury station and
# index them by date (the 'Date' column is kept as a regular column too).
data = (
    pd.read_csv('weatherAUS.csv', encoding='utf-8')
    .loc[lambda frame: frame['Location'] == 'Albury']
    .set_index('Date', drop=False)
)
# Preview the first rows
data.head()
Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | |||||||||||||||||||||
2008-12-01 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
2008-12-02 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
2008-12-03 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
2008-12-04 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
2008-12-05 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
import plotly.express as px
# Line chart of the minimum temperature over the first 800 records
fig = px.line(data.head(800), x='Date', y='MinTemp')
fig.show()
* both variables should be normally distributed
* linearity: a straight-line relationship between the two variables
* homoscedasticity: the data are equally spread about the regression line
* $n_{c}$: number of concordant pairs; $n_{d}$: number of discordant pairs; $n$: number of pairs
* $d_{i}$: the difference between the ranks of $x_{i}$ and $y_{i}$
* $n$: number of samples
from scipy import stats
# Pearson correlation between min and max temperature over the first 100 records
stats.pearsonr(data['MinTemp'].iloc[:100], data['MaxTemp'].iloc[:100])
PearsonRResult(statistic=0.653071047724115, pvalue=1.7892482507532763e-13)
import seaborn as sns
# Joint distribution of minimum temperature and 9am humidity, markers outlined in red
sns.jointplot(data=data, x='MinTemp', y='Humidity9am', edgecolor='red')
<seaborn.axisgrid.JointGrid at 0x132437290>
from dython.nominal import associations
# Pairwise association matrix for four numeric weather attributes
associations(data.loc[:, ['MinTemp', 'MaxTemp', 'WindGustSpeed', 'Humidity9am']])
{'corr': MinTemp MaxTemp WindGustSpeed Humidity9am MinTemp 1.000000 0.793268 0.402512 -0.544578 MaxTemp 0.793268 1.000000 0.269381 -0.697633 WindGustSpeed 0.402512 0.269381 1.000000 -0.291787 Humidity9am -0.544578 -0.697633 -0.291787 1.000000, 'ax': <Axes: >}
import numpy as np
from numpy import polyfit
def get_trend(X, degree=3):
    """Estimate the trend of a 1-D series with a least-squares polynomial fit.

    The sample positions 0, 1, ..., len(X) - 1 serve as the independent
    variable. Non-finite samples (NaN/inf) are left out of the fit so that
    gaps in the data do not spoil the coefficients; the fitted polynomial is
    still evaluated at every position, so the result always has len(X)
    entries.

    Args:
        X (1d array-like): attribute values, one per time step.
        degree (int): degree of the fitted polynomial.

    Returns:
        numpy.ndarray: trend value at each position of X.
    """
    values = np.asarray(X, dtype=float)
    indices = np.arange(len(values))
    # Fit on usable samples only; for all-finite input this is the whole series.
    finite = np.isfinite(values)
    coef = polyfit(indices[finite], values[finite], degree)
    poly = np.poly1d(coef)
    return poly(indices)
def get_seasonality(X, periods=4, degree=3, days_per_year=365):
    """Estimate a repeating seasonal pattern with a polynomial fit.

    Each sample position is folded to its offset within one period
    (``position % (days_per_year / periods)``). A single polynomial of the
    given degree is fitted to the values against those offsets and then
    evaluated at every offset, so the same curve repeats in each period.
    Non-finite samples (NaN/inf) are left out of the fit.

    Args:
        X (1d array-like): attribute values, e.g. a series with its trend
            already subtracted.
        periods (int): number of periods per year.
        degree (int): degree of the fitted polynomial.
        days_per_year (int or float): number of samples that make up one
            year; the default of 365 corresponds to daily data.

    Returns:
        numpy.ndarray: seasonal value at each position of X.
    """
    values = np.asarray(X, dtype=float)
    period_length = days_per_year / periods
    # Offset of every sample inside its own period
    phase = np.arange(len(values)) % period_length
    # Fit on usable samples only; for all-finite input this is the whole series.
    finite = np.isfinite(values)
    coef = polyfit(phase[finite], values[finite], degree)
    poly = np.poly1d(coef)
    return poly(phase)
# Work on rows 60-199 of the minimum temperature and discard non-finite readings
data_temp = data['MinTemp'].iloc[60:200]
idx = np.isfinite(data_temp)
data_temp = data_temp.loc[idx]

# Trend: polynomial fit over the raw values
data_trend = get_trend(data_temp.to_numpy())

# Seasonality: estimated on what remains once the trend has been subtracted
data_seasonality = get_seasonality(data_temp - data_trend)

# Sanity check: the three components should have matching shapes
data_temp.shape, data_trend.shape, data_seasonality.shape

# Gather the components in one frame and draw them on a shared axis
data_temp = pd.DataFrame(
    {
        'Temp': data_temp.to_numpy(),
        'Trend': data_trend,
        'Seasonality': data_seasonality,
    }
)
data_temp.loc[:, ['Temp', 'Trend', 'Seasonality']].plot()
<Axes: >
from statsmodels.tsa.seasonal import STL
# STL decomposition of the first 200 minimum-temperature readings
data_temp = data['MinTemp'].iloc[:200]
# NOTE(review): `period` is the cycle length handed to STL; 4 looks short for daily data -- confirm intent.
res = STL(data_temp, period=4).fit()
# The fitted components are available as res.trend, res.seasonal and res.resid;
# the residual is the series minus its seasonal and trend components.
plot = res.plot()
from matplotlib import pyplot
from pandas.plotting import lag_plot
# Scatter of y(t) against y(t + 1) for the first 100 minimum-temperature readings
lag_plot(data['MinTemp'].iloc[:100], lag=1)
<Axes: xlabel='y(t)', ylabel='y(t + 1)'>
from pandas.plotting import autocorrelation_plot
# Autocorrelation plot of the first 100 minimum-temperature readings
ax = autocorrelation_plot(data['MinTemp'].iloc[:100])
from statsmodels.tsa import stattools
# Augmented Dickey-Fuller test on the first 200 minimum-temperature readings
stattools.adfuller(data['MinTemp'].iloc[:200])
(-0.2155323177456027, 0.9366447039032281, 9, 190, {'1%': -3.4652439354133255, '5%': -2.8768752281673717, '10%': -2.574944653739612}, 919.2489508353337)