import pandas as pd
import numpy as np
# Load the weather observations and turn the 'Date' strings into datetimes.
data = pd.read_csv('weatherAUS.csv')
data['Date'] = pd.to_datetime(data['Date'])
# Index the rows by date under the index name 'Index', keeping 'Date'
# available as an ordinary column for the x= plotting examples.
data = data.set_index(data['Date'].rename('Index'))
data.head()
Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Index | |||||||||||||||||||||
2008-12-01 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
2008-12-02 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
2008-12-03 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
2008-12-04 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
2008-12-05 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
# DataFrame.plot hands back the matplotlib Axes it drew on
# (matplotlib.axes._subplots.AxesSubplot), which is used here to set the
# axis labels explicitly.
first_rows = data.iloc[:1000, :]
ax = first_rows.plot(x='Date', y='MinTemp')
ax.set_xlabel('Date Label')
ax.set_ylabel('Temperature')
Text(0, 0.5, 'Temperature')
# DataFrame line plot: one y column against an explicit x column
data.head(1000).plot(x='Date', y='MinTemp')
<AxesSubplot: xlabel='Date'>
# DataFrame line plot: several y columns drawn on the same Axes
temperature_columns = ['MinTemp', 'MaxTemp']
data.head(1000).plot(x='Date', y=temperature_columns)
<AxesSubplot: xlabel='Date'>
# Series line plot: with no x given, the (date) index is used as the x axis
data['MinTemp'].iloc[:1000].plot()
<AxesSubplot: xlabel='Index'>
# Several columns plotted against the index
data[['MinTemp', 'MaxTemp']].iloc[:1000].plot()
<AxesSubplot: xlabel='Index'>
# Bar chart from a Series: value_counts() gives one count per wind direction
data_counts = data['WindDir9am'].value_counts()
data_counts.plot(kind='bar')
<AxesSubplot: >
# Bar chart from a DataFrame: turn the counts into a two-column frame
# ('Type' = wind direction, 'Count' = number of rows) and name the axes columns
data_counts = (
    data['WindDir9am']
    .value_counts()
    .rename_axis('Type')
    .reset_index(name='Count')
)
data_counts.plot(kind='bar', x='Type', y='Count', rot=90)
<AxesSubplot: xlabel='Type'>
# Histogram of a single Series using 30 bins
data['MinTemp'].plot(kind='hist', bins=30)
<AxesSubplot: ylabel='Frequency'>
# Overlaid histograms of two columns; alpha=0.5 keeps both visible where they overlap
temperatures = data[['MinTemp', 'MaxTemp']]
temperatures.plot(kind='hist', alpha=0.5, bins=30)
<AxesSubplot: ylabel='Frequency'>
# Stacked histogram of the two temperature columns
temperatures = data[['MinTemp', 'MaxTemp']]
temperatures.plot(kind='hist', stacked=True, bins=30)
<AxesSubplot: ylabel='Frequency'>
# One histogram of MinTemp per group, grouping the first 6000 rows by 'Location'
data.head(6000).hist(column='MinTemp', by='Location')
array([<AxesSubplot: title={'center': 'Albury'}>, <AxesSubplot: title={'center': 'BadgerysCreek'}>], dtype=object)
# Box plot with outliers
# IQR = Q3 - Q1
# lower bound = Q1 - 1.5 * IQR
# upper bound = Q3 + 1.5 * IQR
# Points that fall outside the lower and upper bounds are labeled as outliers
color = {
    'boxes': 'DarkGreen',
    'whiskers': 'DarkOrange',
    'medians': 'DarkBlue',
    'caps': 'red',
}
data[['MinTemp', 'MaxTemp']].plot(kind='box', color=color, sym='b+')
<AxesSubplot: >
# Area plot with the index as x; stacked=False overlays the two humidity series
humidity = data[['Humidity9am', 'Humidity3pm']].iloc[:100]
humidity.plot(kind='area', stacked=False)
<AxesSubplot: xlabel='Index'>
# Area plot with an explicit column ('Date') as x
humidity_columns = ['Humidity9am', 'Humidity3pm']
data.head(100).plot.area(x='Date', y=humidity_columns, stacked=False)
<AxesSubplot: xlabel='Date'>
# Scatter of MinTemp against Date for the first 100 rows
first_hundred = data.head(100)
first_hundred.plot.scatter(x='Date', y='MinTemp', rot=45)
<AxesSubplot: xlabel='Date', ylabel='MinTemp'>
# Overlay two scatter series on one Axes: the first call creates the Axes,
# the second draws onto it by passing ax=ax.
ax = data.iloc[:100, :].plot.scatter(x='Date', y='MinTemp', label='Minimum Temperature', rot=45)
data.iloc[:100, :].plot.scatter(x='Date', y='MaxTemp', color='DarkGreen', label='Maximum Temperature', ax=ax, rot=45)
# Hexagonal-bin plot of MaxTemp against MinTemp for the first 100 rows
data.head(100).plot(kind='hexbin', x='MinTemp', y='MaxTemp', gridsize=25)
<AxesSubplot: xlabel='MinTemp', ylabel='MaxTemp'>
# Pie chart from a Series of per-direction counts
direction_counts = data['WindDir9am'].value_counts()
direction_counts.plot(kind='pie', figsize=(6, 6))
<AxesSubplot: ylabel='WindDir9am'>
# Kernel density estimate of the MinTemp distribution
data['MinTemp'].plot(kind='kde')
<AxesSubplot: ylabel='Density'>
from pandas.plotting import andrews_curves
# Load the Iris data under its own name. Rebinding `data` here would replace
# the weather DataFrame, and the plotting calls that follow select weather
# columns (MinTemp, Rainfall, RainTomorrow, ...) from `data`.
iris = pd.read_csv('Iris.csv')
iris.head()
Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|---|
0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# Andrews curves of four numeric columns, one curve per row, grouped by the
# 'RainTomorrow' class column
andrews_columns = ['MinTemp', 'Rainfall', 'WindGustSpeed', 'Humidity9am', 'RainTomorrow']
andrews_curves(data[andrews_columns], 'RainTomorrow')
#andrews_curves(data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']], 'Species')
<AxesSubplot: >
from pandas.plotting import parallel_coordinates
# Parallel-coordinates plot of four numeric columns, grouped by 'RainTomorrow'
parallel_columns = ['MinTemp', 'Rainfall', 'WindGustSpeed', 'Humidity9am', 'RainTomorrow']
parallel_coordinates(data[parallel_columns], 'RainTomorrow')
<AxesSubplot: >
from pandas.plotting import lag_plot
# Lag plot of MinTemp: y(t) on the x axis against y(t + 1) on the y axis
min_temp_for_lag = data['MinTemp']
lag_plot(min_temp_for_lag, lag=1)
<AxesSubplot: xlabel='y(t)', ylabel='y(t + 1)'>
# Manually calculate the autocorrelation coefficient of MinTemp for each lag
# value from 0 to 5999, then plot coefficient against lag.
min_temp = data['MinTemp']  # looked up once instead of on every iteration
correlations = [min_temp.autocorr(lag=i) for i in range(6000)]
s = pd.Series(correlations)
ax = s.plot()
ax.set_xlabel('Lag')
Text(0.5, 0, 'Lag')
# Use pandas' built-in autocorrelation plot.
# Note: if the series contains any missing value, the autocorrelation plot
# is not drawn, hence only the first 100 rows are used here.
from pandas.plotting import autocorrelation_plot
ax = autocorrelation_plot(data['MinTemp'].iloc[:100])
from pandas.plotting import bootstrap_plot
# size: number of data points drawn in each sampling
# samples: number of times the sampling is repeated
min_temp_series = data['MinTemp']
bootstrap_plot(min_temp_series, size=50, samples=500)
from pandas.plotting import radviz
# RadViz plot of four numeric columns, grouped by 'RainTomorrow'
radviz_columns = ['MinTemp', 'Rainfall', 'WindGustSpeed', 'Humidity9am', 'RainTomorrow']
radviz(data[radviz_columns], 'RainTomorrow')
<AxesSubplot: >