Pandas¶

import pandas as pd
import numpy as np

A Series 1D labeled homogeneously-typed array
A DataFrame General 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed column
The length of a Series cannot be changed, but columns can be inserted into a DataFrame
The vast majority of methods produce new objects and leave the input data untouched. In general we like to favor immutability where sensible

Series¶

Create Series¶

Pandas Series is nothing but a column

# create series from scalar
s = pd.Series(1, index=list(range(4)), dtype='float32')

# create series from list
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# create series from numpy array
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])

# create a serie of objects
data = np.array(['g','e','e','k','s'])
s = pd.Series(data)

Accessing¶

s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
s.values # numpy array
s.get(key = 11) # access element by index

s.dtype # data type

s[11] # numpy.float32, access element by explicit index, index may not start from zero

s.head(2)
s.loc[11:13] # Series, refering the explicit index

s[2:4] # Series, access elements by implicit index, the third element and the forth element
s.iloc[2:4] # Series, refering the implicit index

12    20
13    30
dtype: int64

Operations¶

s.to_numpy() # convert Series to numpy.ndarray
s.tolist() # convert Series to list

# histogram
s.value_counts()

# Binary Operations
data = pd.Series([5, 2, 3,7], index=['a', 'b', 'c', 'd'])
data1 = pd.Series([1, 6, 4, 9], index=['a', 'b', 'd', 'e'])

dataAdd = data.add(data1) # add the values in two Series by explicity index
dataAdd = data.add(data1, fill_value=0) # add the values in two Series by implicit index

dataSub = data.sub(data1)
dataSub = data.sub(data1, fill_value=0)

# Conversion Operation on Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s.dropna(inplace = True) # dropping null value
s = s.astype('int') # change data type

DataFrame¶

Create DataFrame¶

# create dataframe from dict
data = {
    'apples': [3, 2, 0, 1], 
    'oranges': [0, 3, 7, 2]
}

purchases = pd.DataFrame(data)
purchases

# assign index for each row
purchases = pd.DataFrame(data, index=['June', 'Robert', 'Lily', 'David'])
purchases

# create dataframe from numpy array
data = pd.DataFrame(np.arange(15).reshape(5, 3), index = ['R1', 'R2', 'R3', 'R4', 'R5'], columns = ['A', 'B', 'C'])
data

# create dataframe from mixed data type
data = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list([10, 11, 12, 13]), dtype='float32'), # setup index
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
data

Accessing¶

# display
purchases.head(2) # first rows
purchases.tail(2) # last rows

# get info
purchases.info()

# get row index
purchases.index # Index

# get column names
purchases.columns # Index

# get shape
purchases.shape

# get a summary of the distribution of continuous variables
purchases.describe()

# the frequency of all values in a column
purchases['apples'].value_counts()

# data type
purchases.dtypes

# get a specific cell
purchases.iloc[0, 0] # 3, access by implicit index
purchases.iat[0, 0] # getting fast access to a scalar, access by implicit index
purchases.at['June', 'apples'] # access by explicit row index and explicit column index

# get a row
purchases.iloc[0] # first row, Series
purchases.iloc[-1] # last row, Series
purchases.loc['June'] # first row, Series

# iterating over rows
for row in purchases.iterrows():
    print(row[0], row[1]) # row index, row data

# get a column
purchases.iloc[:, 0] # first column, Series
purchases.iloc[:, -1] # last column, Series
purchases.loc[:, 'apples'] # first column, Series
purchases['apples'] # first column, Series
purchases.apples # first column, Series

# iterating over columns
columns = list(purchases)
 
for i in columns:
    print(purchases[i])

# Multiple columns and rows
purchases.iloc[0:2, :1] # DataFrame
purchases.loc[['June', 'Lily']] # multiple rows, DataFrame
purchases.loc[['June', 'Lily'], ['apples', 'oranges']] # multiple columns and rows, DataFrame
purchases[['apples', 'oranges']] # multiple columns and rows, DataFrame

# Logical indexing
l = purchases['apples'] > 0 # boolean Series
purchases[l] # selecting rows with boolean Series, DataFrame

purchases[l][['apples', 'oranges']] # selecting rows with boolean Series and column names, DataFrame
purchases.loc[l, ['apples', 'oranges']] # same as above
purchases.loc[l, 'apples'] # selecting rows with boolean Series in first column, Series
purchases.loc[l, ['apples']] # selecting rows with boolean Series in first column, DataFrame

purchases.loc[purchases['apples']> 0, 'apples':'oranges'] # selecting rows with boolean Series and column names, DataFrame
purchases.loc[purchases['apples'].apply(lambda x: x%2 == 0) == True] # lambda function
purchases[purchases> 1] # selecting values from a DataFrame where a boolean condition is met
purchases[purchases['apples'].isin([2, 3])] # Using the isin() method for filtering

purchases.loc[(purchases.apples > 1) & (purchases.oranges > 0), 'apples'] # and
purchases.loc[(purchases.apples > 1) | (purchases.oranges > 0), 'apples'] # or
purchases.loc[~(purchases.apples > 1), 'apples'] # not

IO¶

# outputs
purchases.to_csv('new_purchases.csv') # save as csv filedat
purchases.to_json('new_purchases.json') # save as json file
#purchases.to_sql('new_purchases', con) # save to database

# inputs
df = pd.read_csv('new_purchases.csv', index_col=0) # read from csv file, use the first column as index
df = pd.read_csv('data.csv', skiprows=[1,2], header=None) # skip rows, no header row
df = pd.read_json('new_purchases.json') # read from json file

DataFrame Operations¶

df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(index= {'June':'Row_1', 'Robert':'Row_2', 'Lily':'Row_3', 'David':'Row_4'}) # rename index name
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column name

df = df.drop_duplicates() # remove duplicates

# convert to numpy array
array = df.to_numpy() # numpy.ndarray, with index and columns removed

# transpose
df.T # with index and columns switched

# sort by column
df.sort_values(by='Column_1')
df.sort_values(by=['Column_1', 'Column_2'])

# set a scalar
df.iat[1, 0] = 0
df.at['Row_1', 'Column_1'] = 0

# change a column by numpy array
df.loc[:, 'Column_1'] = np.arange(df.shape[0])*10
df.iloc[:, 1] = np.arange(df.shape[0])*10

# make change with logical indexing
df.loc[df['apples'] > 1, 'oranges'] = 100
df.loc[df.apples > 1, ['oranges']] = 100

# make change with where
df['logic'] = np.where(df['oranges'] > 2, 'high', 'low') # where(logicalExpression, trueValue, falseValue)

# filling missing data
df.fillna(value=5)

# get the boolean mask where values are nan
pd.isna(df)

# applying functions to the data
df.apply(np.cumsum) # DataFrame

df.apply(lambda x: x.max() - x.min()) # Series

# change data type
df['C'] = df['C'].astype(int)

Insertion¶

# append rows
df = pd.read_csv('data.csv', index_col=0)

# insert numpy array
data = np.arange(6).reshape(3, 2)
data = pd.DataFrame(data, index = [1, 2, 3], columns = df.columns)
df = df.append(data) # attach dataframe

# insert Series as a row
s = pd.Series([10, 100], index = df.columns)
df = df.append(s, ignore_index=True) # index is reset

# insert DataFrame as a row
temp = pd.DataFrame(np.array([10, 20]).reshape(1, -1), columns = df.columns)
df = df.append(temp, ignore_index=True)
df

# merge two dataframes along row axis
data = {
    'apples': [3, 2, 0, 1], 
    'oranges': [0, 3, 7, 2]
}
df2 = pd.DataFrame(data)
frames = pd.concat([df, df2])
frames

df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column name

# insert a column with same value in each index
df['Column_4'] = 50

# insert a column with values based on other columns
df['Percentage'] =  (df['Column_2'] / df['Column_4'] ) * 100
df = df.assign(Percentage_2 =lambda x: (x['Column_2'] / x['Column_4'] ) * 100)

# insert numpy array as a single column
df.loc[:, 'Column_3'] = np.arange(df.shape[0])*10
df.insert(1, "Column_3", np.arange(df.shape[0])*10) # insert a column as the second column

# insert Series as a single column
# setting a new column automatically aligns the data by the indexes
s = pd.Series(np.arange(10, 50, 10), index = df.index)
df['Column_3'] = s
df.insert(1, "Column_3", s) # save as the above statement

# insert multiple columns
df = df.assign(Marks = range(df.shape[0]),  Total = [50]*df.shape[0])

# concatenate dataframes along column axis
#df2 = df2.set_index(df.index)
frames = pd.concat([df, df2], axis=1)

data = df[['Column_1', 'Column_2']].join(df2)
data

Deletion¶

# delete columns
df = df.drop(['Column_1'], axis = 1)

# delete rows
df = df.drop(['Row_1', 'Row_2'], axis=0)

# remove first two rows
df = df.iloc[2:]

# drop any rows that have missing data
df.dropna(how='any')

Statistics¶

# statistics
df.mean() # Series

df.describe()

Database Operations¶

# merge
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo1', 'foo2'], 'rval': [4, 5]})

pd.merge(left, right, on='key')

# groupby
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})

df.groupby('A').sum()

Plotting¶

# plot with built-in plot in Pandas
df.plot(kind='line',x='Column_1',y='Column_2')

# plot with matplotlib
import matplotlib.pyplot as plt
plt.plot(df['Column_1'], df['Column_2'])

	C	D
A
bar	-0.226824	-3.228073
foo	1.838853	0.994510

	A	B	C	D	E	F
10	1.0	2013-01-02	1.0	3	test	foo
11	1.0	2013-01-02	1.0	3	train	foo
12	1.0	2013-01-02	1.0	3	test	foo
13	1.0	2013-01-02	1.0	3	train	foo

	apples	oranges
0	3	0
1	2	3
2	0	7
3	1	2

	apples	oranges
June	3	0
Robert	2	3
Lily	0	7
David	1	2

	apples	oranges
David	1	2
June	3	0
Lily	0	7
Robert	2	3

	Row_1	Row_2	Row_3	Row_4
Column_1	3	2	0	1
Column_2	0	3	7	2

Pandas¶

Series¶

Create Series¶

Accessing¶

Operations¶

DataFrame¶

Create DataFrame¶

Accessing¶

IO¶

DataFrame Operations¶

Insertion¶

Deletion¶

Statistics¶

Database Operations¶

Plotting¶

Reference¶