Pandas

In [2]:
import pandas as pd
import numpy as np
  • A Series is a 1D labeled, homogeneously-typed array
  • A DataFrame is a general 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed columns
  • The length of a Series cannot be changed, but columns can be inserted into a DataFrame
  • The vast majority of methods produce new objects and leave the input data untouched; in general, pandas favors immutability where sensible (see the sketch below)
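A minimal sketch of that immutability point (assuming standard pandas behavior): sort_values returns a new Series and leaves the original untouched.
In [ ]:
s = pd.Series([3, 1, 2])
s2 = s.sort_values() # returns a new, sorted Series
print(s.tolist()) # [3, 1, 2] -- the original is unchanged
print(s2.tolist()) # [1, 2, 3]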

Series

Create Series

  • A pandas Series is essentially a single labeled column
In [419]:
# create series from scalar
s = pd.Series(1, index=list(range(4)), dtype='float32')
In [420]:
# create series from list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
In [8]:
# create series from numpy array
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
In [9]:
# create a series of strings (object dtype)
data = np.array(['g','e','e','k','s'])
s = pd.Series(data)
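One more common constructor, sketched here for completeness: building a Series from a dict, whose keys become the index labels.
In [ ]:
# create series from dict; the keys become the index
s = pd.Series({'a': 1, 'b': 2, 'c': 3})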

Accessing

In [25]:
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
s.values # numpy array
s.get(key = 11) # access element by index

s.dtype # data type

s[11] # scalar, access element by explicit index (the index labels need not start from zero)

s.head(2)
s.loc[11:13] # Series, referring to the explicit index

s[2:4] # Series, access elements by implicit index: the third and fourth elements
s.iloc[2:4] # Series, referring to the implicit index
Out[25]:
12    20
13    30
dtype: int64
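A sketch of a common pitfall with the two slicing styles above: .loc slices by label and includes the stop label, while .iloc slices by position and excludes the stop position.
In [ ]:
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
len(s.loc[11:13]) # 3 -- labels 11, 12 and 13 are all included
len(s.iloc[2:4]) # 2 -- positions 2 and 3 only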

Operations

In [ ]:
s.to_numpy() # convert Series to numpy.ndarray
s.tolist() # convert Series to list
In [ ]:
# histogram-like counts: the frequency of each value
s.value_counts()
In [32]:
# Binary Operations
data = pd.Series([5, 2, 3,7], index=['a', 'b', 'c', 'd'])
data1 = pd.Series([1, 6, 4, 9], index=['a', 'b', 'd', 'e'])

dataAdd = data.add(data1) # add values by aligning explicit indexes; non-matching labels give NaN
dataAdd = data.add(data1, fill_value=0) # treat labels missing from either Series as 0 instead of NaN

dataSub = data.sub(data1)
dataSub = data.sub(data1, fill_value=0)
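For illustration, the alignment behavior of the add() calls above: labels present in only one Series produce NaN unless fill_value is given.
In [ ]:
data.add(data1) # 'c' and 'e' become NaN
data.add(data1, fill_value=0) # 'c' -> 3.0 and 'e' -> 9.0 instead of NaN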
In [33]:
# Conversion Operation on Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s.dropna(inplace = True) # drop null values
s = s.astype('int') # change data type

DataFrame

Create DataFrame

In [5]:
# create dataframe from dict
data = {
    'apples': [3, 2, 0, 1], 
    'oranges': [0, 3, 7, 2]
}

purchases = pd.DataFrame(data)
purchases
Out[5]:
   apples  oranges
0       3        0
1       2        3
2       0        7
3       1        2
In [8]:
# assign index for each row
purchases = pd.DataFrame(data, index=['June', 'Robert', 'Lily', 'David'])
purchases
Out[8]:
        apples  oranges
June         3        0
Robert       2        3
Lily         0        7
David        1        2
In [35]:
# create dataframe from numpy array
data = pd.DataFrame(np.arange(15).reshape(5, 3), index = ['R1', 'R2', 'R3', 'R4', 'R5'], columns = ['A', 'B', 'C'])
data
Out[35]:
     A   B   C
R1   0   1   2
R2   3   4   5
R3   6   7   8
R4   9  10  11
R5  12  13  14
In [37]:
# create dataframe from mixed data type
data = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list([10, 11, 12, 13]), dtype='float32'), # setup index
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
data
Out[37]:
      A          B    C  D      E    F
10  1.0 2013-01-02  1.0  3   test  foo
11  1.0 2013-01-02  1.0  3  train  foo
12  1.0 2013-01-02  1.0  3   test  foo
13  1.0 2013-01-02  1.0  3  train  foo
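The per-column dtypes of the frame above illustrate the "heterogeneously-typed columns" point from the introduction: each column keeps its own type.
In [ ]:
data.dtypes # one dtype per column: float64, datetime64[ns], float32, int32, category, object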

Accessing

In [ ]:
# display
purchases.head(2) # first rows
purchases.tail(2) # last rows

# get info
purchases.info()

# get row index
purchases.index # Index

# get column names
purchases.columns # Index

# get shape
purchases.shape

# get a summary of the distribution of continuous variables
purchases.describe()

# the frequency of all values in a column
purchases['apples'].value_counts()

# data type
purchases.dtypes
In [ ]:
# get a specific cell
purchases.iloc[0, 0] # 3, access by implicit index
purchases.iat[0, 0] # getting fast access to a scalar, access by implicit index
purchases.at['June', 'apples'] # access by explicit row index and explicit column index
In [ ]:
# get a row
purchases.iloc[0] # first row, Series
purchases.iloc[-1] # last row, Series
purchases.loc['June'] # first row, Series
In [ ]:
# iterating over rows
for index, row in purchases.iterrows(): # yields (label, Series) pairs
    print(index, row) # row index, row data
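A commonly used, faster alternative to iterrows() is itertuples(), sketched below; each row arrives as a namedtuple with the column names as attributes.
In [ ]:
for row in purchases.itertuples():
    print(row.Index, row.apples, row.oranges) # row label, then the cell values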
In [ ]:
# get a column
purchases.iloc[:, 0] # first column, Series
purchases.iloc[:, -1] # last column, Series
purchases.loc[:, 'apples'] # first column, Series
purchases['apples'] # first column, Series
purchases.apples # first column, Series
In [ ]:
# iterating over columns (iterating a DataFrame yields its column labels)
columns = list(purchases)
 
for i in columns:
    print(purchases[i])
In [ ]:
# Multiple columns and rows
purchases.iloc[0:2, :1] # DataFrame
purchases.loc[['June', 'Lily']] # multiple rows, DataFrame
purchases.loc[['June', 'Lily'], ['apples', 'oranges']] # multiple columns and rows, DataFrame
purchases[['apples', 'oranges']] # multiple columns, DataFrame
In [ ]:
# Logical indexing
l = purchases['apples'] > 0 # boolean Series
purchases[l] # selecting rows with boolean Series, DataFrame

purchases[l][['apples', 'oranges']] # selecting rows with boolean Series and column names, DataFrame
purchases.loc[l, ['apples', 'oranges']] # same as above
purchases.loc[l, 'apples'] # selecting rows with boolean Series in first column, Series
purchases.loc[l, ['apples']] # selecting rows with boolean Series in first column, DataFrame

purchases.loc[purchases['apples'] > 0, 'apples':'oranges'] # boolean row filter plus a label slice of columns, DataFrame
purchases.loc[purchases['apples'].apply(lambda x: x % 2 == 0)] # filtering rows with a lambda function
purchases[purchases > 1] # selecting values where the condition holds; non-matching cells become NaN
purchases[purchases['apples'].isin([2, 3])] # Using the isin() method for filtering

purchases.loc[(purchases.apples > 1) & (purchases.oranges > 0), 'apples'] # and
purchases.loc[(purchases.apples > 1) | (purchases.oranges > 0), 'apples'] # or
purchases.loc[~(purchases.apples > 1), 'apples'] # not
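The query() method offers an equivalent, often more readable string syntax for the boolean filters above (a sketch, assuming standard pandas):
In [ ]:
purchases.query('apples > 1 and oranges > 0') # same result as the &-based .loc filter above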

IO

In [75]:
# outputs
purchases.to_csv('new_purchases.csv') # save as csv file
purchases.to_json('new_purchases.json') # save as json file
#purchases.to_sql('new_purchases', con) # save to database

# inputs
df = pd.read_csv('new_purchases.csv', index_col=0) # read from csv file, use the first column as index
df = pd.read_csv('data.csv', skiprows=[1,2], header=None) # skip rows, no header row
df = pd.read_json('new_purchases.json') # read from json file
Out[75]:
        apples  oranges
David        1        2
June         3        0
Lily         0        7
Robert       2        3
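A minimal sketch of the commented-out to_sql() call above, using an in-memory SQLite database so it runs without a database server:
In [ ]:
import sqlite3
con = sqlite3.connect(':memory:')
purchases.to_sql('new_purchases', con) # the index is stored in a column named 'index' by default
df = pd.read_sql('SELECT * FROM new_purchases', con, index_col='index')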

DataFrame Operations

In [3]:
df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(index= {'June':'Row_1', 'Robert':'Row_2', 'Lily':'Row_3', 'David':'Row_4'}) # rename row index labels
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column names

df = df.drop_duplicates() # remove duplicates
In [53]:
# convert to numpy array
array = df.to_numpy() # numpy.ndarray, with index and columns removed
In [57]:
# transpose
df.T # with index and columns switched
Out[57]:
          Row_1  Row_2  Row_3  Row_4
Column_1      3      2      0      1
Column_2      0      3      7      2
In [ ]:
# sort by column
df.sort_values(by='Column_1')
df.sort_values(by=['Column_1', 'Column_2'])
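A couple of closely related sorting calls, sketched for completeness:
In [ ]:
df.sort_values(by='Column_1', ascending=False) # sort in descending order
df.sort_index() # sort by row labels instead of values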
In [318]:
# set a scalar
df.iat[1, 0] = 0
df.at['Row_1', 'Column_1'] = 0
In [ ]:
# change a column by numpy array
df.loc[:, 'Column_1'] = np.arange(df.shape[0])*10
df.iloc[:, 1] = np.arange(df.shape[0])*10

# make change with logical indexing (the columns were renamed to Column_1/Column_2 above)
df.loc[df['Column_1'] > 1, 'Column_2'] = 100
df.loc[df.Column_1 > 1, ['Column_2']] = 100

# make change with where
df['logic'] = np.where(df['Column_2'] > 2, 'high', 'low') # np.where(condition, valueIfTrue, valueIfFalse)
In [ ]:
# filling missing data (returns a new DataFrame)
df.fillna(value=5)

# get the boolean mask where values are nan
pd.isna(df)
In [ ]:
# applying functions to the data
df.apply(np.cumsum) # DataFrame

df.apply(lambda x: x.max() - x.min()) # Series
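For element-wise transformations of a single column, Series.map is the usual tool (a sketch on one of the columns above):
In [ ]:
df['Column_1'].map(lambda x: x * 2) # apply a function to every element of one column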
In [ ]:
# change data type
df['Column_1'] = df['Column_1'].astype(float) # cast a column to another dtype

Insertion

In [5]:
# append rows
df = pd.read_csv('data.csv', index_col=0)

# insert numpy array (DataFrame.append was removed in pandas 2.0, so use pd.concat)
data = np.arange(6).reshape(3, 2)
data = pd.DataFrame(data, index = [1, 2, 3], columns = df.columns)
df = pd.concat([df, data]) # attach dataframe

# insert Series as a row (wrap it in a one-row DataFrame first)
s = pd.Series([10, 100], index = df.columns)
df = pd.concat([df, s.to_frame().T], ignore_index=True) # index is reset

# insert DataFrame as a row
temp = pd.DataFrame(np.array([10, 20]).reshape(1, -1), columns = df.columns)
df = pd.concat([df, temp], ignore_index=True)
df

# merge two dataframes along row axis
data = {
    'apples': [3, 2, 0, 1], 
    'oranges': [0, 3, 7, 2]
}
df2 = pd.DataFrame(data)
frames = pd.concat([df, df2])
frames
Out[5]:
   apples  oranges
0       3        0
1       2        3
2       0        7
3       1        2
4       0        1
5       2        3
6       4        5
7      10      100
8      10       20
0       3        0
1       2        3
2       0        7
3       1        2
In [69]:
df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column name

# insert a column with same value in each index
df['Column_4'] = 50

# insert a column with values based on other columns
df['Percentage'] =  (df['Column_2'] / df['Column_4'] ) * 100
df = df.assign(Percentage_2 =lambda x: (x['Column_2'] / x['Column_4'] ) * 100)

# insert numpy array as a single column
df.loc[:, 'Column_3'] = np.arange(df.shape[0])*10
#df.insert(1, "Column_3", np.arange(df.shape[0])*10) # alternative: insert at position 1; raises if the column already exists

# insert Series as a single column
# setting a new column automatically aligns the data by the indexes
s = pd.Series(np.arange(10, 50, 10), index = df.index)
df['Column_3'] = s
#df.insert(1, "Column_3", s) # same as the statement above; raises if the column already exists

# insert multiple columns
df = df.assign(Marks = range(df.shape[0]),  Total = [50]*df.shape[0])

# concatenate dataframes along column axis
df2 = df2.set_index(df.index) # align df2's index with df's first, otherwise the rows will not match up
frames = pd.concat([df, df2], axis=1)

data = df[['Column_1', 'Column_2']].join(df2)
data
Out[69]:
        Column_1  Column_2  apples  oranges
June           3         0       3        0
Robert         2         3       2        3
Lily           0         7       0        7
David          1         2       1        2

Deletion

In [393]:
# delete columns
df = df.drop(['Column_1'], axis = 1)
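An equivalent keyword form (available since pandas 0.21) avoids remembering the axis number:
In [ ]:
df = df.drop(columns=['Column_1']) # same as axis=1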
In [ ]:
# delete rows
df = df.drop(['Row_1', 'Row_2'], axis=0)

# remove first two rows
df = df.iloc[2:]

# drop any rows that have missing data
df.dropna(how='any')

Statistics

In [ ]:
# statistics
df.mean() # Series

df.describe()
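A few more common aggregations, sketched assuming the columns are numeric:
In [ ]:
df.mean(axis=1) # row-wise mean, Series
df.std() # column-wise standard deviation, Series
df.corr() # pairwise correlations between columns, DataFrame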

Database Operations

In [360]:
# merge
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo1', 'foo2'], 'rval': [4, 5]})

pd.merge(left, right, on='key')
Out[360]:
    key  lval  rval
0  foo1     1     4
1  foo2     2     5
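merge supports the usual SQL join types through the how parameter (a sketch):
In [ ]:
pd.merge(left, right, on='key', how='outer') # keep keys from both sides; 'left', 'right' and 'inner' also work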
In [373]:
# groupby
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})

df.groupby('A').sum()
Out[373]:
            C         D
A
bar -0.226824 -3.228073
foo  1.838853  0.994510
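Grouping also works with multiple keys, and agg() can apply a different aggregation per column (a sketch on the frame above):
In [ ]:
df.groupby(['A', 'B']).sum() # hierarchical (MultiIndex) result
df.groupby('A').agg({'C': 'mean', 'D': 'sum'}) # one aggregation per column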

Plotting

In [ ]:
# plot with built-in plot in Pandas
df.plot(kind='line',x='Column_1',y='Column_2')

# plot with matplotlib
import matplotlib.pyplot as plt
plt.plot(df['Column_1'], df['Column_2'])
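Other built-in plot kinds follow the same pattern (a sketch; these assume matplotlib is installed):
In [ ]:
df.plot(kind='bar', x='Column_1', y='Column_2') # bar chart
df['Column_2'].plot(kind='hist') # histogram of one column
plt.show()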