import pandas as pd
import numpy as np
# create series from scalar
s = pd.Series(1, index=list(range(4)), dtype='float32')
# create series from list
s = pd.Series([1, 3, 5, np.nan, 6, 8])
# create series from numpy array
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
# create a serie of objects
data = np.array(['g','e','e','k','s'])
s = pd.Series(data)
s = pd.Series(np.arange(6)*10, index = [10, 11, 12, 13, 14, 15])
s.values # numpy array
s.get(key = 11) # access element by index
s.dtype # data type
s[11] # numpy.float32, access element by explicit index, index may not start from zero
s.head(2)
s.loc[11:13] # Series, refering the explicit index
s[2:4] # Series, access elements by implicit index, the third element and the forth element
s.iloc[2:4] # Series, refering the implicit index
s.to_numpy() # convert Series to numpy.ndarray
s.tolist() # convert Series to list
# histogram
s.value_counts()
# Binary Operations
data = pd.Series([5, 2, 3,7], index=['a', 'b', 'c', 'd'])
data1 = pd.Series([1, 6, 4, 9], index=['a', 'b', 'd', 'e'])
dataAdd = data.add(data1) # add the values in two Series by explicity index
dataAdd = data.add(data1, fill_value=0) # add the values in two Series by implicit index
dataSub = data.sub(data1)
dataSub = data.sub(data1, fill_value=0)
# Conversion Operation on Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s.dropna(inplace = True) # dropping null value
s = s.astype('int') # change data type
# create dataframe from dict
data = {
'apples': [3, 2, 0, 1],
'oranges': [0, 3, 7, 2]
}
purchases = pd.DataFrame(data)
purchases
# assign index for each row
purchases = pd.DataFrame(data, index=['June', 'Robert', 'Lily', 'David'])
purchases
# create dataframe from numpy array
data = pd.DataFrame(np.arange(15).reshape(5, 3), index = ['R1', 'R2', 'R3', 'R4', 'R5'], columns = ['A', 'B', 'C'])
data
# create dataframe from mixed data type
data = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list([10, 11, 12, 13]), dtype='float32'), # setup index
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
data
# display
purchases.head(2) # first rows
purchases.tail(2) # last rows
# get info
purchases.info()
# get row index
purchases.index # Index
# get column names
purchases.columns # Index
# get shape
purchases.shape
# get a summary of the distribution of continuous variables
purchases.describe()
# the frequency of all values in a column
purchases['apples'].value_counts()
# data type
purchases.dtypes
# get a specific cell
purchases.iloc[0, 0] # 3, access by implicit index
purchases.iat[0, 0] # getting fast access to a scalar, access by implicit index
purchases.at['June', 'apples'] # access by explicit row index and explicit column index
# get a row
purchases.iloc[0] # first row, Series
purchases.iloc[-1] # last row, Series
purchases.loc['June'] # first row, Series
# iterating over rows
for row in purchases.iterrows():
print(row[0], row[1]) # row index, row data
# get a column
purchases.iloc[:, 0] # first column, Series
purchases.iloc[:, -1] # last column, Series
purchases.loc[:, 'apples'] # first column, Series
purchases['apples'] # first column, Series
purchases.apples # first column, Series
# iterating over columns
columns = list(purchases)
for i in columns:
print(purchases[i])
# Multiple columns and rows
purchases.iloc[0:2, :1] # DataFrame
purchases.loc[['June', 'Lily']] # multiple rows, DataFrame
purchases.loc[['June', 'Lily'], ['apples', 'oranges']] # multiple columns and rows, DataFrame
purchases[['apples', 'oranges']] # multiple columns and rows, DataFrame
# Logical indexing
l = purchases['apples'] > 0 # boolean Series
purchases[l] # selecting rows with boolean Series, DataFrame
purchases[l][['apples', 'oranges']] # selecting rows with boolean Series and column names, DataFrame
purchases.loc[l, ['apples', 'oranges']] # same as above
purchases.loc[l, 'apples'] # selecting rows with boolean Series in first column, Series
purchases.loc[l, ['apples']] # selecting rows with boolean Series in first column, DataFrame
purchases.loc[purchases['apples']> 0, 'apples':'oranges'] # selecting rows with boolean Series and column names, DataFrame
purchases.loc[purchases['apples'].apply(lambda x: x%2 == 0) == True] # lambda function
purchases[purchases> 1] # selecting values from a DataFrame where a boolean condition is met
purchases[purchases['apples'].isin([2, 3])] # Using the isin() method for filtering
purchases.loc[(purchases.apples > 1) & (purchases.oranges > 0), 'apples'] # and
purchases.loc[(purchases.apples > 1) | (purchases.oranges > 0), 'apples'] # or
purchases.loc[~(purchases.apples > 1), 'apples'] # not
# outputs
purchases.to_csv('new_purchases.csv') # save as csv filedat
purchases.to_json('new_purchases.json') # save as json file
#purchases.to_sql('new_purchases', con) # save to database
# inputs
df = pd.read_csv('new_purchases.csv', index_col=0) # read from csv file, use the first column as index
df = pd.read_csv('data.csv', skiprows=[1,2], header=None) # skip rows, no header row
df = pd.read_json('new_purchases.json') # read from json file
df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(index= {'June':'Row_1', 'Robert':'Row_2', 'Lily':'Row_3', 'David':'Row_4'}) # rename index name
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column name
df = df.drop_duplicates() # remove duplicates
# convert to numpy array
array = df.to_numpy() # numpy.ndarray, with index and columns removed
# transpose
df.T # with index and columns switched
# sort by column
df.sort_values(by='Column_1')
df.sort_values(by=['Column_1', 'Column_2'])
# set a scalar
df.iat[1, 0] = 0
df.at['Row_1', 'Column_1'] = 0
# change a column by numpy array
df.loc[:, 'Column_1'] = np.arange(df.shape[0])*10
df.iloc[:, 1] = np.arange(df.shape[0])*10
# make change with logical indexing
df.loc[df['apples'] > 1, 'oranges'] = 100
df.loc[df.apples > 1, ['oranges']] = 100
# make change with where
df['logic'] = np.where(df['oranges'] > 2, 'high', 'low') # where(logicalExpression, trueValue, falseValue)
# filling missing data
df.fillna(value=5)
# get the boolean mask where values are nan
pd.isna(df)
# applying functions to the data
df.apply(np.cumsum) # DataFrame
df.apply(lambda x: x.max() - x.min()) # Series
# change data type
df['C'] = df['C'].astype(int)
# append rows
df = pd.read_csv('data.csv', index_col=0)
# insert numpy array
data = np.arange(6).reshape(3, 2)
data = pd.DataFrame(data, index = [1, 2, 3], columns = df.columns)
df = df.append(data) # attach dataframe
# insert Series as a row
s = pd.Series([10, 100], index = df.columns)
df = df.append(s, ignore_index=True) # index is reset
# insert DataFrame as a row
temp = pd.DataFrame(np.array([10, 20]).reshape(1, -1), columns = df.columns)
df = df.append(temp, ignore_index=True)
df
# merge two dataframes along row axis
data = {
'apples': [3, 2, 0, 1],
'oranges': [0, 3, 7, 2]
}
df2 = pd.DataFrame(data)
frames = pd.concat([df, df2])
frames
df = pd.read_csv('data.csv', index_col=0)
#df = df.set_index('Unnamed: 0') # set a column to be index
df = df.rename(columns = {'apples':'Column_1', 'oranges':'Column_2'}) # rename column name
# insert a column with same value in each index
df['Column_4'] = 50
# insert a column with values based on other columns
df['Percentage'] = (df['Column_2'] / df['Column_4'] ) * 100
df = df.assign(Percentage_2 =lambda x: (x['Column_2'] / x['Column_4'] ) * 100)
# insert numpy array as a single column
df.loc[:, 'Column_3'] = np.arange(df.shape[0])*10
df.insert(1, "Column_3", np.arange(df.shape[0])*10) # insert a column as the second column
# insert Series as a single column
# setting a new column automatically aligns the data by the indexes
s = pd.Series(np.arange(10, 50, 10), index = df.index)
df['Column_3'] = s
df.insert(1, "Column_3", s) # save as the above statement
# insert multiple columns
df = df.assign(Marks = range(df.shape[0]), Total = [50]*df.shape[0])
# concatenate dataframes along column axis
#df2 = df2.set_index(df.index)
frames = pd.concat([df, df2], axis=1)
data = df[['Column_1', 'Column_2']].join(df2)
data
# delete columns
df = df.drop(['Column_1'], axis = 1)
# delete rows
df = df.drop(['Row_1', 'Row_2'], axis=0)
# remove first two rows
df = df.iloc[2:]
# drop any rows that have missing data
df.dropna(how='any')
# statistics
df.mean() # Series
df.describe()
# merge
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo1', 'foo2'], 'rval': [4, 5]})
pd.merge(left, right, on='key')
# groupby
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
df.groupby('A').sum()
# plot with built-in plot in Pandas
df.plot(kind='line',x='Column_1',y='Column_2')
# plot with matplotlib
import matplotlib.pyplot as plt
plt.plot(df['Column_1'], df['Column_2'])