import numpy as np
import pandas as pd
import pyarrow as pa
# Sample DataFrame reused by the examples below: a float column containing a
# NaN, a string column and a boolean column, indexed by the labels a, b, c.
df = pd.DataFrame(
    {
        'one': [20, np.nan, 2.5],
        'two': ['january', 'february', 'march'],
        'three': [True, False, True],
    },
    index=['a', 'b', 'c'],
)
# Scalar
# pa.scalar() wraps a single Python value in an Arrow scalar. The trailing
# comments show the repr recorded for each call; passing `type` overrides the
# type that would otherwise be picked (Int64 -> Int16 below).
pa.scalar(1)  # <pyarrow.Int64Scalar: 1>
pa.scalar(1, type=pa.int16())  # <pyarrow.Int16Scalar: 1>
pa.scalar("string")  # <pyarrow.StringScalar: 'string'>
pa.scalar([1, 2])  # <pyarrow.ListScalar: [1, 2]>
pa.scalar([1, 2], type=pa.list_(pa.int16()))  # <pyarrow.ListScalar: [1, 2]>
pa.scalar(df['one'])  # from a pandas Series
# Recorded output of the last expression (was pasted as code, which is a
# syntax error in a plain .py file):
# <pyarrow.ListScalar: [20.0, None, 2.5]>
# Array
# pa.array() builds an Arrow array from a Python list or a pandas Series.
pa.array([1, 2, None, 3])  # <pyarrow.lib.Int64Array object at 0x7fc00f7fc520>
pa.array(df['one'])  # <pyarrow.lib.DoubleArray object at 0x7fc00f7fc830>
# Recorded output of the last expression (kept as a comment so the file
# remains valid Python):
# <pyarrow.lib.DoubleArray object at 0x7fc00f7fcad0> [ 20, null, 2.5 ]
# Record Batch
# Option 1: convert a pandas DataFrame directly (the result is not kept here).
pa.RecordBatch.from_pandas(df)
# Option 2: assemble a batch from a list of arrays plus one name per column.
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True]),
]
batch = pa.RecordBatch.from_arrays(data, names=['f0', 'f1', 'f2'])
# Table
# A Table can likewise come from pandas (result not kept here) ...
pa.Table.from_pandas(df)
# ... or from a list of record batches; `table` is reused further down.
table = pa.Table.from_batches([batch])
# Access Scalar
# .as_py() converts an Arrow scalar back into the matching Python object.
n = pa.scalar(1)
n.as_py()  # returns the value as a Python int
pa.scalar("string").as_py()  # 'string'
pa.scalar([1, 2]).as_py()  # [1, 2]
# Recorded output of the last expression (was a stray bare expression line):
# [1, 2]
# Access Array
# `a` contains a null, `b` does not; the difference matters for to_numpy().
a = pa.array([1, 2, None, 3])
b = pa.array([2, 4, 6, 8])
a.tolist()  # [1, 2, None, 3]
b.to_numpy()  # numpy.ndarray
# NOTE: a.to_numpy() is left disabled because `a` holds a null and the
# default zero-copy conversion rejects that; per the pyarrow docs,
# a.to_numpy(zero_copy_only=False) allows the copy instead.
a.to_pandas()  # pandas.core.series.Series
a.to_pylist()  # list
a.to_string()  # str
# dir(a)  # uncomment to list every attribute available on the array
# Recorded output of a.to_string() (was a stray string-literal line):
# '[\n 1,\n 2,\n null,\n 3\n]'
# Access Record Batch
# Columns of the batch built above are addressed by name; the batch also
# exposes its dimensions and converts to a pandas DataFrame.
batch['f0']  # <pyarrow.lib.Int64Array object at 0x7fc00f836360>
batch.num_columns  # 3
batch.num_rows  # 4
batch.to_pandas()['f0']  # pandas.core.series.Series
# Recorded output of the last expression (was pasted as code, which is a
# syntax error in a plain .py file):
# 0    1
# 1    2
# 2    3
# 3    4
# Name: f0, dtype: int64
# Access Table

# Column names, in order.
table.column_names  # ['f0', 'f1', 'f2']

# A single column can be picked by name or by position; both give a
# pyarrow.lib.ChunkedArray.
chunk = table['f0']
chunk = table[0]
table['f0'].to_pandas()  # pandas.core.series.Series

# Several columns at once, by position or by name; the result is a pyarrow.Table.
table.select([0, 1])
table.select(['f0', 'f1'])

# Renaming creates a new table, bound here to `table2`.
table2 = table.rename_columns(['n0', 'n1', 'n2'])

# Removing a column (by position) also creates a new table, bound to `table3`.
table3 = table.remove_column(0)
# Access ChunkedArray
# A table column is a ChunkedArray: indexing gives a scalar, while slicing
# and take() give another ChunkedArray.
chunk = table['f0']  # pyarrow.lib.ChunkedArray
type(chunk[0])  # pyarrow.lib.Int64Scalar
chunk[1:3]  # <pyarrow.lib.ChunkedArray object at 0x7fc00cc0e6b0>
chunk.take([1, 2])  # <pyarrow.lib.ChunkedArray object at 0x7fc00cc1a290>
type(chunk.to_pylist())  # list
# Recorded output of the last expression (was a stray bare `list` line):
# list
# Table: Parquet I/O
# Write `table` to a Parquet file in the current working directory, then read
# it back. Note that `table2` is rebound here to the table loaded from disk.
import pyarrow.parquet as pq

# write parquet
pq.write_table(table, 'example.parquet')
# read parquet
table2 = pq.read_table('example.parquet')
table2  # in a notebook this displays the table; in a script it is a no-op
# Recorded output of the last expression (was pasted as code, which is a
# syntax error in a plain .py file):
# pyarrow.Table
# f0: int64
# f1: string
# f2: bool