import numpy as np
import pandas as pd
import pyarrow as pa

# Sample frame used by the pandas-related examples: a numeric column that
# contains a NaN, a string column and a boolean column, indexed 'a'..'c'.
df = pd.DataFrame(
    {
        'one': [20, np.nan, 2.5],
        'two': ['january', 'february', 'march'],
        'three': [True, False, True],
    },
    index=['a', 'b', 'c'],
)


# Scalar: pa.scalar() wraps a single Python value in an Arrow scalar.
# The trailing comment on each line is the repr recorded for that call; the
# Arrow type is inferred from the value unless `type=` is passed explicitly.
pa.scalar(1) # <pyarrow.Int64Scalar: 1> (type inferred from the int)
pa.scalar(1, type=pa.int16()) # <pyarrow.Int16Scalar: 1> (type forced with `type=`)
pa.scalar("string") # <pyarrow.StringScalar: 'string'>
pa.scalar([1, 2]) # <pyarrow.ListScalar: [1, 2]> (a Python list becomes one list-valued scalar)
pa.scalar([1, 2], type=pa.list_(pa.int16())) # <pyarrow.ListScalar: [1, 2]> (element type given explicitly)
pa.scalar(df['one']) # <pyarrow.ListScalar: [20.0, None, 2.5]> (pandas Series input; the NaN is shown as None in the recorded repr)

<pyarrow.ListScalar: [20.0, None, 2.5]>


# Array: pa.array() builds an Arrow array from a Python list or a pandas Series.
# The trailing comments record the repr header seen for each call; the hex
# address is specific to the recorded session.
pa.array([1, 2, None, 3]) # <pyarrow.lib.Int64Array object at 0x7fc00f7fc520> (None is accepted alongside the ints)
pa.array(df['one']) # <pyarrow.lib.DoubleArray object at 0x7fc00f7fc830> (recorded values: 20, null, 2.5)

<pyarrow.lib.DoubleArray object at 0x7fc00f7fcad0>
[
  20,
  null,
  2.5
]


# Record Batch: a group of named Arrow arrays.

# Option 1: convert an existing pandas DataFrame directly.
pa.RecordBatch.from_pandas(df)

# Option 2: assemble the batch from individual Arrow arrays. Each entry of
# `data` is matched by position with the column name at the same index.
data = [
    pa.array(column_values)
    for column_values in (
        [1, 2, 3, 4],
        ['foo', 'bar', 'baz', None],
        [True, None, False, True],
    )
]
batch = pa.RecordBatch.from_arrays(data, names=['f0', 'f1', 'f2'])


# Table: can be created from a pandas DataFrame or from a list of record batches.
pa.Table.from_pandas(df) # result is not kept; shown only to illustrate the call
table = pa.Table.from_batches([batch]) # single-batch table, reused by the later sections


# Access Scalar: .as_py() converts an Arrow scalar back into a plain Python object.
n = pa.scalar(1)
n.as_py() # the value as a Python int
pa.scalar("string").as_py() # 'string'
pa.scalar([1, 2]).as_py() # [1, 2] (a Python list; this is the value echoed after the cell)

[1, 2]


# Access Array: converting an Arrow array into Python, NumPy and pandas objects.
# (dir(a) lists the remaining methods available on an array.)
a = pa.array([1, 2, None, 3])  # contains a null
b = pa.array([2, 4, 6, 8])  # no nulls
a.tolist()  # [1, 2, None, 3]
b.to_numpy()  # numpy.ndarray
# to_numpy() defaults to zero_copy_only=True, and an array holding nulls cannot
# be converted without copying, so a bare a.to_numpy() raises. Allowing the
# copy makes the conversion work for `a` as well.
a.to_numpy(zero_copy_only=False)  # numpy.ndarray; the null is converted to nan
a.to_pandas()  # pandas.core.series.Series
a.to_pylist()  # list
a.to_string()  # str

'[\n  1,\n  2,\n  null,\n  3\n]'


# Access Record Batch: column lookup by name, basic shape, and pandas conversion.
batch['f0'] # <pyarrow.lib.Int64Array object at 0x7fc00f836360>
batch.num_columns # 3
batch.num_rows # 4
batch.to_pandas()['f0'] # pandas.core.series.Series (the int64 Series echoed after the cell)

0    1
1    2
2    3
3    4
Name: f0, dtype: int64


# Access Table: inspecting, selecting, renaming and dropping columns.

# Get column names
table.column_names # ['f0', 'f1', 'f2']

# Select a column, either by name or by position
chunk = table['f0'] # pyarrow.lib.ChunkedArray (lookup by column name)
chunk = table[0] # pyarrow.lib.ChunkedArray (lookup by column position)
table['f0'].to_pandas() # pandas.core.series.Series

# Select multiple columns, by positions or by names
table.select([0, 1]) # pyarrow.Table
table.select(['f0', 'f1']) # pyarrow.Table

# Rename columns
table2 = table.rename_columns(['n0', 'n1', 'n2']) # result is a new table bound to table2

# Remove column
table3 = table.remove_column(0) # result is a new table bound to table3; the argument is the column position


# Access ChunkedArray: a table column is a ChunkedArray; index, slice or convert it.
chunk = table['f0'] # pyarrow.lib.ChunkedArray
type(chunk[0]) # pyarrow.lib.Int64Scalar (indexing one element gives a scalar)
chunk[1:3] # <pyarrow.lib.ChunkedArray object at 0x7fc00cc0e6b0> (slicing gives a ChunkedArray)
chunk.take([1, 2]) # <pyarrow.lib.ChunkedArray object at 0x7fc00cc1a290> (selection by an explicit list of indices)
type(chunk.to_pylist()) # list (this is the value echoed after the cell)

list


# IO: write a Table to a Parquet file and read it back.
# write parquet
import pyarrow.parquet as pq
pq.write_table(table, 'example.parquet') # relative path, so the file location depends on the working directory

# read parquet
table2 = pq.read_table('example.parquet') # NOTE: rebinds table2, replacing the renamed table assigned earlier
table2 # echo the table read back; the recorded schema is f0: int64, f1: string, f2: bool

pyarrow.Table
f0: int64
f1: string
f2: bool

PyArrow

Installation

Why PyArrow

Data Type

Access

IO

Reference