import vaex
df = vaex.example()
df.head()
# | id | x | y | z | vx | vy | vz | E | L | Lz | FeH |
---|---|---|---|---|---|---|---|---|---|---|---|
<i style='opacity: 0.6'>0</i> | 0 | 1.23187 | -0.396929 | -0.598058 | 301.155 | 174.059 | 27.4275 | -149431 | 407.389 | 333.956 | -1.00539 |
<i style='opacity: 0.6'>1</i> | 23 | -0.163701 | 3.65422 | -0.254906 | -195 | 170.472 | 142.53 | -124248 | 890.241 | 684.668 | -1.70867 |
<i style='opacity: 0.6'>2</i> | 32 | -2.12026 | 3.32605 | 1.70784 | -48.6342 | 171.647 | -2.07944 | -138501 | 372.241 | -202.176 | -1.83361 |
<i style='opacity: 0.6'>3</i> | 8 | 4.71559 | 4.58525 | 2.25154 | -232.421 | -294.851 | 62.8587 | -60037 | 1297.63 | -324.688 | -1.47869 |
<i style='opacity: 0.6'>4</i> | 16 | 7.21719 | 11.9947 | -1.06456 | -1.68917 | 181.329 | -11.3336 | -83206.8 | 1332.8 | 1328.95 | -1.85705 |
<i style='opacity: 0.6'>5</i> | 16 | -7.78437 | 5.98977 | -0.682695 | 86.7009 | -238.778 | -2.31309 | -86497.6 | 1353.25 | 1339.42 | -1.91944 |
<i style='opacity: 0.6'>6</i> | 12 | 8.08373 | -3.27348 | 5.54687 | -57.4544 | 120.117 | 5.37438 | -101867 | 1100.8 | 782.915 | -1.93517 |
<i style='opacity: 0.6'>7</i> | 26 | -3.55719 | 5.41363 | 0.0917156 | -67.0511 | -145.933 | 39.6374 | -127682 | 921.008 | 882.101 | -1.79423 |
<i style='opacity: 0.6'>8</i> | 25 | 3.9848 | 5.40691 | 2.57724 | -38.7449 | -152.407 | -92.9073 | -113632 | 493.316 | -397.824 | -1.18076 |
<i style='opacity: 0.6'>9</i> | 8 | -20.8139 | -3.29468 | 13.4866 | 99.4067 | 28.6749 | -115.079 | -55825.3 | 1088.46 | -269.324 | -1.28892 |
type(df) # vaex.hdf5.dataset.Hdf5MemoryMapped
type(df.x) # Expression
type(df.x.values) # numpy.ndarray, in-memory representation of an expression
type(df['x', 'y']) # vaex.dataframe.DataFrameArrays
type(df['x', 'y'].values) # # numpy.ndarray, in-memory representation of a DataFrame
numpy.ndarray
# from numpy
import numpy as np
x = np.arange(5)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df.head()
# | x | y |
---|---|---|
<i style='opacity: 0.6'>0</i> | 0 | 0 |
<i style='opacity: 0.6'>1</i> | 1 | 1 |
<i style='opacity: 0.6'>2</i> | 2 | 4 |
<i style='opacity: 0.6'>3</i> | 3 | 9 |
<i style='opacity: 0.6'>4</i> | 4 | 16 |
# from Pandas DataFrame
import pandas as pd
data = {
'apples': [3, 2, 0, 1],
'oranges': [0, 3, 7, 2]
}
purchases = pd.DataFrame(data)
df = vaex.from_pandas(purchases)
df.head()
# | apples | oranges |
---|---|---|
<i style='opacity: 0.6'>0</i> | 3 | 0 |
<i style='opacity: 0.6'>1</i> | 2 | 3 |
<i style='opacity: 0.6'>2</i> | 0 | 7 |
<i style='opacity: 0.6'>3</i> | 1 | 2 |
# from dict
df = vaex.from_dict(data)
df.head()
# | apples | oranges |
---|---|---|
<i style='opacity: 0.6'>0</i> | 3 | 0 |
<i style='opacity: 0.6'>1</i> | 2 | 3 |
<i style='opacity: 0.6'>2</i> | 0 | 7 |
<i style='opacity: 0.6'>3</i> | 1 | 2 |
# from array
animal = ['dog', 'dog', 'cat', 'guinea pig', 'guinea pig', 'dog']
age = [2, 1, 5, 1, 3, 7]
cuteness = [9, 10, 5, 8, 4, 8]
df = vaex.from_arrays(animal=animal, age=age, cuteness=cuteness)
df
# | animal | age | cuteness |
---|---|---|---|
<i style='opacity: 0.6'>0</i> | dog | 2 | 9 |
<i style='opacity: 0.6'>1</i> | dog | 1 | 10 |
<i style='opacity: 0.6'>2</i> | cat | 5 | 5 |
<i style='opacity: 0.6'>3</i> | guinea pig | 1 | 8 |
<i style='opacity: 0.6'>4</i> | guinea pig | 3 | 4 |
<i style='opacity: 0.6'>5</i> | dog | 7 | 8 |
# input
df = vaex.from_csv('vaex.csv')
df = vaex.open('vaex.hdf5')
# output
df.export_csv('vaex2.csv')
df.export_hdf5('vaex2.hdf5')
df = vaex.example()
# row selection
df_negative = df[df.x < 0] # select by attribute values
rows = df[:5] # access by row indices
df['x'] # Expression
df[['x']] # DataFrame
# column selection
columns = df['x', 'y', 'z'] # access by column names
# or df[['x', 'y', 'z']]
columns = df[:, :5] # access by column indices
selection = df[:5, :5] # select rows and columns by indices
# Joining
# requiring about 8GB for a billion row 10^9 dataset
a = np.array(['a', 'b', 'c'])
x = np.arange(1,4)
df1 = vaex.from_arrays(a=a, x=x)
b = np.array(['a', 'b', 'd'])
y = x**2
df2 = vaex.from_arrays(b=b, y=y)
df1.join(df2, left_on='a', right_on='b', how='inner')
# | a | x | b | y |
---|---|---|---|---|
<i style='opacity: 0.6'>0</i> | a | 1 | a | 1 |
<i style='opacity: 0.6'>1</i> | b | 2 | b | 4 |
# Group-by and agg
animal = ['dog', 'dog', 'cat', 'guinea pig', 'guinea pig', 'dog']
age = [2, 1, 5, 1, 3, 7]
cuteness = [9, 10, 5, 8, 4, 8]
df_pets = vaex.from_arrays(animal=animal, age=age, cuteness=cuteness)
df_pets.groupby(by='animal').agg({'age_mean': vaex.agg.mean('age'), 'cuteness_mean': vaex.agg.mean('cuteness'), 'cuteness_std': vaex.agg.std('cuteness')})
# | animal | age_mean | cuteness_mean | cuteness_std |
---|---|---|---|---|
<i style='opacity: 0.6'>0</i> | dog | 3.33333 | 9 | 0.816497 |
<i style='opacity: 0.6'>1</i> | cat | 5 | 5 | 0 |
<i style='opacity: 0.6'>2</i> | guinea pig | 2 | 6 | 2 |
# number counts in 64 bins distributed between x = -10, and x = 10
counts_x = df.count(binby=df.x, limits=[-10, 10], shape=64)
import matplotlib.pylab as plt
plt.plot(np.linspace(-10, 10, 64), counts_x)
[<matplotlib.lines.Line2D at 0x7f887e1bff10>]
xycounts = df.count(binby=[df.x, df.y], limits=[[-10, 10], [-10, 20]], shape=(64, 128))
plt.imshow(xycounts.T, origin='lower', extent=[-10, 10, -10, 20])
<matplotlib.image.AxesImage at 0x7f887e34e290>
v = np.sqrt(df.vx**2 + df.vy**2 + df.vz**2)
xy_mean_v = df.mean(v, binby=[df.x, df.y], limits=[[-10, 10], [-10, 20]], shape=(64, 128))
plt.imshow(xy_mean_v.T, origin='lower', extent=[-10, 10, -10, 20])
<matplotlib.image.AxesImage at 0x7f887e3ee9d0>
df = vaex.example()
#plot statistic for that bin
df.plot1d(df.x, what='mean(E)', limits='99.7%');
#df.plot1d(df.x, what=vaex.stat.mean('E'), limits='99.7%');
# 2D plot
df.plot(df.x, df.y, what=vaex.stat.mean(df.E)**2, limits='99.7%');
df.plot('x', 'y', what=['mean(x)', 'correlation(vx, vy)'])
<matplotlib.image.AxesImage at 0x7f8862e547d0>
# Selections for plotting
df.plot(df.x, df.y, what=np.log(vaex.stat.count()+1), limits='99.7%', selection=[None, df.x < df.y, df.x < -10]);
# plot with vaex
df.plot(df.x, df.y)
<matplotlib.image.AxesImage at 0x7f882d65a050>
# plot with matplotlib
fig, ax = plt.subplots(figsize=(14,7))
x = df.x.evaluate() # x = df.x.values, numpy.ndarray
y = df.y.evaluate() # y = df.y.values, numpy.ndarray
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x7f8862d76110>
# Pandas DataFrames
pandas_df = df.to_pandas_df() # pandas.core.frame.DataFrame
# sklearn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df[['x']], df[['y']])
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(df[['x']], df[['y']])
/Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning) /Users/lchen/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None, oob_score=False, random_state=0, verbose=0, warm_start=False)
# Zeek
# Vaex has no dtype=dtypes
# Vaex does not support 'category'
# Vaex does not have to_datetime
# Vaex does not have to_timedelta
import vaex
from zat.log_to_dataframe import LogToDataFrame
from zat import zeek_log_reader
from datetime import timedelta, datetime
def convert_to_timedelta(float_num):
np.timedelta64(timedelta(seconds=float_num))
def convert_to_datetime(float_num):
return np.datetime64(datetime.utcfromtimestamp(float_num))
def vaex_read_zeek_log(filename):
reader = LogToDataFrame() # create LogToDataFrame object
_zeek_reader = zeek_log_reader.ZeekLogReader(filename)
_, field_names, field_types, _ = _zeek_reader._parse_zeek_header(filename) # read field names and types
temp2 = vaex.from_csv(filename, names=field_names, comment="#", na_values='-', sep='\t')
#pandas_types = reader.pd_column_types(field_names, field_types)
for name, zeek_type in zip(field_names, field_types):
if zeek_type == 'time':
temp2[name] = temp2[name].apply(convert_to_datetime)
if zeek_type == 'interval':
temp2[name] = temp2[name].apply(convert_to_timedelta)
return temp2
conn_df = vaex_read_zeek_log('conn.log')
conn_df.head()
# | ts | uid | id.orig_h | id.orig_p | id.resp_h | id.resp_p | proto | service | duration | orig_bytes | resp_bytes | conn_state | local_orig | missed_bytes | history | orig_pkts | orig_ip_bytes | resp_pkts | resp_ip_bytes | tunnel_parents |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<i style='opacity: 0.6'>0</i> | 2013-09-15 23:44:27.706265 | CoyZrY2g74UvMMgp4a | 192.168.33.10 | 1032 | 54.245.228.191 | 80 | tcp | http | None | 601 | 38393 | RSTO | nan | 0 | ShADadR | 22 | 1489 | 31 | 39641 | (empty) |
<i style='opacity: 0.6'>1</i> | 2013-09-15 23:44:28.093470 | CaJxA82D4HGxRzEgjc | 192.168.33.10 | 1039 | 54.245.228.191 | 80 | tcp | http | None | 311 | 14886 | RSTO | nan | 0 | ShADadR | 11 | 759 | 13 | 15414 | (empty) |
<i style='opacity: 0.6'>2</i> | 2013-09-15 23:44:28.082700 | CLp0PF2EcdkpGemkW5 | 192.168.33.10 | 1036 | 54.230.86.87 | 443 | tcp | ssl | None | 612 | 6801 | SF | nan | 0 | ShADadFf | 11 | 1060 | 11 | 7292 | (empty) |
<i style='opacity: 0.6'>3</i> | 2013-09-15 23:44:28.082016 | CvNL1g1RGVeCAszIRj | 192.168.33.10 | 1035 | 54.230.86.87 | 443 | tcp | ssl | None | 597 | 9842 | SF | nan | 0 | ShADadFf | 14 | 1165 | 13 | 10413 | (empty) |
<i style='opacity: 0.6'>4</i> | 2013-09-15 23:44:28.088450 | CqMpUScSnQV692X4e | 192.168.33.10 | 1037 | 54.230.86.87 | 443 | tcp | ssl | None | 595 | 7659 | SF | nan | 0 | ShADadFf | 12 | 1083 | 11 | 8107 | (empty) |
<i style='opacity: 0.6'>5</i> | 2013-09-15 23:44:28.081161 | CYpAce4RL5CsL2Mih4 | 192.168.33.10 | 1034 | 54.230.86.87 | 443 | tcp | ssl | None | 575 | 14252 | SF | nan | 0 | ShADadFf | 15 | 1183 | 16 | 14943 | (empty) |
<i style='opacity: 0.6'>6</i> | 2013-09-15 23:44:28.091612 | CHOCCV3ZKH82IRmP8 | 192.168.33.10 | 1038 | 54.230.86.87 | 443 | tcp | ssl | None | 603 | 10231 | SF | nan | 0 | ShADadFf | 13 | 1131 | 14 | 10842 | (empty) |
<i style='opacity: 0.6'>7</i> | 2013-09-15 23:44:28.080461 | CnKO90rg6a1qQ9Fc | 192.168.33.10 | 1033 | 54.230.86.87 | 443 | tcp | ssl | None | 567 | 43245 | SF | nan | 0 | ShADadFf | 28 | 1695 | 36 | 44736 | (empty) |
<i style='opacity: 0.6'>8</i> | 2013-09-15 23:44:28.628872 | CxEMBn1vFoG4tchRBi | 192.168.33.10 | 1049 | 54.230.86.87 | 443 | tcp | ssl | None | 636 | 12518 | SF | nan | 0 | ShADadFf | 13 | 1164 | 14 | 13086 | (empty) |
<i style='opacity: 0.6'>9</i> | 2013-09-15 23:44:28.636041 | CSdxbb38oFEe3p1swa | 192.168.33.10 | 1050 | 54.230.86.87 | 443 | tcp | ssl | None | 637 | 9715 | SF | nan | 0 | ShADadFf | 12 | 1125 | 12 | 10203 | (empty) |
df = vaex.example()
limits = [-10, 10]
delayed_count = df.count(df.E, binby=df.x, limits=limits, shape=4, delay=True)
delayed_sum = df.sum(df.E, binby=df.x, limits=limits, shape=4, delay=True)
@vaex.delayed
def calculate_mean(sums, counts):
print('calculating mean')
return sums/counts
delayed_mean = calculate_mean(delayed_sum, delayed_count)
# execuate computation
df.execute()
# get results
counts = delayed_count.get()
sums = delayed_sum.get()
means = delayed_mean.get()
means
calculating mean
array([ -94323.68051598, -118749.23850834, -119119.46292653, -95021.66183457])
# accessor function
@vaex.register_dataframe_accessor('scale', override=True)
class ScalingOps(object):
def __init__(self, df):
self.df = df
def add(self, a):
df = self.df.copy()
for col in df.get_column_names(strings=False):
if df[col].dtype:
df[col] = df[col] * a
return df
df.scale.add(1)
# apply
def add_x(value):
return value*10
df['x*10'] = df.apply(add_x, df.x)