import vaex
df = vaex.example()


df.head()


type(df) # vaex.hdf5.dataset.Hdf5MemoryMapped
type(df.x) # Expression
type(df.x.values) # numpy.ndarray, in-memory representation of an expression
type(df['x', 'y']) # vaex.dataframe.DataFrameArrays
type(df['x', 'y'].values) # # numpy.ndarray, in-memory representation of a DataFrame

numpy.ndarray


# from numpy
import numpy as np
x = np.arange(5)
y = x**2
df = vaex.from_arrays(x=x, y=y)
df.head()


# from Pandas DataFrame
import pandas as pd
data = {
    'apples': [3, 2, 0, 1], 
    'oranges': [0, 3, 7, 2]
}

purchases = pd.DataFrame(data)
df = vaex.from_pandas(purchases)
df.head()


# from dict
df = vaex.from_dict(data)
df.head()


# from array
animal = ['dog', 'dog', 'cat', 'guinea pig', 'guinea pig', 'dog']
age = [2, 1, 5, 1, 3, 7]
cuteness = [9, 10, 5, 8, 4, 8]
df = vaex.from_arrays(animal=animal, age=age, cuteness=cuteness)
df


# input
df = vaex.from_csv('vaex.csv')
df = vaex.open('vaex.hdf5')

# output
df.export_csv('vaex2.csv')
df.export_hdf5('vaex2.hdf5')


df = vaex.example()

# row selection
df_negative = df[df.x < 0] # select by attribute values
rows = df[:5] # access by row indices


df['x'] # Expression
df[['x']] # DataFrame


# column selection
columns = df['x', 'y', 'z'] # access by column names
# or df[['x', 'y', 'z']]
columns = df[:, :5] # access by column indices


selection = df[:5, :5] # select rows and columns by indices


# Joining
# requiring about 8GB for a billion row 10^9 dataset
a = np.array(['a', 'b', 'c'])
x = np.arange(1,4)
df1 = vaex.from_arrays(a=a, x=x)

b = np.array(['a', 'b', 'd'])
y = x**2
df2 = vaex.from_arrays(b=b, y=y)

df1.join(df2, left_on='a', right_on='b', how='inner')


# Group-by and agg
animal = ['dog', 'dog', 'cat', 'guinea pig', 'guinea pig', 'dog']
age = [2, 1, 5, 1, 3, 7]
cuteness = [9, 10, 5, 8, 4, 8]
df_pets = vaex.from_arrays(animal=animal, age=age, cuteness=cuteness)

df_pets.groupby(by='animal').agg({'age_mean': vaex.agg.mean('age'), 'cuteness_mean': vaex.agg.mean('cuteness'), 'cuteness_std': vaex.agg.std('cuteness')})


# number counts in 64 bins distributed between x = -10, and x = 10
counts_x = df.count(binby=df.x, limits=[-10, 10], shape=64)


import matplotlib.pylab as plt
plt.plot(np.linspace(-10, 10, 64), counts_x)

[<matplotlib.lines.Line2D at 0x7f887e1bff10>]


xycounts = df.count(binby=[df.x, df.y], limits=[[-10, 10], [-10, 20]], shape=(64, 128))
plt.imshow(xycounts.T, origin='lower', extent=[-10, 10, -10, 20])

<matplotlib.image.AxesImage at 0x7f887e34e290>


v = np.sqrt(df.vx**2 + df.vy**2 + df.vz**2)
xy_mean_v = df.mean(v, binby=[df.x, df.y], limits=[[-10, 10], [-10, 20]], shape=(64, 128))
plt.imshow(xy_mean_v.T, origin='lower', extent=[-10, 10, -10, 20])

<matplotlib.image.AxesImage at 0x7f887e3ee9d0>


df = vaex.example()


#plot statistic for that bin
df.plot1d(df.x, what='mean(E)', limits='99.7%');
#df.plot1d(df.x, what=vaex.stat.mean('E'), limits='99.7%');


# 2D plot
df.plot(df.x, df.y, what=vaex.stat.mean(df.E)**2, limits='99.7%');


df.plot('x', 'y', what=['mean(x)', 'correlation(vx, vy)'])

<matplotlib.image.AxesImage at 0x7f8862e547d0>


# Selections for plotting
df.plot(df.x, df.y, what=np.log(vaex.stat.count()+1), limits='99.7%', selection=[None, df.x < df.y, df.x < -10]);


# plot with vaex
df.plot(df.x, df.y)

<matplotlib.image.AxesImage at 0x7f882d65a050>


# plot with matplotlib
fig, ax = plt.subplots(figsize=(14,7))
x = df.x.evaluate() # x = df.x.values, numpy.ndarray
y = df.y.evaluate() # y = df.y.values, numpy.ndarray
plt.scatter(x, y)

<matplotlib.collections.PathCollection at 0x7f8862d76110>


# Pandas DataFrames
pandas_df = df.to_pandas_df() # pandas.core.frame.DataFrame


# sklearn
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(df[['x']], df[['y']])

from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(df[['x']], df[['y']])

/Users/lchen/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)
/Users/lchen/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)


# Zeek
# Vaex has no dtype=dtypes
# Vaex does not support 'category'
# Vaex does not have to_datetime
# Vaex does not have to_timedelta
import vaex
from zat.log_to_dataframe import LogToDataFrame
from zat import zeek_log_reader
from datetime import timedelta, datetime

def convert_to_timedelta(float_num):
    np.timedelta64(timedelta(seconds=float_num))

def convert_to_datetime(float_num):
    return np.datetime64(datetime.utcfromtimestamp(float_num))

def vaex_read_zeek_log(filename):

    reader = LogToDataFrame() # create LogToDataFrame object

    _zeek_reader = zeek_log_reader.ZeekLogReader(filename)
    _, field_names, field_types, _ = _zeek_reader._parse_zeek_header(filename) # read field names and types
    
    temp2 = vaex.from_csv(filename, names=field_names, comment="#", na_values='-', sep='\t')

    #pandas_types = reader.pd_column_types(field_names, field_types)
    for name, zeek_type in zip(field_names, field_types):
        if zeek_type == 'time':
            temp2[name]  = temp2[name].apply(convert_to_datetime)
        if zeek_type == 'interval':
            temp2[name] = temp2[name].apply(convert_to_timedelta)
    
    return temp2


conn_df = vaex_read_zeek_log('conn.log')
conn_df.head()


df = vaex.example()
limits = [-10, 10]
delayed_count = df.count(df.E, binby=df.x, limits=limits, shape=4, delay=True)
delayed_sum = df.sum(df.E, binby=df.x, limits=limits, shape=4, delay=True)

@vaex.delayed
def calculate_mean(sums, counts):
    print('calculating mean')
    return sums/counts

delayed_mean = calculate_mean(delayed_sum, delayed_count)

# execuate computation
df.execute()

# get results
counts = delayed_count.get()
sums = delayed_sum.get()
means = delayed_mean.get()
means

calculating mean

array([ -94323.68051598, -118749.23850834, -119119.46292653,
        -95021.66183457])


# accessor function
@vaex.register_dataframe_accessor('scale', override=True)
class ScalingOps(object):
    def __init__(self, df):
        self.df = df

    def add(self, a):
        df = self.df.copy()
        for col in df.get_column_names(strings=False):
            if df[col].dtype:
                df[col] = df[col] * a
        return df
    
df.scale.add(1)


# apply
def add_x(value):
    return value*10
    
df['x*10'] = df.apply(add_x, df.x)

#	id	x	y	z	vx	vy	vz	E	L	Lz	FeH
<i style='opacity: 0.6'>0</i>	0	1.23187	-0.396929	-0.598058	301.155	174.059	27.4275	-149431	407.389	333.956	-1.00539
<i style='opacity: 0.6'>1</i>	23	-0.163701	3.65422	-0.254906	-195	170.472	142.53	-124248	890.241	684.668	-1.70867
<i style='opacity: 0.6'>2</i>	32	-2.12026	3.32605	1.70784	-48.6342	171.647	-2.07944	-138501	372.241	-202.176	-1.83361
<i style='opacity: 0.6'>3</i>	8	4.71559	4.58525	2.25154	-232.421	-294.851	62.8587	-60037	1297.63	-324.688	-1.47869
<i style='opacity: 0.6'>4</i>	16	7.21719	11.9947	-1.06456	-1.68917	181.329	-11.3336	-83206.8	1332.8	1328.95	-1.85705
<i style='opacity: 0.6'>5</i>	16	-7.78437	5.98977	-0.682695	86.7009	-238.778	-2.31309	-86497.6	1353.25	1339.42	-1.91944
<i style='opacity: 0.6'>6</i>	12	8.08373	-3.27348	5.54687	-57.4544	120.117	5.37438	-101867	1100.8	782.915	-1.93517
<i style='opacity: 0.6'>7</i>	26	-3.55719	5.41363	0.0917156	-67.0511	-145.933	39.6374	-127682	921.008	882.101	-1.79423
<i style='opacity: 0.6'>8</i>	25	3.9848	5.40691	2.57724	-38.7449	-152.407	-92.9073	-113632	493.316	-397.824	-1.18076
<i style='opacity: 0.6'>9</i>	8	-20.8139	-3.29468	13.4866	99.4067	28.6749	-115.079	-55825.3	1088.46	-269.324	-1.28892

#	x	y
<i style='opacity: 0.6'>0</i>	0	0
<i style='opacity: 0.6'>1</i>	1	1
<i style='opacity: 0.6'>2</i>	2	4
<i style='opacity: 0.6'>3</i>	3	9
<i style='opacity: 0.6'>4</i>	4	16

#	apples	oranges
<i style='opacity: 0.6'>0</i>	3	0
<i style='opacity: 0.6'>1</i>	2	3
<i style='opacity: 0.6'>2</i>	0	7
<i style='opacity: 0.6'>3</i>	1	2

#	apples	oranges
<i style='opacity: 0.6'>0</i>	3	0
<i style='opacity: 0.6'>1</i>	2	3
<i style='opacity: 0.6'>2</i>	0	7
<i style='opacity: 0.6'>3</i>	1	2

#	ts	uid	id.orig_h	id.orig_p	id.resp_h	id.resp_p	proto	service	duration	orig_bytes	resp_bytes	conn_state	local_orig	history	orig_pkts	orig_ip_bytes	resp_pkts	resp_ip_bytes	tunnel_parents
<i style='opacity: 0.6'>0</i>	2013-09-15 23:44:27.706265	CoyZrY2g74UvMMgp4a	192.168.33.10	1032	54.245.228.191	80	tcp	http	None	601	38393	RSTO	nan	ShADadR	22	1489	31	39641	(empty)
<i style='opacity: 0.6'>1</i>	2013-09-15 23:44:28.093470	CaJxA82D4HGxRzEgjc	192.168.33.10	1039	54.245.228.191	80	tcp	http	None	311	14886	RSTO	nan	ShADadR	11	759	13	15414	(empty)
<i style='opacity: 0.6'>2</i>	2013-09-15 23:44:28.082700	CLp0PF2EcdkpGemkW5	192.168.33.10	1036	54.230.86.87	443	tcp	ssl	None	612	6801	SF	nan	ShADadFf	11	1060	11	7292	(empty)
<i style='opacity: 0.6'>3</i>	2013-09-15 23:44:28.082016	CvNL1g1RGVeCAszIRj	192.168.33.10	1035	54.230.86.87	443	tcp	ssl	None	597	9842	SF	nan	ShADadFf	14	1165	13	10413	(empty)
<i style='opacity: 0.6'>4</i>	2013-09-15 23:44:28.088450	CqMpUScSnQV692X4e	192.168.33.10	1037	54.230.86.87	443	tcp	ssl	None	595	7659	SF	nan	ShADadFf	12	1083	11	8107	(empty)
<i style='opacity: 0.6'>5</i>	2013-09-15 23:44:28.081161	CYpAce4RL5CsL2Mih4	192.168.33.10	1034	54.230.86.87	443	tcp	ssl	None	575	14252	SF	nan	ShADadFf	15	1183	16	14943	(empty)
<i style='opacity: 0.6'>6</i>	2013-09-15 23:44:28.091612	CHOCCV3ZKH82IRmP8	192.168.33.10	1038	54.230.86.87	443	tcp	ssl	None	603	10231	SF	nan	ShADadFf	13	1131	14	10842	(empty)
<i style='opacity: 0.6'>7</i>	2013-09-15 23:44:28.080461	CnKO90rg6a1qQ9Fc	192.168.33.10	1033	54.230.86.87	443	tcp	ssl	None	567	43245	SF	nan	ShADadFf	28	1695	36	44736	(empty)
<i style='opacity: 0.6'>8</i>	2013-09-15 23:44:28.628872	CxEMBn1vFoG4tchRBi	192.168.33.10	1049	54.230.86.87	443	tcp	ssl	None	636	12518	SF	nan	ShADadFf	13	1164	14	13086	(empty)
<i style='opacity: 0.6'>9</i>	2013-09-15 23:44:28.636041	CSdxbb38oFEe3p1swa	192.168.33.10	1050	54.230.86.87	443	tcp	ssl	None	637	9715	SF	nan	ShADadFf	12	1125	12	10203	(empty)

Vaex¶

Installation¶

Load Example¶

Data Management¶

Data Structure¶

Construct DataFrame¶

IO¶

Accessing¶

Database Operations¶

Statistics¶

Plotting¶

Compatibility¶

Delayed Computation¶

Adding Accessors¶

Reference¶

#	animal	age_mean	cuteness_mean	cuteness_std
<i style='opacity: 0.6'>0</i>	dog	3.33333	9	0.816497
<i style='opacity: 0.6'>1</i>	cat	5	5	0
<i style='opacity: 0.6'>2</i>	guinea pig	2	6	2