groupby¶
groupby_multi_different_functions¶
Benchmark setup
from pandas_vb_common import *
# Two low-cardinality object-dtype key columns plus three float value columns.
n_rows = 100000
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
key1_codes = np.random.randint(0, 3, size=n_rows)
key2_codes = np.random.randint(0, 2, size=n_rows)
df = DataFrame({
    'key1': fac1.take(key1_codes),
    'key2': fac2.take(key2_codes),
    'value1': np.random.randn(n_rows),
    'value2': np.random.randn(n_rows),
    'value3': np.random.randn(n_rows),
})
Benchmark statement
df.groupby(['key1', 'key2']).agg({'value1' : 'mean',
'value2' : 'var',
'value3' : 'sum'})
Performance graph
groupby_apply_dict_return¶
Benchmark setup
from pandas_vb_common import *
# 1000 distinct labels, each repeated 10 times consecutively.
labels = np.arange(1000).repeat(10)
data = Series(randn(len(labels)))


def f(x):
    """Return the first and last values of ``x`` as a dict.

    Written as a ``def`` rather than a lambda bound to a name so the function
    carries a real ``__name__`` in tracebacks and profiles.
    """
    return {'first': x.values[0], 'last': x.values[-1]}
Benchmark statement
data.groupby(labels).apply(f)
Performance graph
groupby_frame_cython_many_columns¶
Benchmark setup
from pandas_vb_common import *
# A 1000 x 1000 float frame with one random integer label in [0, 100) per row.
n_rows, n_cols = 1000, 1000
labels = np.random.randint(0, 100, size=n_rows)
df = DataFrame(randn(n_rows, n_cols))
Benchmark statement
df.groupby(labels).sum()
Performance graph
groupby_indices¶
Benchmark setup
from pandas_vb_common import *
# Build an hourly DatetimeIndex plus its year/month/day components, which the
# benchmark statement uses as grouping keys.
# NOTE(review): the fallback branch looks like a compatibility path for older
# pandas APIs (``offset=`` / ``datetools``) -- confirm.  It also spans only one
# year (2000) while the primary branch spans six (2000-2005), so timings from
# the two branches are not comparable.
try:
    rng = date_range('1/1/2000', '12/31/2005', freq='H')
    year, month, day = rng.year, rng.month, rng.day
except Exception:
    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
    year = rng.map(lambda x: x.year)
    month = rng.map(lambda x: x.month)
    day = rng.map(lambda x: x.day)

ts = Series(np.random.randn(len(rng)), index=rng)
Benchmark statement
len(ts.groupby([year, month, day]))
Performance graph
groupby_multi_cython¶
Benchmark setup
from pandas_vb_common import *
N = 100000
ngroups = 100


def get_test_data(ngroups=100, n=N):
    """Return a shuffled object array of ``n`` group labels.

    Labels are the integers ``0 .. ngroups - 1``, tiled ``n // ngroups``
    times.  If that yields fewer than ``n`` elements, the leading labels are
    appended once more until the array holds exactly ``n`` elements.
    """
    # A real list (not a Python 3 ``range``) so it can be sliced and
    # concatenated with another list below.
    unique_groups = list(range(ngroups))
    # Floor division: ``np.tile`` requires an integer repeat count.
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr


# aggregate multiple columns
df = DataFrame({'key1': get_test_data(ngroups=ngroups),
                'key2': get_test_data(ngroups=ngroups),
                'data1': np.random.randn(N),
                'data2': np.random.randn(N)})


def f():
    """Aggregate each (key1, key2) group with a Python-level sum."""
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())


simple_series = Series(np.random.randn(N))
key1 = df['key1']
Benchmark statement
df.groupby(['key1', 'key2']).sum()
Performance graph
series_value_counts_int64¶
Benchmark setup
from pandas_vb_common import *
# 100000 random integers drawn from [0, 1000).
n_values = 100000
s = Series(np.random.randint(0, 1000, size=n_values))
Benchmark statement
s.value_counts()
Performance graph
groupby_multi_python¶
Benchmark setup
from pandas_vb_common import *
N = 100000
ngroups = 100


def get_test_data(ngroups=100, n=N):
    """Return a shuffled object array of ``n`` group labels.

    Labels are the integers ``0 .. ngroups - 1``, tiled ``n // ngroups``
    times.  If that yields fewer than ``n`` elements, the leading labels are
    appended once more until the array holds exactly ``n`` elements.
    """
    # A real list (not a Python 3 ``range``) so it can be sliced and
    # concatenated with another list below.
    unique_groups = list(range(ngroups))
    # Floor division: ``np.tile`` requires an integer repeat count.
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr


# aggregate multiple columns
df = DataFrame({'key1': get_test_data(ngroups=ngroups),
                'key2': get_test_data(ngroups=ngroups),
                'data1': np.random.randn(N),
                'data2': np.random.randn(N)})


def f():
    """Aggregate each (key1, key2) group with a Python-level sum."""
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())


simple_series = Series(np.random.randn(N))
key1 = df['key1']
Benchmark statement
df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())
Performance graph
groupby_first¶
Benchmark setup
from pandas_vb_common import *
# 10000 labels repeated 10 times each.  Positions 0, 3, 6, ... and
# 1, 4, 7, ... of the data are set to NaN, so only every third value
# (positions 2, 5, 8, ...) stays non-null.  The labels are then shuffled.
n_groups, group_size = 10000, 10
labels = np.arange(n_groups).repeat(group_size)
data = Series(randn(len(labels)))
for offset in (0, 1):
    data[offset::3] = np.nan
shuffled_order = np.random.permutation(len(labels))
labels = labels.take(shuffled_order)
Benchmark statement
data.groupby(labels).first()
Performance graph
groupby_multi_different_numpy_functions¶
Benchmark setup
from pandas_vb_common import *
# Two low-cardinality object-dtype key columns plus three float value columns.
n_rows = 100000
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
key1_codes = np.random.randint(0, 3, size=n_rows)
key2_codes = np.random.randint(0, 2, size=n_rows)
df = DataFrame({
    'key1': fac1.take(key1_codes),
    'key2': fac2.take(key2_codes),
    'value1': np.random.randn(n_rows),
    'value2': np.random.randn(n_rows),
    'value3': np.random.randn(n_rows),
})
Benchmark statement
df.groupby(['key1', 'key2']).agg({'value1' : np.mean,
'value2' : np.var,
'value3' : np.sum})
Performance graph
groupby_last¶
Benchmark setup
from pandas_vb_common import *
# 10000 labels repeated 10 times each.  Positions 0, 3, 6, ... and
# 1, 4, 7, ... of the data are set to NaN, so only every third value
# (positions 2, 5, 8, ...) stays non-null.  The labels are then shuffled.
n_groups, group_size = 10000, 10
labels = np.arange(n_groups).repeat(group_size)
data = Series(randn(len(labels)))
for offset in (0, 1):
    data[offset::3] = np.nan
shuffled_order = np.random.permutation(len(labels))
labels = labels.take(shuffled_order)
Benchmark statement
data.groupby(labels).last()
Performance graph
groupby_multi_size¶
Benchmark setup
from pandas_vb_common import *
# Two integer key columns (500 and 100 possible values) and three float
# value columns, 100000 rows each.
n_rows = 100000
df = DataFrame({
    'key1': np.random.randint(0, 500, size=n_rows),
    'key2': np.random.randint(0, 100, size=n_rows),
    'value1': np.random.randn(n_rows),
    'value2': np.random.randn(n_rows),
    'value3': np.random.randn(n_rows),
})
Benchmark statement
df.groupby(['key1', 'key2']).size()
Performance graph
groupby_frame_singlekey_integer¶
Benchmark setup
from pandas_vb_common import *
# A single float column of 100000 rows with one random integer label in
# [0, 1000) per row.
n_rows = 100000
data = np.random.randn(n_rows, 1)
labels = np.random.randint(0, 1000, size=n_rows)
df = DataFrame(data)
Benchmark statement
df.groupby(labels).sum()
Performance graph
groupby_pivot_table¶
Benchmark setup
from pandas_vb_common import *
# Three object-dtype key columns and three float value columns.
# 'key2' and 'key3' are both taken from fac2 with the same indices (ind2),
# so the two columns hold identical values.
n_rows = 100000
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
ind1 = np.random.randint(0, 3, size=n_rows)
ind2 = np.random.randint(0, 2, size=n_rows)
df = DataFrame({
    'key1': fac1.take(ind1),
    'key2': fac2.take(ind2),
    'key3': fac2.take(ind2),
    'value1': np.random.randn(n_rows),
    'value2': np.random.randn(n_rows),
    'value3': np.random.randn(n_rows),
})
Benchmark statement
# ``rows=``/``cols=`` are the old spellings of these keywords; current pandas
# only accepts ``index=``/``columns=``.
# NOTE(review): very old pandas releases only know ``rows``/``cols`` -- keep
# the old spelling if this benchmark must still run against them.
df.pivot_table(index='key1', columns=['key2', 'key3'])
Performance graph
groupby_series_simple_cython¶
Benchmark setup
from pandas_vb_common import *
N = 100000
ngroups = 100


def get_test_data(ngroups=100, n=N):
    """Return a shuffled object array of ``n`` group labels.

    Labels are the integers ``0 .. ngroups - 1``, tiled ``n // ngroups``
    times.  If that yields fewer than ``n`` elements, the leading labels are
    appended once more until the array holds exactly ``n`` elements.
    """
    # A real list (not a Python 3 ``range``) so it can be sliced and
    # concatenated with another list below.
    unique_groups = list(range(ngroups))
    # Floor division: ``np.tile`` requires an integer repeat count.
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr


# aggregate multiple columns
df = DataFrame({'key1': get_test_data(ngroups=ngroups),
                'key2': get_test_data(ngroups=ngroups),
                'data1': np.random.randn(N),
                'data2': np.random.randn(N)})


def f():
    """Aggregate each (key1, key2) group with a Python-level sum."""
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())


simple_series = Series(np.random.randn(N))
key1 = df['key1']
Benchmark statement
simple_series.groupby(key1).sum()
Performance graph
groupby_multi_series_op¶
Benchmark setup
from pandas_vb_common import *
N = 100000
ngroups = 100


def get_test_data(ngroups=100, n=N):
    """Return a shuffled object array of ``n`` group labels.

    Labels are the integers ``0 .. ngroups - 1``, tiled ``n // ngroups``
    times.  If that yields fewer than ``n`` elements, the leading labels are
    appended once more until the array holds exactly ``n`` elements.
    """
    # A real list (not a Python 3 ``range``) so it can be sliced and
    # concatenated with another list below.
    unique_groups = list(range(ngroups))
    # Floor division: ``np.tile`` requires an integer repeat count.
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr


# aggregate multiple columns
df = DataFrame({'key1': get_test_data(ngroups=ngroups),
                'key2': get_test_data(ngroups=ngroups),
                'data1': np.random.randn(N),
                'data2': np.random.randn(N)})


def f():
    """Aggregate each (key1, key2) group with a Python-level sum."""
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())


simple_series = Series(np.random.randn(N))
key1 = df['key1']
Benchmark statement
df.groupby(['key1', 'key2'])['data1'].agg(np.std)
Performance graph