Vbench performance benchmarks for pandas

reindex

reindex_daterange_backfill

Benchmark setup

from pandas_vb_common import *

# 10000-period date range starting at 1/1/2000, built with a Minute offset.
rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute())

# One random value per entry of rng, then every second entry of that series.
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[::2]
# ts2 reindexed back onto the full index of ts. Not used by this benchmark's
# timed statement. Presumably holds NaN at the labels ts2 lacks -- TODO confirm.
ts3 = ts2.reindex(ts.index)

def pad():
    """Reindex ts2 onto ts.index using the 'pad' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='pad')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='pad')
def backfill():
    """Reindex ts2 onto ts.index using the 'backfill' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='backfill')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='backfill')

Benchmark statement

# Timed statement: one call to the backfill() helper defined in the setup.
backfill()

Performance graph

_images/reindex_daterange_backfill.png

reindex_frame_level_align

Benchmark setup

from pandas_vb_common import *

# Three-level MultiIndex with level sizes 10, 100 and 100. Each labels array
# has 100000 entries (10*10000, 100*100*10, 100*100*10), one per combination
# of the three levels.
index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)],
                   labels=[np.arange(10).repeat(10000),
                           np.tile(np.arange(100).repeat(100), 10),
                           np.tile(np.tile(np.arange(100), 100), 10)])
# Shuffle whatever index.values returns, in place.
# NOTE(review): this only reorders the index itself if index.values exposes
# the index's own storage rather than a copy -- confirm.
random.shuffle(index.values)
# Four random columns with one row per index entry.
df = DataFrame(np.random.randn(len(index), 4), index=index)
# 100 x 4 frame indexed by the values of the second level (levels[1]) only.
df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1])

Benchmark statement

# Timed statement: align df with df_level on level 1, passing copy=False.
df.align(df_level, level=1, copy=False)

Performance graph

_images/reindex_frame_level_align.png

dataframe_reindex_columns

Benchmark setup

from pandas_vb_common import *

# 10000 x 30 frame of np.random.rand values; rows are labelled 0..9999 and
# columns 0..29.
df = DataFrame(index=range(10000), data=np.random.rand(10000,30),
               columns=range(30))

Benchmark statement

# Timed statement: reindex to the four column labels at positions 1 to 4.
df.reindex(columns=df.columns[1:5])

Performance graph

_images/dataframe_reindex_columns.png

frame_drop_dup_na_inplace

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

# Overwrite every column with NaN for the rows selected by the slice :10000,
# so the key columns contain missing values.
df.ix[:10000, :] = np.nan

Benchmark statement

# Timed statement: drop rows duplicated on (key1, key2), with inplace=True.
# NOTE(review): df is mutated, so repeated timings may run on an already
# de-duplicated frame unless the setup is rerun each time -- confirm.
df.drop_duplicates(['key1', 'key2'], inplace=True)

Performance graph

_images/frame_drop_dup_na_inplace.png

frame_sort_index_by_columns

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

Benchmark statement

# Timed statement: sort_index with by=['key1', 'key2'] (the two key columns).
df.sort_index(by=['key1', 'key2'])

Performance graph

_images/frame_sort_index_by_columns.png

reindex_daterange_pad

Benchmark setup

from pandas_vb_common import *

# 10000-period date range starting at 1/1/2000, built with a Minute offset.
rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute())

# One random value per entry of rng, then every second entry of that series.
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[::2]
# ts2 reindexed back onto the full index of ts. Not used by this benchmark's
# timed statement. Presumably holds NaN at the labels ts2 lacks -- TODO confirm.
ts3 = ts2.reindex(ts.index)

def pad():
    """Reindex ts2 onto ts.index using the 'pad' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='pad')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='pad')
def backfill():
    """Reindex ts2 onto ts.index using the 'backfill' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='backfill')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='backfill')

Benchmark statement

# Timed statement: one call to the pad() helper defined in the setup.
pad()

Performance graph

_images/reindex_daterange_pad.png

frame_fillna_many_columns_pad

Benchmark setup

from pandas_vb_common import *

# 1000 x 1000 random matrix with every second row (0, 2, 4, ...) set to NaN.
values = np.random.randn(1000, 1000)
values[::2] = np.nan
# Wrapped as a frame with default row and column labels.
df = DataFrame(values)

Benchmark statement

# Timed statement: fillna over the whole frame with method='pad'.
df.fillna(method='pad')

Performance graph

_images/frame_fillna_many_columns_pad.png

lib_fast_zip_fillna

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

# Overwrite every column with NaN for the rows selected by the slice :10000,
# so the values passed to the timed call contain missing entries.
df.ix[:10000, :] = np.nan

Benchmark statement

# Timed statement: lib.fast_zip_fillna on the transposed values array of df.
lib.fast_zip_fillna(df.values.T)

Performance graph

_images/lib_fast_zip_fillna.png

reindex_fillna_backfill

Benchmark setup

from pandas_vb_common import *

# 10000-period date range starting at 1/1/2000, built with a Minute offset.
rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute())

# One random value per entry of rng, then every second entry of that series.
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[::2]
# ts2 reindexed back onto the full index of ts; this is the series the timed
# fillna call operates on. Presumably it holds NaN at the labels ts2 lacks
# -- TODO confirm.
ts3 = ts2.reindex(ts.index)

def pad():
    """Reindex ts2 onto ts.index using the 'pad' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='pad')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='pad')
def backfill():
    """Reindex ts2 onto ts.index using the 'backfill' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='backfill')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='backfill')

Benchmark statement

# Timed statement: fillna on the reindexed series ts3 with method='backfill'.
ts3.fillna(method='backfill')

Performance graph

_images/reindex_fillna_backfill.png

frame_drop_duplicates

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

Benchmark statement

# Timed statement: drop rows duplicated on (key1, key2); inplace is not
# passed, so the call uses drop_duplicates' default for it.
df.drop_duplicates(['key1', 'key2'])

Performance graph

_images/frame_drop_duplicates.png

dataframe_reindex_daterange

Benchmark setup

from pandas_vb_common import *

# 10000-period date range starting at 1/1/1970, built with a Minute offset.
rng = DateRange('1/1/1970', periods=10000, offset=datetools.Minute())
# 10000 x 10 frame of np.random.rand values indexed by rng, columns 0..9.
df = DataFrame(np.random.rand(10000, 10), index=rng,
               columns=range(10))
# Add a constant string column so the frame mixes float and string data.
df['foo'] = 'bar'
# Target index for the timed reindex: every second entry of rng.
rng2 = Index(rng[::2])

Benchmark statement

# Timed statement: reindex the mixed-type frame onto rng2.
df.reindex(rng2)

Performance graph

_images/dataframe_reindex_daterange.png

frame_drop_dup_inplace

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

Benchmark statement

# Timed statement: drop rows duplicated on (key1, key2), with inplace=True.
# NOTE(review): df is mutated, so repeated timings may run on an already
# de-duplicated frame unless the setup is rerun each time -- confirm.
df.drop_duplicates(['key1', 'key2'], inplace=True)

Performance graph

_images/frame_drop_dup_inplace.png

reindex_multiindex

Benchmark setup

from pandas_vb_common import *

# N outer keys and K inner keys; the resulting index has N * K entries.
N = 1000
K = 20

# level1: N generated keys, each repeated K times back to back.
# level2: K generated keys, the whole run tiled N times.
# tm.rands(10) presumably returns a random 10-character string -- TODO confirm.
level1 = np.array([tm.rands(10) for _ in xrange(N)], dtype='O').repeat(K)
level2 = np.tile(np.array([tm.rands(10) for _ in xrange(K)], dtype='O'),
                 N)
index = MultiIndex.from_arrays([level1, level2])

# Random series over the full index, and every second entry of it.
s1 = Series(np.random.randn(N * K), index=index)
s2 = s1[::2]

Benchmark statement

# Timed statement: reindex the full series onto the index of its subsample.
s1.reindex(s2.index)

Performance graph

_images/reindex_multiindex.png

reindex_frame_level_reindex

Benchmark setup

from pandas_vb_common import *

# Three-level MultiIndex with level sizes 10, 100 and 100. Each labels array
# has 100000 entries (10*10000, 100*100*10, 100*100*10), one per combination
# of the three levels.
index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)],
                   labels=[np.arange(10).repeat(10000),
                           np.tile(np.arange(100).repeat(100), 10),
                           np.tile(np.tile(np.arange(100), 100), 10)])
# Shuffle whatever index.values returns, in place.
# NOTE(review): this only reorders the index itself if index.values exposes
# the index's own storage rather than a copy -- confirm.
random.shuffle(index.values)
# Four random columns with one row per index entry.
df = DataFrame(np.random.randn(len(index), 4), index=index)
# 100 x 4 frame indexed by the values of the second level (levels[1]) only.
df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1])

Benchmark statement

# Timed statement: reindex the single-level frame onto df's MultiIndex,
# passing level=1.
df_level.reindex(df.index, level=1)

Performance graph

_images/reindex_frame_level_reindex.png

reindex_fillna_pad

Benchmark setup

from pandas_vb_common import *

# 10000-period date range starting at 1/1/2000, built with a Minute offset.
rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute())

# One random value per entry of rng, then every second entry of that series.
ts = Series(np.random.randn(len(rng)), index=rng)
ts2 = ts[::2]
# ts2 reindexed back onto the full index of ts; this is the series the timed
# fillna call operates on. Presumably it holds NaN at the labels ts2 lacks
# -- TODO confirm.
ts3 = ts2.reindex(ts.index)

def pad():
    """Reindex ts2 onto ts.index using the 'pad' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='pad')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='pad')
def backfill():
    """Reindex ts2 onto ts.index using the 'backfill' fill method.

    The ``method`` keyword is tried first; if the call rejects it, the same
    reindex is retried with the ``fillMethod`` spelling (presumably the name
    used by older pandas releases -- TODO confirm). The result is discarded
    and nothing is returned.
    """
    try:
        ts2.reindex(ts.index, method='backfill')
    except TypeError:
        # An unrecognised keyword argument is reported as TypeError. Catching
        # only that keeps KeyboardInterrupt and real reindex failures visible
        # instead of silently retrying.
        ts2.reindex(ts.index, fillMethod='backfill')

Benchmark statement

# Timed statement: fillna on the reindexed series ts3 with method='pad'.
ts3.fillna(method='pad')

Performance graph

_images/reindex_fillna_pad.png

frame_drop_duplicates_na

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

# Overwrite every column with NaN for the rows selected by the slice :10000,
# so the key columns contain missing values.
df.ix[:10000, :] = np.nan

Benchmark statement

# Timed statement: drop rows duplicated on (key1, key2); inplace is not
# passed, so the call uses drop_duplicates' default for it.
df.drop_duplicates(['key1', 'key2'])

Performance graph

_images/frame_drop_duplicates_na.png

lib_fast_zip

Benchmark setup

from pandas_vb_common import *

import pandas._tseries as lib
# N keys are generated per key column and each is repeated K times back to
# back, so the frame has N * K rows.
N = 10000
K = 10

# rands(10) is called once per key; presumably it returns a random
# 10-character string -- TODO confirm against pandas_vb_common.
key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)
key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K)

# Two object key columns plus one column of random floats.
df = DataFrame({'key1' : key1, 'key2' : key2,
                'value' : np.random.randn(N * K)})

Benchmark statement

# Timed statement: lib.fast_zip on the transposed values array of df.
lib.fast_zip(df.values.T)

Performance graph

_images/lib_fast_zip.png