Vbench performance benchmarks for pandas

parser

read_csv_thou_vb

Benchmark setup

from pandas_vb_common import *

from pandas import read_csv
import os
# Setup: write an N x K pipe-delimited CSV whose cells are numbers rendered
# as strings with ',' thousands separators.
# NOTE(review): `DataFrame` and `np` are not imported explicitly here;
# presumably they come from the `pandas_vb_common` star import -- confirm.
N = 10000
K = 8
# Render a number with ',' as the thousands separator.
# NOTE: this rebinds the builtin name `format` for the rest of the setup.
format = lambda x: '{:,}'.format(x)
# N x K values: normal samples scaled element-wise by random integers drawn
# from [100, 10000).
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
# Convert every cell to its formatted string.
df = df.applymap(format)
# Written to a relative path; nothing in the visible setup removes the file.
df.to_csv('test.csv', sep='|')

Benchmark statement

# Benchmarked call: read the pipe-delimited file, treating ',' as the
# thousands separator.
read_csv('test.csv', sep='|', thousands=',')

Performance graph

_images/read_csv_thou_vb.png

read_csv_comment_vb

Benchmark setup

from pandas_vb_common import *

from pandas import read_csv
import os
# Setup: write an N x K pipe-delimited CSV of float strings in which some
# leading rows carry the '#' comment character.
# NOTE(review): `DataFrame` and `np` are not imported explicitly here;
# presumably they come from the `pandas_vb_common` star import -- confirm.
N = 10000
K = 8
# Render a number in fixed-point notation ('%f').
# NOTE: this rebinds the builtin name `format` for the rest of the setup.
format = lambda x: '%f' % x
# N x K values: normal samples scaled element-wise by random integers drawn
# from [100, 10000).
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
# Convert every cell to its formatted string.
df = df.applymap(format)
# Put '#' into column 0 of the rows selected by `:5`, so the written file
# contains the comment character passed to read_csv in the benchmark statement.
df.ix[:5, 0] = '#'
# Written to a relative path; nothing in the visible setup removes the file.
df.to_csv('test.csv', sep='|')

Benchmark statement

# Benchmarked call: read the pipe-delimited file with '#' as the comment
# character.
read_csv('test.csv', sep='|', comment='#')

Performance graph

_images/read_csv_comment_vb.png

read_table_multiple_date_baseline

Benchmark setup

from pandas_vb_common import *

from pandas import read_table
from cStringIO import StringIO
import os
# Setup: build an in-memory comma-separated text whose column 1 already holds
# a combined date and time (e.g. '19990127 19:00:00').
# NOTE(review): N and K are not referenced by the rest of this setup or by the
# benchmark statement shown for this benchmark.
N = 10000
K = 8
# Five sample rows with no header line.
data = '''KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
# Repeat the five-row block 2000 times (10000 rows in total).
data = data * 2000

Benchmark statement

# Benchmarked call: parse the in-memory text with no header row, parsing
# column 1 as dates.
read_table(StringIO(data), sep=',', header=None, parse_dates=[1])

Performance graph

_images/read_table_multiple_date_baseline.png

read_csv_vb

Benchmark setup

from pandas_vb_common import *

from pandas import read_csv
import os
# Setup: write an N x K pipe-delimited CSV of numeric values.
# NOTE(review): `DataFrame` and `np` are not imported explicitly here;
# presumably they come from the `pandas_vb_common` star import -- confirm.
N = 10000
K = 8
# N x K values: normal samples scaled element-wise by random integers drawn
# from [100, 10000).
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
# Written to a relative path; nothing in the visible setup removes the file.
df.to_csv('test.csv', sep='|')

Benchmark statement

# Benchmarked call: read the pipe-delimited file; only the separator is
# specified.
read_csv('test.csv', sep='|')

Performance graph

_images/read_csv_vb.png

read_table_multiple_date

Benchmark setup

from pandas_vb_common import *

from pandas import read_table
from cStringIO import StringIO
import os
# Setup: build an in-memory comma-separated text in which the date (column 1)
# and two time-of-day fields (columns 2 and 3) are stored in separate columns.
# NOTE(review): N and K are not referenced by the rest of this setup or by the
# benchmark statement shown for this benchmark.
N = 10000
K = 8
# Five sample rows with no header line.
data = '''KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
'''
# Repeat the five-row block 2000 times (10000 rows in total).
data = data * 2000

Benchmark statement

# Benchmarked call: parse the in-memory text with no header row, requesting
# two date columns built from column pairs (1, 2) and (1, 3).
read_table(StringIO(data), sep=',', header=None, parse_dates=[[1,2], [1,3]])

Performance graph

_images/read_table_multiple_date.png