Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions benchmarks/pandas/bench_at_iat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access"""
import json
import time
import pandas as pd

# Benchmark dimensions: row count, untimed warm-up passes, timed passes.
N = 100_000
WARMUP = 3
ITERATIONS = 10

# Row labels "r0" .. "r{N-1}" and the float payload 0.0, 1.5, 3.0, ...
labels = ["r{}".format(i) for i in range(N)]
values = [1.5 * i for i in range(N)]

s = pd.Series(values, index=labels)
df = pd.DataFrame(
    {"a": values, "b": [2 * v for v in values]},
    index=labels,
)

# Label of the row at position N // 2; used for the label-based lookups.
mid_label = labels[N // 2]

# Untimed passes: run the exact access sequence that is measured below.
for _ in range(WARMUP):
    _ = s.at[mid_label]
    _ = s.iat[N // 2]
    _ = df.at[mid_label, "a"]
    _ = df.iat[N // 2, 0]

# Timed passes: one label-based and one position-based scalar read on the
# Series, then the same pair on the DataFrame.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    _ = s.at[mid_label]
    _ = s.iat[N // 2]
    _ = df.at[mid_label, "a"]
    _ = df.iat[N // 2, 0]
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "at_iat",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
32 changes: 32 additions & 0 deletions benchmarks/pandas/bench_cross_join.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)"""
import json
import time
import pandas as pd

# Benchmark dimensions: rows per input frame, untimed and timed pass counts.
N = 300
WARMUP = 3
ITERATIONS = 10

# Two N-row frames with distinct column names: an integer id plus a float value.
row_ids = list(range(N))
left = pd.DataFrame({"id_a": row_ids, "val_a": [1.5 * i for i in row_ids]})
right = pd.DataFrame({"id_b": row_ids, "val_b": [2.5 * i for i in row_ids]})

# Untimed passes over the same call that is measured below.
for _ in range(WARMUP):
    pd.merge(left, right, how="cross")

# Timed passes: each builds the full N * N row Cartesian product.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge(left, right, how="cross")
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "cross_join",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
56 changes: 56 additions & 0 deletions benchmarks/pandas/bench_cut_bins_to_frame.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
import json, time
import numpy as np
import pandas as pd

# Benchmark dimensions: sample count, number of equal-width bins, and the
# untimed / timed pass counts.
SIZE = 100_000
NUM_BINS = 20
WARMUP = 5
ITERATIONS = 50

# SIZE floats cycling through 0.0, 0.1, ..., 99.9.
data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])


def cut_bins_workload(values, num_bins):
    """Run one pass of the benchmarked workload.

    Bins ``values`` into ``num_bins`` equal-width intervals with ``pd.cut`` and
    derives three results from the per-bin counts.

    Args:
        values: One-dimensional numeric array to bin.
        num_bins: Number of equal-width bins passed to ``pd.cut``.

    Returns:
        A ``(summary, count_dict, edges)`` tuple:
        ``summary`` is a DataFrame with columns ``bin``, ``left``, ``right``,
        ``count`` and ``frequency`` (one row per bin, in bin order);
        ``count_dict`` maps each bin's string label to its count;
        ``edges`` is a DataFrame with the ``left`` / ``right`` interval edges.
    """
    cut_result = pd.cut(values, num_bins)
    # pd.cut on an ndarray returns a Categorical. Categorical.value_counts()
    # accepts only ``dropna`` (passing ``sort=False`` raises TypeError) and
    # already reports every category in category order.
    counts = cut_result.value_counts()

    # Summary DataFrame equivalent to cutBinsToFrame.
    summary = pd.DataFrame({
        "bin": counts.index.astype(str),
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
        "count": counts.values,
        "frequency": counts.values / len(values),
    })
    # cutBinCounts equivalent: label -> count mapping.
    count_dict = dict(zip(counts.index.astype(str), counts.values))
    # binEdges equivalent: DataFrame of interval edges. The edges are
    # deliberately recomputed rather than reused from ``summary`` so the three
    # results stay independent, as in the original workload.
    edges = pd.DataFrame({
        "left": [iv.left for iv in counts.index],
        "right": [iv.right for iv in counts.index],
    })
    return summary, count_dict, edges


# Untimed passes over the same workload that is measured below.
for _ in range(WARMUP):
    cut_bins_workload(data, NUM_BINS)

start = time.perf_counter()
for _ in range(ITERATIONS):
    cut_bins_workload(data, NUM_BINS)
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
print(json.dumps({
    "function": "cut_bins_to_frame",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
31 changes: 31 additions & 0 deletions benchmarks/pandas/bench_filter_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Benchmark: Series.filter — filter Series index labels by items and like"""
import json
import time
import pandas as pd

# Benchmark dimensions: Series length, untimed and timed pass counts.
N = 100_000
WARMUP = 3
ITERATIONS = 10

# Series of floats 0.0, 0.5, 1.0, ... indexed by "label_0" .. "label_{N-1}".
labels = ["label_{}".format(i) for i in range(N)]
values = [0.5 * i for i in range(N)]
s = pd.Series(values, index=labels)

# Every 100th label for the first 1_000 multiples: "label_0", "label_100", ...
keep_items = ["label_{}".format(100 * k) for k in range(1_000)]

# Untimed passes over the same two calls that are measured below.
for _ in range(WARMUP):
    s.filter(items=keep_items)
    s.filter(like="label_5")

# Timed passes: one exact-label filter and one substring filter per pass.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    s.filter(items=keep_items)
    s.filter(like="label_5")
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "filter_series",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
30 changes: 30 additions & 0 deletions benchmarks/pandas/bench_join_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Benchmark: join_all — sequential left-join of 4 DataFrames each with 5k rows"""
import json
import time
import pandas as pd

# Benchmark dimensions: rows per frame, untimed and timed pass counts.
N = 5_000
WARMUP = 3
ITERATIONS = 10

# Shared string index "0" .. "N-1" used by every frame.
idx = list(map(str, range(N)))

# Base frame holds column "a"; the three joined frames each contribute one
# column whose values are the row position times 2, 3 and 4 respectively.
base = pd.DataFrame({"a": list(range(N))}, index=idx)
df1, df2, df3 = (
    pd.DataFrame({column: [i * factor for i in range(N)]}, index=idx)
    for column, factor in (("b", 2), ("c", 3), ("d", 4))
)

# Untimed passes over the same call that is measured below.
for _ in range(WARMUP):
    base.join([df1, df2, df3])

# Timed passes: join all three frames onto the base in a single call.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    base.join([df1, df2, df3])
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "join_all",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
35 changes: 35 additions & 0 deletions benchmarks/pandas/bench_math_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Benchmark: math_ops — abs / round on Series and DataFrame of 100k rows."""
import json, time
import numpy as np
import pandas as pd

# Benchmark dimensions: row count, untimed and timed pass counts.
SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

# Series alternates sign: negative at even positions, positive at odd ones,
# with magnitude position + 0.567.
positions = np.arange(SIZE)
magnitudes = positions + 0.567
s = pd.Series(np.where(positions % 2 == 0, -magnitudes, magnitudes))

# Column "a" is entirely negative, column "b" entirely positive.
df = pd.DataFrame({
    "a": -(positions + 0.123),
    "b": positions + 0.456,
})

# Untimed passes over the same four calls that are measured below.
for _ in range(WARMUP):
    s.abs()
    df.abs()
    s.round(1)
    df.round(1)

# Timed passes: abs and one-decimal round on both the Series and the frame.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    s.abs()
    df.abs()
    s.round(1)
    df.round(1)
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "math_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
34 changes: 34 additions & 0 deletions benchmarks/pandas/bench_merge_asof.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Benchmark: merge_asof — backward asof join of two 10k-row sorted DataFrames"""
import json
import time
import pandas as pd

# Benchmark dimensions: rows per frame, untimed and timed pass counts.
N = 10_000
WARMUP = 3
ITERATIONS = 10

# Trades: ascending times 0, 2, 4, ... with prices starting at 100.0.
trade_times = [2 * i for i in range(N)]
prices = [100.0 + 0.5 * i for i in range(N)]

# Quotes: ascending times 0, 3, 6, ... with bids starting at 99.0.
quote_times = [3 * i for i in range(N)]
bids = [99.0 + 0.5 * i for i in range(N)]

trades = pd.DataFrame({"time": trade_times, "price": prices})
quotes = pd.DataFrame({"time": quote_times, "bid": bids})

# Untimed passes over the same call that is measured below.
for _ in range(WARMUP):
    pd.merge_asof(trades, quotes, on="time")

# Timed passes: asof-join quotes onto trades by "time" with default settings.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    pd.merge_asof(trades, quotes, on="time")
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "merge_asof",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
42 changes: 42 additions & 0 deletions benchmarks/pandas/bench_na_ops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Benchmark: na_ops — isna / notna / ffill / bfill on 100k rows."""
import json, time
import numpy as np
import pandas as pd

# Benchmark dimensions: row count, untimed and timed pass counts.
SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

positions = np.arange(SIZE)

# float64 Series holding 0 .. SIZE-1 with NaN at every position divisible by 5.
# Built directly as a float array: the earlier route (Python list with pd.NA ->
# Int64 array -> float64 Series -> re-assign NaN at the same positions) gave
# the same values with redundant work.
s = pd.Series(np.where(positions % 5 != 0, positions, np.nan))

# Column "a" reuses ``s``; column "b" is 2 * position with NaN at every
# position divisible by 7.
df = pd.DataFrame({
    "a": s,
    "b": pd.Series(np.where(positions % 7 != 0, positions * 2.0, np.nan)),
})

# Untimed passes over the same six calls that are measured below.
for _ in range(WARMUP):
    pd.isna(s)
    pd.notna(s)
    s.ffill()
    s.bfill()
    df.ffill()
    df.bfill()

# Timed passes: missing-value detection plus forward/backward fill.
start = time.perf_counter()
for _ in range(ITERATIONS):
    pd.isna(s)
    pd.notna(s)
    s.ffill()
    s.bfill()
    df.ffill()
    df.bfill()
total = (time.perf_counter() - start) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
print(json.dumps({
    "function": "na_ops",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}))
36 changes: 36 additions & 0 deletions benchmarks/pandas/bench_notna_boolean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Benchmark: notna_boolean — boolean-mask indexing on 100k rows."""
import json, time
import numpy as np
import pandas as pd

# Benchmark dimensions: row count, untimed and timed pass counts.
SIZE = 100_000
WARMUP = 5
ITERATIONS = 50

positions = np.arange(SIZE)

# Integer Series 0 .. SIZE-1, a boolean Series mask selecting even positions,
# and a plain NumPy boolean array selecting positions not divisible by 3.
s = pd.Series(positions)
mask = pd.Series(positions % 2 == 0)
bool_arr = positions % 3 != 0

df = pd.DataFrame({
    "a": positions,
    "b": positions * 2,
})

# Untimed passes over the same three selections that are measured below.
for _ in range(WARMUP):
    s[mask]
    s[~mask]
    df[bool_arr]

# Timed passes: Series selection by mask and by its inverse (the inversion is
# part of the measured work), then DataFrame row selection by a NumPy mask.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    s[mask]
    s[~mask]
    df[bool_arr]
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "notna_boolean",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
34 changes: 34 additions & 0 deletions benchmarks/pandas/bench_pow_mod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Benchmark: Series.pow, Series.mod, DataFrame.pow on 100k rows"""
import json, time
import numpy as np
import pandas as pd

# Benchmark dimensions: row count, untimed and timed pass counts.
ROWS = 100_000
WARMUP = 3
ITERATIONS = 10

positions = np.arange(ROWS)

# Values cycle through 1 .. 100 (never zero); the Series and column "a" hold
# them as floats, column "b" cycles through 1 .. 50.
data = positions % 100 + 1
s = pd.Series(data.astype(float))
df = pd.DataFrame({
    "a": (positions % 100 + 1).astype(float),
    "b": (positions % 50 + 1).astype(float),
})

# Untimed passes over the same three calls that are measured below.
for _ in range(WARMUP):
    s.pow(2)
    s.mod(7)
    df.pow(2)

# Timed passes: square and modulo on the Series, square on the DataFrame.
t0 = time.perf_counter()
for _ in range(ITERATIONS):
    s.pow(2)
    s.mod(7)
    df.pow(2)
total = (time.perf_counter() - t0) * 1000  # seconds -> milliseconds

# Emit a single JSON object on stdout.
report = {
    "function": "pow_mod",
    "mean_ms": total / ITERATIONS,
    "iterations": ITERATIONS,
    "total_ms": total,
}
print(json.dumps(report))
Loading