diff --git a/benchmarks/pandas/bench_at_iat.py b/benchmarks/pandas/bench_at_iat.py new file mode 100644 index 00000000..662c5e43 --- /dev/null +++ b/benchmarks/pandas/bench_at_iat.py @@ -0,0 +1,37 @@ +"""Benchmark: Series.at, Series.iat, DataFrame.at, DataFrame.iat — fast scalar access""" +import json +import time +import pandas as pd + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +labels = [f"r{i}" for i in range(N)] +values = [i * 1.5 for i in range(N)] + +s = pd.Series(values, index=labels) +df = pd.DataFrame({"a": values, "b": [v * 2 for v in values]}, index=labels) + +mid_label = f"r{N // 2}" + +for _ in range(WARMUP): + _ = s.at[mid_label] + _ = s.iat[N // 2] + _ = df.at[mid_label, "a"] + _ = df.iat[N // 2, 0] + +start = time.perf_counter() +for _ in range(ITERATIONS): + _ = s.at[mid_label] + _ = s.iat[N // 2] + _ = df.at[mid_label, "a"] + _ = df.iat[N // 2, 0] +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "at_iat", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_cross_join.py b/benchmarks/pandas/bench_cross_join.py new file mode 100644 index 00000000..ad1de45b --- /dev/null +++ b/benchmarks/pandas/bench_cross_join.py @@ -0,0 +1,32 @@ +"""Benchmark: cross_join — Cartesian product of two 300-row DataFrames (90k result rows)""" +import json +import time +import pandas as pd + +N = 300 +WARMUP = 3 +ITERATIONS = 10 + +left = pd.DataFrame({ + "id_a": list(range(N)), + "val_a": [i * 1.5 for i in range(N)], +}) +right = pd.DataFrame({ + "id_b": list(range(N)), + "val_b": [i * 2.5 for i in range(N)], +}) + +for _ in range(WARMUP): + pd.merge(left, right, how="cross") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge(left, right, how="cross") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "cross_join", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git 
a/benchmarks/pandas/bench_cut_bins_to_frame.py b/benchmarks/pandas/bench_cut_bins_to_frame.py
new file mode 100644
index 00000000..5ae5908c
--- /dev/null
+++ b/benchmarks/pandas/bench_cut_bins_to_frame.py
@@ -0,0 +1,53 @@
+"""Benchmark: cut_bins_to_frame — pd.cut with value_counts and bin summary on 100k rows."""
+import json, time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+NUM_BINS = 20
+WARMUP = 5
+ITERATIONS = 50
+
+data = np.array([(i % 1000) * 0.1 for i in range(SIZE)])
+
+
+def run_once():
+    """Run one pass of the workload; shared by the warm-up and timed loops."""
+    # pandas equivalent of cutBinsToFrame: cut + value_counts on the categorical result
+    cut_result = pd.cut(data, NUM_BINS)
+    # pd.cut on an ndarray returns a Categorical, whose value_counts() accepts
+    # only `dropna` (passing sort=False raises TypeError). It already reports
+    # counts in bin order, which is what sort=False was asking for.
+    counts = cut_result.value_counts()
+    # Summary DataFrame equivalent to cutBinsToFrame
+    summary = pd.DataFrame({
+        "bin": counts.index.astype(str),
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+        "count": counts.values,
+        "frequency": counts.values / len(data),
+    })
+    # cutBinCounts equivalent: counts dict
+    count_dict = dict(zip(counts.index.astype(str), counts.values))
+    # binEdges equivalent: DataFrame of interval edges
+    edges = pd.DataFrame({
+        "left": [iv.left for iv in counts.index],
+        "right": [iv.right for iv in counts.index],
+    })
+    return summary, count_dict, edges
+
+
+for _ in range(WARMUP):
+    run_once()
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    run_once()
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "cut_bins_to_frame",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_filter_series.py b/benchmarks/pandas/bench_filter_series.py new file mode 100644 index 00000000..ec653243 --- /dev/null +++ b/benchmarks/pandas/bench_filter_series.py @@ -0,0 +1,31 @@ +"""Benchmark: Series.filter — filter Series index labels by items/like/regex""" +import json +import time +import pandas as pd + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +labels = [f"label_{i}" for i in range(N)] +values = [i * 0.5 for i in range(N)] +s = pd.Series(values, index=labels) + +keep_items = [f"label_{i * 100}" for i in range(1_000)] + +for _ in range(WARMUP): + s.filter(items=keep_items) + s.filter(like="label_5") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.filter(items=keep_items) + s.filter(like="label_5") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "filter_series", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_join_all.py b/benchmarks/pandas/bench_join_all.py new file mode 100644 index 00000000..040aa028 --- /dev/null +++ b/benchmarks/pandas/bench_join_all.py @@ -0,0 +1,30 @@ +"""Benchmark: join_all — sequential left-join of 4 DataFrames each with 5k rows""" +import json +import time +import pandas as pd + +N = 5_000 +WARMUP = 3 +ITERATIONS = 10 + +idx = [str(i) for i in range(N)] + +base = pd.DataFrame({"a": list(range(N))}, index=idx) +df1 = pd.DataFrame({"b": [i * 2 for i in range(N)]}, index=idx) +df2 = pd.DataFrame({"c": [i * 3 for i in range(N)]}, index=idx) +df3 = pd.DataFrame({"d": [i * 4 for i in range(N)]}, index=idx) + +for _ in range(WARMUP): + base.join([df1, df2, df3]) + +start = time.perf_counter() +for _ in range(ITERATIONS): + base.join([df1, df2, df3]) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "join_all", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_math_ops.py 
b/benchmarks/pandas/bench_math_ops.py new file mode 100644 index 00000000..1159ec02 --- /dev/null +++ b/benchmarks/pandas/bench_math_ops.py @@ -0,0 +1,35 @@ +"""Benchmark: math_ops — abs / round on Series and DataFrame of 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +s = pd.Series(np.where(np.arange(SIZE) % 2 == 0, -(np.arange(SIZE) + 0.567), np.arange(SIZE) + 0.567)) +df = pd.DataFrame({ + "a": -(np.arange(SIZE) + 0.123), + "b": np.arange(SIZE) + 0.456, +}) + +for _ in range(WARMUP): + s.abs() + df.abs() + s.round(1) + df.round(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.abs() + df.abs() + s.round(1) + df.round(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "math_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_merge_asof.py b/benchmarks/pandas/bench_merge_asof.py new file mode 100644 index 00000000..5517d2f8 --- /dev/null +++ b/benchmarks/pandas/bench_merge_asof.py @@ -0,0 +1,34 @@ +"""Benchmark: merge_asof — backward asof join of two 10k-row sorted DataFrames""" +import json +import time +import pandas as pd + +N = 10_000 +WARMUP = 3 +ITERATIONS = 10 + +# Trades sorted by time: 0, 2, 4, ... +trade_times = list(range(0, N * 2, 2)) +prices = [100.0 + i * 0.5 for i in range(N)] + +# Quotes sorted by time, sparser: 0, 3, 6, ... 
+quote_times = list(range(0, N * 3, 3)) +bids = [99.0 + i * 0.5 for i in range(N)] + +trades = pd.DataFrame({"time": trade_times, "price": prices}) +quotes = pd.DataFrame({"time": quote_times, "bid": bids}) + +for _ in range(WARMUP): + pd.merge_asof(trades, quotes, on="time") + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.merge_asof(trades, quotes, on="time") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "merge_asof", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_na_ops.py b/benchmarks/pandas/bench_na_ops.py new file mode 100644 index 00000000..b7d0adf0 --- /dev/null +++ b/benchmarks/pandas/bench_na_ops.py @@ -0,0 +1,42 @@ +"""Benchmark: na_ops — isna / notna / ffill / bfill on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +data = pd.array([i if i % 5 != 0 else pd.NA for i in range(SIZE)], dtype="Int64") +s = pd.Series(data, dtype="float64") +s[np.arange(SIZE) % 5 == 0] = np.nan + +df = pd.DataFrame({ + "a": s, + "b": pd.Series([float(i * 2) if i % 7 != 0 else np.nan for i in range(SIZE)]), +}) + +for _ in range(WARMUP): + pd.isna(s) + pd.notna(s) + s.ffill() + s.bfill() + df.ffill() + df.bfill() + +start = time.perf_counter() +for _ in range(ITERATIONS): + pd.isna(s) + pd.notna(s) + s.ffill() + s.bfill() + df.ffill() + df.bfill() +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "na_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_notna_boolean.py b/benchmarks/pandas/bench_notna_boolean.py new file mode 100644 index 00000000..96c0a59d --- /dev/null +++ b/benchmarks/pandas/bench_notna_boolean.py @@ -0,0 +1,36 @@ +"""Benchmark: notna_boolean — boolean-mask indexing on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 
100_000
+WARMUP = 5
+ITERATIONS = 50
+
+s = pd.Series(np.arange(SIZE))
+mask = pd.Series(np.arange(SIZE) % 2 == 0)
+bool_arr = np.arange(SIZE) % 3 != 0
+
+df = pd.DataFrame({
+    "a": np.arange(SIZE),
+    "b": np.arange(SIZE) * 2,
+})
+
+for _ in range(WARMUP):
+    s[mask]
+    s[~mask]
+    df[bool_arr]
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s[mask]
+    s[~mask]
+    df[bool_arr]
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "notna_boolean",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_pow_mod.py b/benchmarks/pandas/bench_pow_mod.py
new file mode 100644
index 00000000..3458eb26
--- /dev/null
+++ b/benchmarks/pandas/bench_pow_mod.py
@@ -0,0 +1,34 @@
+"""Benchmark: Series.pow, Series.mod, DataFrame.pow on 100k rows"""
+import json, time
+import numpy as np
+import pandas as pd
+
+ROWS = 100_000
+WARMUP = 3
+ITERATIONS = 10
+
+data = (np.arange(ROWS) % 100) + 1
+s = pd.Series(data.astype(float))
+df = pd.DataFrame({
+    "a": ((np.arange(ROWS) % 100) + 1).astype(float),
+    "b": ((np.arange(ROWS) % 50) + 1).astype(float),
+})
+
+for _ in range(WARMUP):
+    s.pow(2)
+    s.mod(7)
+    df.pow(2)
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.pow(2)
+    s.mod(7)
+    df.pow(2)
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "pow_mod",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_read_html.py b/benchmarks/pandas/bench_read_html.py
new file mode 100644
index 00000000..03dd0199
--- /dev/null
+++ b/benchmarks/pandas/bench_read_html.py
@@ -0,0 +1,57 @@
+"""
+Benchmark: pd.read_html — parse HTML tables into DataFrames.
+Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...}
+"""
+import io
+import json
+import time
+import math
+
+try:
+    import pandas as pd
+except ImportError:
+    import subprocess, sys
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas", "--quiet"])
+    import pandas as pd
+
+try:
+    import lxml  # noqa: F401
+except ImportError:
+    import subprocess, sys
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "lxml", "--quiet"])
+
+ROWS = 1_000
+WARMUP = 3
+ITERATIONS = 20
+
+
+def build_html(rows: int) -> str:
+    """Return HTML for one table: a header row followed by `rows` data rows."""
+    header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>"
+    body_rows = [
+        f"<tr><td>{i}</td><td>item_{i % 100}</td><td>{i * 1.5:.2f}</td><td>{math.sin(i * 0.01):.6f}</td></tr>"
+        for i in range(rows)
+    ]
+    return f"<table>{header}{''.join(body_rows)}</table>"
+
+
+html = build_html(ROWS)
+
+# Literal HTML strings are deprecated as read_html input since pandas 2.1, so
+# every call gets a fresh StringIO (the buffer is consumed by each parse).
+
+# Warm-up
+for _ in range(WARMUP):
+    pd.read_html(io.StringIO(html))
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    pd.read_html(io.StringIO(html))
+total_ms = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "read_html",
+    "mean_ms": total_ms / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total_ms,
+}))
diff --git a/benchmarks/pandas/bench_reduce_ops.py b/benchmarks/pandas/bench_reduce_ops.py
new file mode 100644
index 00000000..2be36963
--- /dev/null
+++ b/benchmarks/pandas/bench_reduce_ops.py
@@ -0,0 +1,37 @@
+"""Benchmark: reduce_ops — nunique / any / all on Series and DataFrame of 100k rows."""
+import json, time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+s = pd.Series(np.arange(SIZE) % 1000)
+bool_s = pd.Series(np.arange(SIZE) > 0)
+df = pd.DataFrame({
+    "a": np.arange(SIZE) % 500,
+    "b": np.arange(SIZE) % 200,
+    "c": np.arange(SIZE) % 100,
+})
+
+for _ in range(WARMUP):
+    s.nunique()
+    bool_s.any()
+    bool_s.all()
+    df.nunique()
+
+start = time.perf_counter()
+for _ in range(ITERATIONS):
+    s.nunique()
+    bool_s.any()
+    bool_s.all()
+    df.nunique()
+total = (time.perf_counter() - start) * 1000
+
+print(json.dumps({
+    "function": "reduce_ops",
+    "mean_ms": total / ITERATIONS,
+    "iterations": ITERATIONS,
+    "total_ms": total,
+}))
diff --git a/benchmarks/pandas/bench_rename_ops.py b/benchmarks/pandas/bench_rename_ops.py
new file mode 100644
index 00000000..897f520b
--- /dev/null
+++ b/benchmarks/pandas/bench_rename_ops.py
@@ -0,0 +1,36 @@
+"""Benchmark: rename_ops — rename / add_prefix / add_suffix on Series/DataFrame of 100k rows."""
+import json, time
+import numpy as np
+import pandas as pd
+
+SIZE = 100_000
+WARMUP = 5
+ITERATIONS = 50
+
+s = pd.Series(np.arange(SIZE), index=[f"row_{i}" for i in range(SIZE)])
+df = pd.DataFrame({
+    "col_a": np.arange(SIZE),
+    "col_b": np.arange(SIZE) * 2,
+    "col_c": np.arange(SIZE) * 3,
+})
+
+for _ in range(WARMUP):
+
s.rename(lambda lbl: f"new_{lbl}") + df.rename(columns={"col_a": "a", "col_b": "b"}) + df.add_prefix("pre_") + df.add_suffix("_suf") + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rename(lambda lbl: f"new_{lbl}") + df.rename(columns={"col_a": "a", "col_b": "b"}) + df.add_prefix("pre_") + df.add_suffix("_suf") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "rename_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_shift_diff.py b/benchmarks/pandas/bench_shift_diff.py new file mode 100644 index 00000000..878d05c6 --- /dev/null +++ b/benchmarks/pandas/bench_shift_diff.py @@ -0,0 +1,28 @@ +"""Benchmark: Series.shift and Series.diff on 100k-element Series""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.arange(ROWS, dtype=float) * 1.5 +s = pd.Series(data) + +for _ in range(WARMUP): + s.shift(1) + s.diff(1) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.shift(1) + s.diff(1) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "shift_diff", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_sort_ops.py b/benchmarks/pandas/bench_sort_ops.py new file mode 100644 index 00000000..929558f3 --- /dev/null +++ b/benchmarks/pandas/bench_sort_ops.py @@ -0,0 +1,32 @@ +"""Benchmark: Series.sort_values and DataFrame.sort_values on 100k rows""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +data = np.sin(np.arange(ROWS, dtype=float)) * 1000 +s = pd.Series(data) +df = pd.DataFrame({ + "a": np.sin(np.arange(ROWS, dtype=float)) * 1000, + "b": np.cos(np.arange(ROWS, dtype=float)) * 500, +}) + +for _ in range(WARMUP): + s.sort_values() + df.sort_values("a") + +start = time.perf_counter() +for _ in 
range(ITERATIONS): + s.sort_values() + df.sort_values("a") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "sort_ops", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_to_json_denormalize.py b/benchmarks/pandas/bench_to_json_denormalize.py new file mode 100644 index 00000000..ae51decf --- /dev/null +++ b/benchmarks/pandas/bench_to_json_denormalize.py @@ -0,0 +1,41 @@ +"""Benchmark: to_json_denormalize — json orient variants on 10k-row DataFrame.""" +import json, time +import numpy as np +import pandas as pd + +ROWS = 10_000 +WARMUP = 5 +ITERATIONS = 30 + +# DataFrame matching the tsb benchmark (nested-structure-like columns) +df = pd.DataFrame({ + "name": [f"user_{i}" for i in range(ROWS)], + "address.city": [f"city_{i % 100}" for i in range(ROWS)], + "address.zip": [str(10000 + (i % 9000)) for i in range(ROWS)], + "score": np.arange(ROWS) * 0.01, +}) + +for _ in range(WARMUP): + # pandas equivalent of toJsonDenormalize: to_dict("records") then reconstruct nesting + recs = df.to_dict("records") + # pandas equivalent of toJsonRecords: orient="records" + df.to_json(orient="records") + # pandas equivalent of toJsonSplit: orient="split" + df.to_json(orient="split") + # pandas equivalent of toJsonIndex: orient="index" + df.to_json(orient="index") + +start = time.perf_counter() +for _ in range(ITERATIONS): + recs = df.to_dict("records") + df.to_json(orient="records") + df.to_json(orient="split") + df.to_json(orient="index") +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "to_json_denormalize", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_truncate_df.py b/benchmarks/pandas/bench_truncate_df.py new file mode 100644 index 00000000..4f8b0c2a --- /dev/null +++ b/benchmarks/pandas/bench_truncate_df.py @@ -0,0 +1,31 @@ +"""Benchmark: DataFrame.truncate — slice 
rows by before/after on 100k-row DataFrame""" +import json +import time +import pandas as pd +import numpy as np + +N = 100_000 +WARMUP = 3 +ITERATIONS = 10 + +index = list(range(N)) +df = pd.DataFrame({ + "a": np.arange(N, dtype=float), + "b": np.arange(N, dtype=float) * 2, + "c": np.arange(N, dtype=float) * 3, +}, index=index) + +for _ in range(WARMUP): + df.truncate(before=10_000, after=90_000) + +start = time.perf_counter() +for _ in range(ITERATIONS): + df.truncate(before=10_000, after=90_000) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "truncate_df", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_value_counts_full.py b/benchmarks/pandas/bench_value_counts_full.py new file mode 100644 index 00000000..284bb8ed --- /dev/null +++ b/benchmarks/pandas/bench_value_counts_full.py @@ -0,0 +1,28 @@ +"""Benchmark: value_counts_full — value_counts(bins=N) on Series of 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 5 +ITERATIONS = 50 + +rng = np.random.default_rng(42) +s = pd.Series(rng.random(SIZE) * 100) + +for _ in range(WARMUP): + s.value_counts(bins=10) + s.value_counts(bins=20) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.value_counts(bins=10) + s.value_counts(bins=20) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "value_counts_full", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/pandas/bench_window_extended.py b/benchmarks/pandas/bench_window_extended.py new file mode 100644 index 00000000..ddafc28a --- /dev/null +++ b/benchmarks/pandas/bench_window_extended.py @@ -0,0 +1,32 @@ +"""Benchmark: window_extended — rolling sem/skew/kurt/quantile on 100k rows.""" +import json, time +import numpy as np +import pandas as pd + +SIZE = 100_000 +WARMUP = 3 +ITERATIONS = 20 +WINDOW = 10 + +s 
= pd.Series(np.sin(np.arange(SIZE) / 100) * 100 + np.arange(SIZE) * 0.001) + +for _ in range(WARMUP): + s.rolling(WINDOW).sem() + s.rolling(WINDOW).skew() + s.rolling(WINDOW).kurt() + s.rolling(WINDOW).quantile(0.5) + +start = time.perf_counter() +for _ in range(ITERATIONS): + s.rolling(WINDOW).sem() + s.rolling(WINDOW).skew() + s.rolling(WINDOW).kurt() + s.rolling(WINDOW).quantile(0.5) +total = (time.perf_counter() - start) * 1000 + +print(json.dumps({ + "function": "window_extended", + "mean_ms": total / ITERATIONS, + "iterations": ITERATIONS, + "total_ms": total, +})) diff --git a/benchmarks/tsb/bench_at_iat.ts b/benchmarks/tsb/bench_at_iat.ts new file mode 100644 index 00000000..ed33ba07 --- /dev/null +++ b/benchmarks/tsb/bench_at_iat.ts @@ -0,0 +1,45 @@ +/** + * Benchmark: seriesAt, seriesIat, dataFrameAt, dataFrameIat — fast scalar access + * Outputs JSON: {"function": "at_iat", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, seriesAt, seriesIat, dataFrameAt, dataFrameIat } from "../../src/index.ts"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const labels = Array.from({ length: N }, (_, i) => `r${i}`); +const values = Array.from({ length: N }, (_, i) => i * 1.5); + +const s = new Series({ data: values, index: labels }); +const df = DataFrame.fromColumns( + { a: values, b: values.map((v) => v * 2) }, + { index: labels }, +); + +const midLabel = `r${Math.floor(N / 2)}`; + +for (let i = 0; i < WARMUP; i++) { + seriesAt(s, midLabel); + seriesIat(s, N / 2); + dataFrameAt(df, midLabel, "a"); + dataFrameIat(df, N / 2, 0); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesAt(s, midLabel); + seriesIat(s, N / 2); + dataFrameAt(df, midLabel, "a"); + dataFrameIat(df, N / 2, 0); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "at_iat", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git 
a/benchmarks/tsb/bench_cross_join.ts b/benchmarks/tsb/bench_cross_join.ts new file mode 100644 index 00000000..0bdf02fb --- /dev/null +++ b/benchmarks/tsb/bench_cross_join.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: crossJoin — Cartesian product of two 300-row DataFrames (90k result rows). + * Outputs JSON: {"function": "cross_join", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, crossJoin } from "../../src/index.ts"; + +const N = 300; +const WARMUP = 3; +const ITERATIONS = 10; + +// Distinct column names so no suffix needed +const left = DataFrame.fromColumns({ + id_a: Array.from({ length: N }, (_, i) => i), + val_a: Array.from({ length: N }, (_, i) => i * 1.5), +}); +const right = DataFrame.fromColumns({ + id_b: Array.from({ length: N }, (_, i) => i), + val_b: Array.from({ length: N }, (_, i) => i * 2.5), +}); + +for (let i = 0; i < WARMUP; i++) { + crossJoin(left, right); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + crossJoin(left, right); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cross_join", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_cut_bins_to_frame.ts b/benchmarks/tsb/bench_cut_bins_to_frame.ts new file mode 100644 index 00000000..135fcd91 --- /dev/null +++ b/benchmarks/tsb/bench_cut_bins_to_frame.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: cut_bins_to_frame — cutBinsToFrame / cutBinCounts / binEdges on 100k data points. 
+ * Outputs JSON: {"function": "cut_bins_to_frame", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { cut, cutBinsToFrame, cutBinCounts, binEdges } from "../../src/index.ts"; + +const SIZE = 100_000; +const NUM_BINS = 20; +const WARMUP = 5; +const ITERATIONS = 50; + +const data = Array.from({ length: SIZE }, (_, i) => (i % 1000) * 0.1); +const binResult = cut(data, NUM_BINS); + +for (let i = 0; i < WARMUP; i++) { + cutBinsToFrame(binResult, { data }); + cutBinCounts(binResult); + binEdges(binResult); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + cutBinsToFrame(binResult, { data }); + cutBinCounts(binResult); + binEdges(binResult); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "cut_bins_to_frame", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_filter_series.ts b/benchmarks/tsb/bench_filter_series.ts new file mode 100644 index 00000000..d1bdef87 --- /dev/null +++ b/benchmarks/tsb/bench_filter_series.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: filterSeries — filter Series index labels by items/like/regex + * Outputs JSON: {"function": "filter_series", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, filterSeries } from "../../src/index.ts"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Series with string labels: "label_0", "label_1", ..., "label_N-1" +const labels = Array.from({ length: N }, (_, i) => `label_${i}`); +const values = Array.from({ length: N }, (_, i) => i * 0.5); +const s = new Series({ data: values, index: labels }); + +// Pre-build a set of 1000 items to keep +const keepItems = Array.from({ length: 1_000 }, (_, i) => `label_${i * 100}`); + +for (let i = 0; i < WARMUP; i++) { + filterSeries(s, { items: keepItems }); + filterSeries(s, { like: "label_5" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + 
filterSeries(s, { items: keepItems }); + filterSeries(s, { like: "label_5" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "filter_series", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_join_all.ts b/benchmarks/tsb/bench_join_all.ts new file mode 100644 index 00000000..2dfb3358 --- /dev/null +++ b/benchmarks/tsb/bench_join_all.ts @@ -0,0 +1,36 @@ +/** + * Benchmark: joinAll — sequential left-join of 4 DataFrames each with 5k rows. + * Outputs JSON: {"function": "join_all", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, joinAll } from "../../src/index.ts"; + +const N = 5_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const idx = Array.from({ length: N }, (_, i) => String(i)); + +// Base DataFrame and three others — distinct column names, shared index +const base = DataFrame.fromColumns({ a: Array.from({ length: N }, (_, i) => i) }, { index: idx }); +const df1 = DataFrame.fromColumns({ b: Array.from({ length: N }, (_, i) => i * 2) }, { index: idx }); +const df2 = DataFrame.fromColumns({ c: Array.from({ length: N }, (_, i) => i * 3) }, { index: idx }); +const df3 = DataFrame.fromColumns({ d: Array.from({ length: N }, (_, i) => i * 4) }, { index: idx }); + +for (let i = 0; i < WARMUP; i++) { + joinAll(base, [df1, df2, df3]); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + joinAll(base, [df1, df2, df3]); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "join_all", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_math_ops.ts b/benchmarks/tsb/bench_math_ops.ts new file mode 100644 index 00000000..5559bde5 --- /dev/null +++ b/benchmarks/tsb/bench_math_ops.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: math_ops — absSeries / absDataFrame / roundSeries / roundDataFrame on 100k rows. 
+ * Outputs JSON: {"function": "math_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, absSeries, absDataFrame, roundSeries, roundDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => (i % 2 === 0 ? -(i + 0.567) : i + 0.567)) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => -(i + 0.123)), + b: Array.from({ length: SIZE }, (_, i) => i + 0.456), +}); + +for (let i = 0; i < WARMUP; i++) { + absSeries(s); + absDataFrame(df); + roundSeries(s, 1); + roundDataFrame(df, 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + absSeries(s); + absDataFrame(df); + roundSeries(s, 1); + roundDataFrame(df, 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "math_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_merge_asof.ts b/benchmarks/tsb/bench_merge_asof.ts new file mode 100644 index 00000000..9ef2a2b8 --- /dev/null +++ b/benchmarks/tsb/bench_merge_asof.ts @@ -0,0 +1,39 @@ +/** + * Benchmark: mergeAsof — backward asof join of two 10k-row sorted DataFrames. + * Outputs JSON: {"function": "merge_asof", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { DataFrame, mergeAsof } from "../../src/index.ts"; + +const N = 10_000; +const WARMUP = 3; +const ITERATIONS = 10; + +// Trades sorted by time: 0, 2, 4, ... +const tradeTimes = Array.from({ length: N }, (_, i) => i * 2); +const prices = Array.from({ length: N }, (_, i) => 100.0 + i * 0.5); + +// Quotes sorted by time, sparser: 0, 3, 6, ... 
+const quoteTimes = Array.from({ length: N }, (_, i) => i * 3); +const bids = Array.from({ length: N }, (_, i) => 99.0 + i * 0.5); + +const trades = DataFrame.fromColumns({ time: tradeTimes, price: prices }); +const quotes = DataFrame.fromColumns({ time: quoteTimes, bid: bids }); + +for (let i = 0; i < WARMUP; i++) { + mergeAsof(trades, quotes, { on: "time" }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + mergeAsof(trades, quotes, { on: "time" }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "merge_asof", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_na_ops.ts b/benchmarks/tsb/bench_na_ops.ts new file mode 100644 index 00000000..31990d0c --- /dev/null +++ b/benchmarks/tsb/bench_na_ops.ts @@ -0,0 +1,47 @@ +/** + * Benchmark: na_ops — isna / notna / ffillSeries / bfillSeries on 100k rows. + * Outputs JSON: {"function": "na_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, isna, notna, ffillSeries, bfillSeries, dataFrameFfill, dataFrameBfill } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const data: (number | null)[] = Array.from({ length: SIZE }, (_, i) => + i % 5 === 0 ? null : i, +); +const s = new Series({ data }); +const df = DataFrame.fromColumns({ + a: data, + b: Array.from({ length: SIZE }, (_, i) => (i % 7 === 0 ? 
null : i * 2)), +}); + +for (let i = 0; i < WARMUP; i++) { + isna(s); + notna(s); + ffillSeries(s); + bfillSeries(s); + dataFrameFfill(df); + dataFrameBfill(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + isna(s); + notna(s); + ffillSeries(s); + bfillSeries(s); + dataFrameFfill(df); + dataFrameBfill(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "na_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_notna_boolean.ts b/benchmarks/tsb/bench_notna_boolean.ts new file mode 100644 index 00000000..ecd113db --- /dev/null +++ b/benchmarks/tsb/bench_notna_boolean.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: notna_boolean — keepTrue / keepFalse / filterBy on 100k rows. + * Outputs JSON: {"function": "notna_boolean", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, keepTrue, keepFalse, filterBy } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i) }); +const mask = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 2 === 0) }); +const boolArr = Array.from({ length: SIZE }, (_, i) => i % 3 !== 0); + +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i), + b: Array.from({ length: SIZE }, (_, i) => i * 2), +}); + +for (let i = 0; i < WARMUP; i++) { + keepTrue(s, mask); + keepFalse(s, mask); + filterBy(df, boolArr); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + keepTrue(s, mask); + keepFalse(s, mask); + filterBy(df, boolArr); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "notna_boolean", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_pow_mod.ts b/benchmarks/tsb/bench_pow_mod.ts new 
file mode 100644 index 00000000..1873099c --- /dev/null +++ b/benchmarks/tsb/bench_pow_mod.ts @@ -0,0 +1,40 @@ +/** + * Benchmark: seriesPow, seriesMod, dataFramePow on 100k rows + */ +import { Series, DataFrame, seriesPow, seriesMod, dataFramePow } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => (i % 100) + 1); +const s = new Series({ data }); + +const dfData = { + a: Array.from({ length: ROWS }, (_, i) => (i % 100) + 1), + b: Array.from({ length: ROWS }, (_, i) => (i % 50) + 1), +}; +const df = new DataFrame(dfData); + +for (let i = 0; i < WARMUP; i++) { + seriesPow(s, 2); + seriesMod(s, 7); + dataFramePow(df, 2); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + seriesPow(s, 2); + seriesMod(s, 7); + dataFramePow(df, 2); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "pow_mod", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_read_html.ts b/benchmarks/tsb/bench_read_html.ts new file mode 100644 index 00000000..3cbc7149 --- /dev/null +++ b/benchmarks/tsb/bench_read_html.ts @@ -0,0 +1,43 @@ +/** + * Benchmark: readHtml — parse HTML tables into DataFrames. + * Outputs JSON: {"function": "read_html", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { readHtml } from "../../src/index.js"; + +const ROWS = 1_000; +const WARMUP = 3; +const ITERATIONS = 20; + +// Build an HTML string holding one <table>: a header row (id, name, value, score) followed by `rows` data rows. +function buildHtml(rows: number): string { + const header = "<tr><th>id</th><th>name</th><th>value</th><th>score</th></tr>"; + const bodyRows: string[] = []; + for (let i = 0; i < rows; i++) { + bodyRows.push( + `<tr><td>${i}</td><td>item_${i % 100}</td><td>${(i * 1.5).toFixed(2)}</td><td>${Math.sin(i * 0.01).toFixed(6)}</td></tr>`, + ); + } + return `<table><thead>${header}</thead><tbody>${bodyRows.join("")}</tbody></table>
`; +} + +const html = buildHtml(ROWS); + +// Warm-up +for (let i = 0; i < WARMUP; i++) { + readHtml(html); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + readHtml(html); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "read_html", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_reduce_ops.ts b/benchmarks/tsb/bench_reduce_ops.ts new file mode 100644 index 00000000..f2e524f7 --- /dev/null +++ b/benchmarks/tsb/bench_reduce_ops.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: reduce_ops — nuniqueSeries / anySeries / allSeries / nunique(df) on 100k rows. + * Outputs JSON: {"function": "reduce_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, nuniqueSeries, anySeries, allSeries, nunique } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i % 1000) }); +const boolSeries = new Series({ data: Array.from({ length: SIZE }, (_, i) => i > 0) }); +const df = DataFrame.fromColumns({ + a: Array.from({ length: SIZE }, (_, i) => i % 500), + b: Array.from({ length: SIZE }, (_, i) => i % 200), + c: Array.from({ length: SIZE }, (_, i) => i % 100), +}); + +for (let i = 0; i < WARMUP; i++) { + nuniqueSeries(s); + anySeries(boolSeries); + allSeries(boolSeries); + nunique(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + nuniqueSeries(s); + anySeries(boolSeries); + allSeries(boolSeries); + nunique(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "reduce_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_rename_ops.ts b/benchmarks/tsb/bench_rename_ops.ts new file mode 100644 index 00000000..9277e6e6 --- /dev/null +++ 
b/benchmarks/tsb/bench_rename_ops.ts @@ -0,0 +1,41 @@ +/** + * Benchmark: rename_ops — renameSeriesIndex / renameDataFrame / addPrefixDataFrame / addSuffixDataFrame on 100k rows. + * Outputs JSON: {"function": "rename_ops", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, renameSeriesIndex, renameDataFrame, addPrefixDataFrame, addSuffixDataFrame } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => i), index: Array.from({ length: SIZE }, (_, i) => `row_${i}`) }); +const df = DataFrame.fromColumns({ + col_a: Array.from({ length: SIZE }, (_, i) => i), + col_b: Array.from({ length: SIZE }, (_, i) => i * 2), + col_c: Array.from({ length: SIZE }, (_, i) => i * 3), +}); + +for (let i = 0; i < WARMUP; i++) { + renameSeriesIndex(s, (lbl) => `new_${String(lbl)}`); + renameDataFrame(df, { columns: { col_a: "a", col_b: "b" } }); + addPrefixDataFrame(df, "pre_"); + addSuffixDataFrame(df, "_suf"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + renameSeriesIndex(s, (lbl) => `new_${String(lbl)}`); + renameDataFrame(df, { columns: { col_a: "a", col_b: "b" } }); + addPrefixDataFrame(df, "pre_"); + addSuffixDataFrame(df, "_suf"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "rename_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_shift_diff.ts b/benchmarks/tsb/bench_shift_diff.ts new file mode 100644 index 00000000..49a8ae4a --- /dev/null +++ b/benchmarks/tsb/bench_shift_diff.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: shiftSeries and diffSeries on 100k-element Series + */ +import { Series, shiftSeries, diffSeries } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => i * 1.5); +const s = 
new Series({ data }); + +for (let i = 0; i < WARMUP; i++) { + shiftSeries(s, 1); + diffSeries(s, 1); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + shiftSeries(s, 1); + diffSeries(s, 1); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "shift_diff", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_sort_ops.ts b/benchmarks/tsb/bench_sort_ops.ts new file mode 100644 index 00000000..684f1b6e --- /dev/null +++ b/benchmarks/tsb/bench_sort_ops.ts @@ -0,0 +1,38 @@ +/** + * Benchmark: sortValuesSeries and sortValuesDataFrame on 100k rows + */ +import { Series, DataFrame, sortValuesSeries, sortValuesDataFrame } from "../../src/index.ts"; + +const ROWS = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const data = Array.from({ length: ROWS }, (_, i) => Math.sin(i) * 1000); +const s = new Series({ data }); + +const dfData = { + a: Array.from({ length: ROWS }, (_, i) => Math.sin(i) * 1000), + b: Array.from({ length: ROWS }, (_, i) => Math.cos(i) * 500), +}; +const df = new DataFrame(dfData); + +for (let i = 0; i < WARMUP; i++) { + sortValuesSeries(s); + sortValuesDataFrame(df, "a"); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + sortValuesSeries(s); + sortValuesDataFrame(df, "a"); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "sort_ops", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_to_json_denormalize.ts b/benchmarks/tsb/bench_to_json_denormalize.ts new file mode 100644 index 00000000..07a42f5f --- /dev/null +++ b/benchmarks/tsb/bench_to_json_denormalize.ts @@ -0,0 +1,42 @@ +/** + * Benchmark: to_json_denormalize — toJsonDenormalize / toJsonRecords / toJsonSplit / toJsonIndex + * Outputs JSON: {"function": "to_json_denormalize", "mean_ms": ..., "iterations": ..., 
"total_ms": ...} + */ +import { DataFrame, toJsonDenormalize, toJsonRecords, toJsonSplit, toJsonIndex } from "../../src/index.ts"; + +const ROWS = 10_000; +const WARMUP = 5; +const ITERATIONS = 30; + +// Create a nested-structure-like DataFrame (address.city, address.zip pattern) +const df = DataFrame.fromColumns({ + "name": Array.from({ length: ROWS }, (_, i) => `user_${i}`), + "address.city": Array.from({ length: ROWS }, (_, i) => `city_${i % 100}`), + "address.zip": Array.from({ length: ROWS }, (_, i) => `${10000 + (i % 9000)}`), + "score": Float64Array.from({ length: ROWS }, (_, i) => i * 0.01), +}); + +for (let i = 0; i < WARMUP; i++) { + toJsonDenormalize(df); + toJsonRecords(df); + toJsonSplit(df); + toJsonIndex(df); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + toJsonDenormalize(df); + toJsonRecords(df); + toJsonSplit(df); + toJsonIndex(df); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "to_json_denormalize", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_truncate_df.ts b/benchmarks/tsb/bench_truncate_df.ts new file mode 100644 index 00000000..f2661ce0 --- /dev/null +++ b/benchmarks/tsb/bench_truncate_df.ts @@ -0,0 +1,35 @@ +/** + * Benchmark: truncateDataFrame — slice rows by before/after labels on 100k-row DataFrame + * Outputs JSON: {"function": "truncate_df", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, DataFrame, truncateDataFrame } from "../../src/index.ts"; + +const N = 100_000; +const WARMUP = 3; +const ITERATIONS = 10; + +const index = Array.from({ length: N }, (_, i) => i); +const a = Array.from({ length: N }, (_, i) => i * 1.0); +const b = Array.from({ length: N }, (_, i) => i * 2.0); +const c = Array.from({ length: N }, (_, i) => i * 3.0); + +const df = DataFrame.fromColumns({ a, b, c }, { index }); + +for (let i = 0; i < WARMUP; i++) { + truncateDataFrame(df, 
10_000, 90_000); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + truncateDataFrame(df, 10_000, 90_000); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "truncate_df", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_value_counts_full.ts b/benchmarks/tsb/bench_value_counts_full.ts new file mode 100644 index 00000000..d55b5b72 --- /dev/null +++ b/benchmarks/tsb/bench_value_counts_full.ts @@ -0,0 +1,32 @@ +/** + * Benchmark: value_counts_full — valueCountsBinned on 100k rows. + * Outputs JSON: {"function": "value_counts_full", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, valueCountsBinned } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 5; +const ITERATIONS = 50; + +const s = new Series({ data: Array.from({ length: SIZE }, () => Math.random() * 100) }); + +for (let i = 0; i < WARMUP; i++) { + valueCountsBinned(s, { bins: 10 }); + valueCountsBinned(s, { bins: 20 }); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + valueCountsBinned(s, { bins: 10 }); + valueCountsBinned(s, { bins: 20 }); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "value_counts_full", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +); diff --git a/benchmarks/tsb/bench_window_extended.ts b/benchmarks/tsb/bench_window_extended.ts new file mode 100644 index 00000000..a4b933cb --- /dev/null +++ b/benchmarks/tsb/bench_window_extended.ts @@ -0,0 +1,37 @@ +/** + * Benchmark: window_extended — rollingSem / rollingSkew / rollingKurt / rollingQuantile on 100k rows. 
+ * Outputs JSON: {"function": "window_extended", "mean_ms": ..., "iterations": ..., "total_ms": ...} + */ +import { Series, rollingSem, rollingSkew, rollingKurt, rollingQuantile } from "../../src/index.ts"; + +const SIZE = 100_000; +const WARMUP = 3; +const ITERATIONS = 20; +const WINDOW = 10; + +const s = new Series({ data: Array.from({ length: SIZE }, (_, i) => Math.sin(i / 100) * 100 + i * 0.001) }); + +for (let i = 0; i < WARMUP; i++) { + rollingSem(s, WINDOW); + rollingSkew(s, WINDOW); + rollingKurt(s, WINDOW); + rollingQuantile(s, WINDOW, 0.5); +} + +const start = performance.now(); +for (let i = 0; i < ITERATIONS; i++) { + rollingSem(s, WINDOW); + rollingSkew(s, WINDOW); + rollingKurt(s, WINDOW); + rollingQuantile(s, WINDOW, 0.5); +} +const total = performance.now() - start; + +console.log( + JSON.stringify({ + function: "window_extended", + mean_ms: total / ITERATIONS, + iterations: ITERATIONS, + total_ms: total, + }), +);