3 changes: 3 additions & 0 deletions .gitignore
@@ -35,3 +35,6 @@ sdist/*
docs/html
docs/jupyter_execute
app.html

# Virtual environment
venv/
41 changes: 28 additions & 13 deletions src/simdec/decomposition.py
@@ -68,7 +68,7 @@ def decomposition(
output: pd.DataFrame,
*,
sensitivity_indices: np.ndarray,
dec_limit: float = 1,
dec_limit: float | None = None,
auto_ordering: bool = True,
states: list[int] | None = None,
statistic: Literal["mean", "median"] | None = "mean",
@@ -79,7 +79,7 @@ def decomposition(
----------
inputs : DataFrame of shape (n_runs, n_factors)
Input variables.
output : DataFrame of shape (n_runs, 1)
output : DataFrame of shape (n_runs, 1) or (n_runs,)
Target variable.
sensitivity_indices : ndarray of shape (n_factors, 1)
Sensitivity indices, combined effect of each input.
@@ -116,7 +116,7 @@
inputs[cat_col] = codes

inputs = inputs.to_numpy()
output = output.to_numpy()
output = output.to_numpy().flatten()

# 1. variables for decomposition
var_order = np.argsort(sensitivity_indices)[::-1]
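
The added .flatten() call turns the (n_runs, 1) column that DataFrame.to_numpy() returns into a flat (n_runs,) vector, presumably so the downstream statistics see one value per run rather than a 2-D column. A quick illustration with made-up data (the column name "y" is arbitrary):

import numpy as np
import pandas as pd

output = pd.DataFrame({"y": [1.0, 2.0, 3.0]})    # toy single-column target
as_column = output.to_numpy()                    # shape (3, 1), a 2-D column
as_vector = output.to_numpy().flatten()          # shape (3,), a flat vector

print(as_column.shape, as_vector.shape)          # (3, 1) (3,)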
@@ -125,26 +125,41 @@
sensitivity_indices = sensitivity_indices[var_order]

if auto_ordering:
n_var_dec = np.where(np.cumsum(sensitivity_indices) < dec_limit)[0].size
# handle edge case where sensitivity indices don't sum exactly to 1.0
if dec_limit is None:
dec_limit = 0.8 * np.sum(sensitivity_indices)

cumulative_sum = np.cumsum(sensitivity_indices)
indices_over_limit = np.where(cumulative_sum >= dec_limit)[0]

if indices_over_limit.size > 0:
n_var_dec = indices_over_limit[0] + 1
else:
n_var_dec = sensitivity_indices.size

n_var_dec = max(1, n_var_dec) # keep at least one variable
n_var_dec = min(5, n_var_dec) # use at most 5 variables
else:
n_var_dec = inputs.shape[1]

# 2. states formation
# 2. variable selection and reordering
if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]
else:
var_names = var_names[:n_var_dec].tolist()
inputs = inputs[:, :n_var_dec]

# 3. states formation (after reordering/selection)
if states is None:
states = 3 if n_var_dec < 3 else 2
states = 3 if n_var_dec <= 2 else 2
states = [states] * n_var_dec

for i in range(n_var_dec):
n_unique = np.unique(inputs[:, i]).size
states[i] = n_unique if n_unique <= 5 else states[i]

if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]

# 3. decomposition
# 4. decomposition
bins = []

statistic_methods = {
@@ -153,8 +168,8 @@
}
try:
statistic_method = statistic_methods[statistic]
except IndexError:
msg = f"'statistic' must be one of {statistic_methods.values()}"
except KeyError:
msg = f"'statistic' must be one of {statistic_methods.keys()}"
raise ValueError(msg)

def statistic_(inputs):
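As a side note, the new variable-count selection above can be read in isolation. The sketch below is a hypothetical standalone version (the function name select_n_var_dec is invented here); it assumes sensitivity_indices is already sorted in descending order and mirrors the 0.8 default and the one-to-five clamp from the hunk above.

import numpy as np

def select_n_var_dec(sensitivity_indices, dec_limit=None):
    # Default limit: 80% of the total, so indices that do not sum to exactly 1.0
    # are still handled proportionally.
    if dec_limit is None:
        dec_limit = 0.8 * np.sum(sensitivity_indices)

    cumulative_sum = np.cumsum(sensitivity_indices)
    over_limit = np.where(cumulative_sum >= dec_limit)[0]

    # First position that reaches the limit (+1 for 0-based indexing),
    # or every variable if the limit is never reached.
    n_var_dec = over_limit[0] + 1 if over_limit.size > 0 else sensitivity_indices.size

    # Keep at least one and at most five variables.
    return int(min(5, max(1, n_var_dec)))

# cumsum = [0.57, 0.82, 0.94, 1.0]; 0.82 >= 0.8, so two variables are kept.
print(select_n_var_dec(np.array([0.57, 0.25, 0.12, 0.06])))  # 2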
88 changes: 51 additions & 37 deletions src/simdec/sensitivity_indices.py
@@ -1,5 +1,4 @@
from dataclasses import dataclass
import warnings

import numpy as np
import pandas as pd
@@ -61,7 +60,7 @@ def sensitivity_indices(
Sensitivity indices, combined effect of each input.
foe : ndarray of shape (n_factors, 1)
First-order effects (also called 'main' or 'individual').
soe : ndarray of shape (n_factors, 1)
soe_full : ndarray of shape (n_factors, 1)
Member: Why the renaming?

Author: I thought it could help future debugging to use the same variable name, soe_full, as in the MATLAB function.

Member: I would not do it.

Second-order effects (also called 'interaction').

Examples
@@ -96,15 +95,21 @@
array([0.43157591, 0.44241433, 0.11767249])

"""
# Handle inputs conversion
if isinstance(inputs, pd.DataFrame):
cat_columns = inputs.select_dtypes(["category", "O"]).columns
inputs[cat_columns] = inputs[cat_columns].apply(
lambda x: x.astype("category").cat.codes
)
inputs = inputs.to_numpy()
if isinstance(output, pd.DataFrame):

# Handle output conversion first, then flatten
if isinstance(output, (pd.DataFrame, pd.Series)):
output = output.to_numpy()

# Flatten output if it's (N, 1)
output = output.flatten()

n_runs, n_factors = inputs.shape
n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)
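
The hunk below relies on _weighted_var, which is not part of this diff. For readers following along, a plausible minimal version (a bin-count-weighted variance of the bin means, i.e. an estimate of Var(E[Y|X])) is sketched here; the helper actually defined in sensitivity_indices.py may differ in detail.

import numpy as np

def weighted_var(values, weights):
    # Hypothetical sketch: variance of the bin means, weighted by how many
    # samples fell into each bin.
    weighted_mean = np.average(values, weights=weights)
    return float(np.average((values - weighted_mean) ** 2, weights=weights))

# Two bins with means 1.0 and 3.0 holding 10 and 30 samples:
print(weighted_var(np.array([1.0, 3.0]), np.array([10, 30])))  # 0.75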

@@ -116,55 +121,64 @@
soe = np.zeros((n_factors, n_factors))

for i in range(n_factors):
# first order
# 1. First-order effects (FOE)
xi = inputs[:, i]

bin_avg, _, binnumber = stats.binned_statistic(
x=xi, values=output, bins=n_bins_foe
x=xi, values=output, bins=n_bins_foe, statistic="mean"
)
# can have NaN in the average but no corresponding binnumber
bin_avg = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]

# weighted variance and divide by the overall variance of the output
foe[i] = _weighted_var(bin_avg, weights=bin_counts) / var_y
# Filter empty bins and get weights (counts)
mask_foe = ~np.isnan(bin_avg)
mean_i_foe = bin_avg[mask_foe]
# binnumber starts at 1; 0 is for values outside range
bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]

# second order
foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y

# 2. Second-order effects (SOE)
for j in range(n_factors):
if i == j or j < i:
if j <= i:
continue

xj = inputs[:, j]

bin_avg, *edges, binnumber = stats.binned_statistic_2d(
# 2D Binned Statistic for Var(E[Y|Xi, Xj])
bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
)

mean_ij = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=bin_counts)

# expand_binnumbers here
nbin = np.array([len(edges_) + 1 for edges_ in edges])
binnumbers = np.asarray(np.unravel_index(binnumber, nbin))
mask_ij = ~np.isnan(bin_avg_ij)
mean_ij = bin_avg_ij[mask_ij]
counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=counts_ij)

bin_counts_i = np.unique(binnumbers[0], return_counts=True)[1]
bin_counts_j = np.unique(binnumbers[1], return_counts=True)[1]

# handle NaNs
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
mean_i = np.nanmean(bin_avg, axis=1)
mean_i = mean_i[~np.isnan(mean_i)]
mean_j = np.nanmean(bin_avg, axis=0)
mean_j = mean_j[~np.isnan(mean_j)]

var_i = _weighted_var(mean_i, weights=bin_counts_i)
var_j = _weighted_var(mean_j, weights=bin_counts_j)
# Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
x=xi, values=output, bins=n_bins_soe, statistic="mean"
)
mask_i = ~np.isnan(bin_avg_i_soe)
counts_i = np.unique(
binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
)[1]
var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)

# Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
x=xj, values=output, bins=n_bins_soe, statistic="mean"
)
mask_j = ~np.isnan(bin_avg_j_soe)
counts_j = np.unique(
binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
)[1]
var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)

soe[i, j] = (var_ij - var_i - var_j) / var_y
soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y

soe = np.where(soe == 0, soe.T, soe)
si[i] = foe[i] + soe[:, i].sum() / 2
# Mirror SOE and calculate Combined Effect (SI)
# SI is FOE + half of all interactions associated with that variable
soe_full = soe + soe.T
for k in range(n_factors):
si[k] = foe[k] + (soe_full[:, k].sum() / 2)

return SensitivityAnalysisResult(si, foe, soe)
return SensitivityAnalysisResult(si, foe, soe_full)
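
To make the final aggregation concrete: after mirroring the upper triangle, each combined index is the first-order effect plus half of all interactions involving that factor. A tiny numeric check with invented values (the loop over k in the diff is equivalent to the vectorized line below):

import numpy as np

foe = np.array([0.40, 0.30, 0.10])           # invented first-order effects
soe = np.array([                             # upper triangle only, as filled in the loop above
    [0.0, 0.08, 0.02],
    [0.0, 0.00, 0.04],
    [0.0, 0.00, 0.00],
])

soe_full = soe + soe.T                       # symmetric interaction matrix
si = foe + soe_full.sum(axis=0) / 2          # combined effect per factor

print(si)  # [0.45 0.36 0.13]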