3 changes: 3 additions & 0 deletions .gitignore
@@ -35,3 +35,6 @@ sdist/*
docs/html
docs/jupyter_execute
app.html

# Virtual environment
venv/
41 changes: 28 additions & 13 deletions src/simdec/decomposition.py
@@ -68,7 +68,7 @@ def decomposition(
output: pd.DataFrame,
*,
sensitivity_indices: np.ndarray,
dec_limit: float = 1,
dec_limit: float | None = None,
auto_ordering: bool = True,
states: list[int] | None = None,
statistic: Literal["mean", "median"] | None = "mean",
@@ -79,7 +79,7 @@ def decomposition(
----------
inputs : DataFrame of shape (n_runs, n_factors)
Input variables.
output : DataFrame of shape (n_runs, 1)
output : DataFrame of shape (n_runs, 1) or (n_runs,)
Target variable.
sensitivity_indices : ndarray of shape (n_factors, 1)
Sensitivity indices, combined effect of each input.
@@ -116,7 +116,7 @@
inputs[cat_col] = codes

inputs = inputs.to_numpy()
output = output.to_numpy()
output = output.to_numpy().flatten()

# 1. variables for decomposition
var_order = np.argsort(sensitivity_indices)[::-1]
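
The added .flatten() call turns the (n_runs, 1) column that DataFrame.to_numpy() returns into a flat (n_runs,) vector, presumably so the downstream statistics see one value per run rather than a 2-D column. A quick illustration with made-up data (the column name "y" is arbitrary):

import numpy as np
import pandas as pd

output = pd.DataFrame({"y": [1.0, 2.0, 3.0]})    # toy single-column target
as_column = output.to_numpy()                    # shape (3, 1), a 2-D column
as_vector = output.to_numpy().flatten()          # shape (3,), a flat vector

print(as_column.shape, as_vector.shape)          # (3, 1) (3,)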
@@ -125,26 +125,41 @@
sensitivity_indices = sensitivity_indices[var_order]

if auto_ordering:
n_var_dec = np.where(np.cumsum(sensitivity_indices) < dec_limit)[0].size
# handle edge case where sensitivity indices don't sum exactly to 1.0
if dec_limit is None:
dec_limit = 0.8 * np.sum(sensitivity_indices)

cumulative_sum = np.cumsum(sensitivity_indices)
indices_over_limit = np.where(cumulative_sum >= dec_limit)[0]

if indices_over_limit.size > 0:
n_var_dec = indices_over_limit[0] + 1
else:
n_var_dec = sensitivity_indices.size

n_var_dec = max(1, n_var_dec) # keep at least one variable
n_var_dec = min(5, n_var_dec) # use at most 5 variables
else:
n_var_dec = inputs.shape[1]

# 2. states formation
# 2. variable selection and reordering
if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]
else:
var_names = var_names[:n_var_dec].tolist()
inputs = inputs[:, :n_var_dec]

# 3. states formation (after reordering/selection)
if states is None:
states = 3 if n_var_dec < 3 else 2
states = 3 if n_var_dec <= 2 else 2
states = [states] * n_var_dec

for i in range(n_var_dec):
n_unique = np.unique(inputs[:, i]).size
states[i] = n_unique if n_unique <= 5 else states[i]

if auto_ordering:
var_names = var_names[var_order[:n_var_dec]].tolist()
inputs = inputs[:, var_order[:n_var_dec]]

# 3. decomposition
# 4. decomposition
bins = []

statistic_methods = {
@@ -153,8 +168,8 @@
}
try:
statistic_method = statistic_methods[statistic]
except IndexError:
msg = f"'statistic' must be one of {statistic_methods.values()}"
except KeyError:
msg = f"'statistic' must be one of {statistic_methods.keys()}"
raise ValueError(msg)

def statistic_(inputs):
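As a side note, the new variable-count selection above can be read in isolation. The sketch below is a hypothetical standalone version (the function name select_n_var_dec is invented here); it assumes sensitivity_indices is already sorted in descending order and mirrors the 0.8 default and the one-to-five clamp from the hunk above.

import numpy as np

def select_n_var_dec(sensitivity_indices, dec_limit=None):
    # Default limit: 80% of the total, so indices that do not sum to exactly 1.0
    # are still handled proportionally.
    if dec_limit is None:
        dec_limit = 0.8 * np.sum(sensitivity_indices)

    cumulative_sum = np.cumsum(sensitivity_indices)
    over_limit = np.where(cumulative_sum >= dec_limit)[0]

    # First position that reaches the limit (+1 for 0-based indexing),
    # or every variable if the limit is never reached.
    n_var_dec = over_limit[0] + 1 if over_limit.size > 0 else sensitivity_indices.size

    # Keep at least one and at most five variables.
    return int(min(5, max(1, n_var_dec)))

# cumsum = [0.57, 0.82, 0.94, 1.0]; 0.82 >= 0.8, so two variables are kept.
print(select_n_var_dec(np.array([0.57, 0.25, 0.12, 0.06])))  # 2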
88 changes: 51 additions & 37 deletions src/simdec/sensitivity_indices.py
@@ -1,5 +1,4 @@
from dataclasses import dataclass
import warnings

import numpy as np
import pandas as pd
@@ -61,7 +60,7 @@ def sensitivity_indices(
Sensitivity indices, combined effect of each input.
foe : ndarray of shape (n_factors, 1)
First-order effects (also called 'main' or 'individual').
soe : ndarray of shape (n_factors, 1)
soe_full : ndarray of shape (n_factors, 1)
Member: Why the renaming?

Author: I thought it could help future debugging to use the same variable name, soe_full, as in the MATLAB function.

Member: I would not do it.

Second-order effects (also called 'interaction').

Examples
@@ -96,15 +95,21 @@
array([0.43157591, 0.44241433, 0.11767249])

"""
# Handle inputs conversion
if isinstance(inputs, pd.DataFrame):
cat_columns = inputs.select_dtypes(["category", "O"]).columns
inputs[cat_columns] = inputs[cat_columns].apply(
lambda x: x.astype("category").cat.codes
)
inputs = inputs.to_numpy()
if isinstance(output, pd.DataFrame):

# Handle output conversion first, then flatten
if isinstance(output, (pd.DataFrame, pd.Series)):
output = output.to_numpy()

# Flatten output if it's (N, 1)
output = output.flatten()

n_runs, n_factors = inputs.shape
n_bins_foe, n_bins_soe = number_of_bins(n_runs, n_factors)
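
The hunk below relies on _weighted_var, which is not part of this diff. For readers following along, a plausible minimal version (a bin-count-weighted variance of the bin means, i.e. an estimate of Var(E[Y|X])) is sketched here; the helper actually defined in sensitivity_indices.py may differ in detail.

import numpy as np

def weighted_var(values, weights):
    # Hypothetical sketch: variance of the bin means, weighted by how many
    # samples fell into each bin.
    weighted_mean = np.average(values, weights=weights)
    return float(np.average((values - weighted_mean) ** 2, weights=weights))

# Two bins with means 1.0 and 3.0 holding 10 and 30 samples:
print(weighted_var(np.array([1.0, 3.0]), np.array([10, 30])))  # 0.75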

@@ -116,55 +121,64 @@
soe = np.zeros((n_factors, n_factors))

for i in range(n_factors):
# first order
# 1. First-order effects (FOE)
xi = inputs[:, i]

bin_avg, _, binnumber = stats.binned_statistic(
x=xi, values=output, bins=n_bins_foe
x=xi, values=output, bins=n_bins_foe, statistic="mean"
)
# can have NaN in the average but no corresponding binnumber
bin_avg = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]

# weighted variance and divide by the overall variance of the output
foe[i] = _weighted_var(bin_avg, weights=bin_counts) / var_y
# Filter empty bins and get weights (counts)
mask_foe = ~np.isnan(bin_avg)
mean_i_foe = bin_avg[mask_foe]
# binnumber starts at 1; 0 is for values outside range
bin_counts_foe = np.unique(binnumber[binnumber > 0], return_counts=True)[1]

# second order
foe[i] = _weighted_var(mean_i_foe, weights=bin_counts_foe) / var_y

# 2. Second-order effects (SOE)
for j in range(n_factors):
if i == j or j < i:
if j <= i:
continue

xj = inputs[:, j]

bin_avg, *edges, binnumber = stats.binned_statistic_2d(
# 2D Binned Statistic for Var(E[Y|Xi, Xj])
bin_avg_ij, x_edges, y_edges, binnumber_ij = stats.binned_statistic_2d(
x=xi, y=xj, values=output, bins=n_bins_soe, expand_binnumbers=False
)

mean_ij = bin_avg[~np.isnan(bin_avg)]
bin_counts = np.unique(binnumber, return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=bin_counts)

# expand_binnumbers here
nbin = np.array([len(edges_) + 1 for edges_ in edges])
binnumbers = np.asarray(np.unravel_index(binnumber, nbin))
mask_ij = ~np.isnan(bin_avg_ij)
mean_ij = bin_avg_ij[mask_ij]
counts_ij = np.unique(binnumber_ij[binnumber_ij > 0], return_counts=True)[1]
var_ij = _weighted_var(mean_ij, weights=counts_ij)

bin_counts_i = np.unique(binnumbers[0], return_counts=True)[1]
bin_counts_j = np.unique(binnumbers[1], return_counts=True)[1]

# handle NaNs
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
mean_i = np.nanmean(bin_avg, axis=1)
mean_i = mean_i[~np.isnan(mean_i)]
mean_j = np.nanmean(bin_avg, axis=0)
mean_j = mean_j[~np.isnan(mean_j)]

var_i = _weighted_var(mean_i, weights=bin_counts_i)
var_j = _weighted_var(mean_j, weights=bin_counts_j)
# Marginal Var(E[Y|Xi]) using n_bins_soe to match MATLAB logic
bin_avg_i_soe, _, binnumber_i_soe = stats.binned_statistic(
x=xi, values=output, bins=n_bins_soe, statistic="mean"
)
mask_i = ~np.isnan(bin_avg_i_soe)
counts_i = np.unique(
binnumber_i_soe[binnumber_i_soe > 0], return_counts=True
)[1]
var_i_soe = _weighted_var(bin_avg_i_soe[mask_i], weights=counts_i)

# Marginal Var(E[Y|Xj]) using n_bins_soe to match MATLAB logic
bin_avg_j_soe, _, binnumber_j_soe = stats.binned_statistic(
x=xj, values=output, bins=n_bins_soe, statistic="mean"
)
mask_j = ~np.isnan(bin_avg_j_soe)
counts_j = np.unique(
binnumber_j_soe[binnumber_j_soe > 0], return_counts=True
)[1]
var_j_soe = _weighted_var(bin_avg_j_soe[mask_j], weights=counts_j)

soe[i, j] = (var_ij - var_i - var_j) / var_y
soe[i, j] = (var_ij - var_i_soe - var_j_soe) / var_y

soe = np.where(soe == 0, soe.T, soe)
si[i] = foe[i] + soe[:, i].sum() / 2
# Mirror SOE and calculate Combined Effect (SI)
# SI is FOE + half of all interactions associated with that variable
soe_full = soe + soe.T
for k in range(n_factors):
si[k] = foe[k] + (soe_full[:, k].sum() / 2)

return SensitivityAnalysisResult(si, foe, soe)
return SensitivityAnalysisResult(si, foe, soe_full)
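
To make the final aggregation concrete: after mirroring the upper triangle, each combined index is the first-order effect plus half of all interactions involving that factor. A tiny numeric check with invented values (the loop over k in the diff is equivalent to the vectorized line below):

import numpy as np

foe = np.array([0.40, 0.30, 0.10])           # invented first-order effects
soe = np.array([                             # upper triangle only, as filled in the loop above
    [0.0, 0.08, 0.02],
    [0.0, 0.00, 0.04],
    [0.0, 0.00, 0.00],
])

soe_full = soe + soe.T                       # symmetric interaction matrix
si = foe + soe_full.sum(axis=0) / 2          # combined effect per factor

print(si)  # [0.45 0.36 0.13]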