From bbe08a48d98a5380876853797b5561dda9a6e945 Mon Sep 17 00:00:00 2001
From: Yongyi Wu <yongyiwu@mu2ebuild02.fnal.gov>
Date: Sat, 11 Apr 2026 17:39:38 -0500
Subject: [PATCH 1/4] separated PR (Diffusion Model resampler for STM) from PR
 #1794

---
 MCDataProducts/inc/GenId.hh                   |    6 +-
 MachineLearningTools/CMakeLists.txt           |   11 +
 .../inc/ScoreBasedDiffusionModel.hh           |  321 +++++
 MachineLearningTools/src/SConscript           |   20 +
 .../src/ScoreBasedDiffusionModel.cc           | 1048 +++++++++++++++++
 STMMC/CMakeLists.txt                          |   39 +
 STMMC/inc/VDResamplerTransformDefaults.hh     |   13 +
 STMMC/src/SConscript                          |    2 +
 STMMC/src/VDResamplerConfigure_module.cc      |  271 +++++
 .../VDResamplerGenerateFromModel_module.cc    |  342 ++++++
 STMMC/src/VDResamplerGenerateMix_module.cc    |  500 ++++++++
 STMMC/src/VDResamplerTrain_module.cc          |  397 +++++++
 12 files changed, 2967 insertions(+), 3 deletions(-)
 create mode 100644 MachineLearningTools/CMakeLists.txt
 create mode 100644 MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
 create mode 100644 MachineLearningTools/src/SConscript
 create mode 100644 MachineLearningTools/src/ScoreBasedDiffusionModel.cc
 create mode 100644 STMMC/inc/VDResamplerTransformDefaults.hh
 create mode 100644 STMMC/src/VDResamplerConfigure_module.cc
 create mode 100644 STMMC/src/VDResamplerGenerateFromModel_module.cc
 create mode 100644 STMMC/src/VDResamplerGenerateMix_module.cc
 create mode 100644 STMMC/src/VDResamplerTrain_module.cc
diff --git a/MCDataProducts/inc/GenId.hh b/MCDataProducts/inc/GenId.hh
index 1c781150a6..d7feb7f715 100644
--- a/MCDataProducts/inc/GenId.hh
+++ b/MCDataProducts/inc/GenId.hh
@@ -44,8 +44,8 @@ namespace mu2e {
       cosmicCRY,  pbarFlat, fromAscii, ExternalRMC, InternalRMC, CeLeadingLog, cosmicCORSIKA, //44
       MuCapProtonGenTool, MuCapDeuteronGenTool, DIOGenTool, MuCapNeutronGenTool, // 48
       MuCapPhotonGenTool, MuCapGammaRayGenTool, CeLeadingLogGenTool, MuplusMichelGenTool,// 52
-      gammaPairProduction, antiproton, Mu2eXGenTool,//55
-      lastEnum //56
+      gammaPairProduction, antiproton, Mu2eXGenTool, STMDownStreamGenTool,//56
+      lastEnum //57
     };
 
 #ifndef SWIG
@@ -64,7 +64,7 @@ namespace mu2e {
       "CosmicCRY", "pbarFlat","fromAscii","ExternalRMC","InternalRMC","CeLeadingLog", "CosmicCORSIKA", \
     "MuCapProtonGenTool", "MuCapDeuteronGenTool", "DIOGenTool", "MuCapNeutronGenTool", \
       "MuCapPhotonGenTool", "MuCapGammaRayGenTool","CeLeadingLogGenTool","MuplusMichelGenTool", \
-      "gammaPairProduction", "antiproton", "Mu2eXGenTool"
+      "gammaPairProduction", "antiproton", "Mu2eXGenTool", "STMDownStreamGenTool"
 #endif
 
   public:
diff --git a/MachineLearningTools/CMakeLists.txt b/MachineLearningTools/CMakeLists.txt
new file mode 100644
index 0000000000..b837296aff
--- /dev/null
+++ b/MachineLearningTools/CMakeLists.txt
@@ -0,0 +1,11 @@
+cet_make_library(
+    SOURCE
+      src/ScoreBasedDiffusionModel.cc
+    LIBRARIES PUBLIC
+      CLHEP::CLHEP
+      messagefacility::messagefacility
+      cetlib_except::cetlib_except
+)
+
+install_source(SUBDIRS src)
+install_headers(USE_PROJECT_NAME SUBDIRS inc)
diff --git a/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh b/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
new file mode 100644
index 0000000000..2bf4c7c697
--- /dev/null
+++ b/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
@@ -0,0 +1,321 @@
+// Module for training and using score-based diffusion model
+// Added by Yongyi Wu
+// Mar. 2026
+
+#pragma once
+
+#include <vector>
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <iomanip>
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <cctype>
+#include <cassert>
+
+#include "CLHEP/Random/RandomEngine.h"
+#include "CLHEP/Random/RandFlat.h"
+#include "CLHEP/Random/RandGaussQ.h"
+
+#include "messagefacility/MessageLogger/MessageLogger.h"
+#include "cetlib_except/exception.h"
+
+namespace mu2e{
+    struct DiffusionTrainingSample {
+        std::vector<double> x;    // transformed state vector to diffuse (size = dim)
+        std::vector<double> cond; // optional conditioning vector (size = conditionDim)
+    };
+
+    class ScoreBasedDiffusionModel {
+    public:
+        // Enumeration for optimizer selection
+        enum class OptimizerType {
+            SGD,   // Stochastic Gradient Descent
+            ADAM   // Adam optimizer
+        };
+
+        // Enumeration for noise schedule selection
+        enum class NoiseScheduleType {
+            LINEAR,  // Linear noise schedule
+            COSINE   // Cosine noise schedule
+        };
+
+        // Constructor: Initialize diffusion model with CLHEP random distributions.
+        //
+        // Parameters:
+        //   randFlat              - Reference to CLHEP RandFlat for uniform sampling (externally managed)
+        //   randGaussQ            - Reference to CLHEP RandGaussQ for Gaussian noise (externally managed)
+        //   dim                   - Dimensionality of the state space
+        //   conditionDim          - Dimensionality of the optional conditioning vector (default: 0 for unconditional model)
+        //   hidden                - Size of hidden layers in the neural network
+        //   layers                - Number of layers in the network
+        //   optimizerType         - Type of optimizer to use (SGD or ADAM, default: ADAM)
+        //   adamBeta1             - Adam optimizer beta1 parameter (default: 0.9)
+        //   adamBeta2             - Adam optimizer beta2 parameter (default: 0.999)
+        //   adamEps               - Adam optimizer epsilon parameter (default: 1e-8)
+        //   scheduleType          - Type of noise schedule (LINEAR or COSINE, default: COSINE)
+        //   betaMin               - Minimum noise schedule parameter (for LINEAR schedule, default: 1e-4)
+        //   betaMax               - Maximum noise schedule parameter (for LINEAR schedule, default: 0.02)
+        //   cosineOffset          - Offset parameter (for cosine schedule, default: 0.008)
+        //   batchSize             - Batch size for training (default: 32)
+        //   gradientClipThreshold - Threshold for gradient clipping (default: 1.0)
+        //   learningRate          - Learning rate for training (default: 1e-3)
+        //   diffusionSteps        - Number of steps in the diffusion process (default: 200)
+        ScoreBasedDiffusionModel(
+            // Network architecture parameters
+            CLHEP::RandFlat& randFlat,
+            CLHEP::RandGaussQ& randGaussQ,
+            int dim,
+            int conditionDim,
+            int hidden,
+            int layers,
+            // Optimizer configuration
+            OptimizerType optimizerType = OptimizerType::ADAM,
+            double adamBeta1 = 0.9,
+            double adamBeta2 = 0.999,
+            double adamEps = 1e-8,
+            // Noise schedule configuration
+            NoiseScheduleType scheduleType = NoiseScheduleType::COSINE,
+            double betaMin = 1e-4,
+            double betaMax = 0.02,
+            double cosineOffset = 0.008,
+            // Training configuration
+            int batchSize = 32,
+            double gradientClipThreshold = 1.0,
+            double learningRate = 1e-3,
+            // Diffusion process configuration
+            int diffusionSteps = 200
+        );
+
+        // Train the score network on a batch of samples.
+        // Uses random sampling and noise injection via the external engine.
+        // Note that training needs to occur on all data samples. Training on multiple small subsets
+        // of data and then averaging or aggregating the model parameters is not supported and may lead to
+        // issues as neural networks are not linear.
+        //
+        // Parameters:
+        //   data         - Training samples (transformed state vectors)
+        //   epochs       - Number of training epochs to perform
+        void train(
+            const std::vector<DiffusionTrainingSample>& data,
+            int epochs
+        );
+
+        // Generate a new sample from the diffusion model via reverse process.
+        // Uses the external random engine for noise generation during sampling.
+        //
+        // Parameters:
+        //   condition       - Optional conditioning vector (must match conditionDim_ when enabled)
+        //   useHeun         - If true, uses Heun's method (2nd order, default). If false, uses Euler's method (1st order)
+        //   diffusionSteps  - Number of diffusion steps for sampling (default: -1 uses the model's configured diffusionSteps_)
+        //
+        // Returns: A generated sample vector of dimension dim_
+        std::vector<double> generateSample(
+            const std::vector<double>& condition = {},
+            bool useHeun = true,
+            int diffusionSteps = -1
+        );
+
+        // Save the model parameters to a CSV file with annotations for later use.
+        // Uses a default filename of "DiffusionModel.csv" if not specified.
+        //
+        // Parameters:
+        //   filename - Path to the CSV file where model parameters will be saved (default: "DiffusionModel.csv")
+        void saveModel(const std::string& filename = "DiffusionModel.csv");
+
+        // Load model parameters from a file to restore a previously trained model.
+        // Note that as the adam optimizer state is not saved, it is not possible to resume training from a loaded
+        // model and pick up the training process where it left off. The loaded model can only be used for sampling.
+        //
+        // Parameters:
+        //   randFlat / RandGaussQ - CLHEP random number generator wrappers being passed
+        //   filename              - Path to the file from which model parameters will be loaded
+        static ScoreBasedDiffusionModel loadModel(
+            CLHEP::RandFlat& randFlat,
+            CLHEP::RandGaussQ& randGaussQ,
+            const std::string& filename
+        );
+
+    private:
+
+        // ----- network -----
+
+        // Represents a single fully-connected layer with weights and biases.
+        struct Layer {
+            std::vector<std::vector<double>> W; // Weight matrix [output_size][input_size]
+            std::vector<double> b;              // Bias vector [output_size]
+
+            // storage for gradients during back-propagation
+            std::vector<std::vector<double>> gradW; // gradient of loss w.r.t. weights
+            std::vector<double> gradb;              // gradient of loss w.r.t. biases
+
+            // Adam optimizer state buffers
+            std::vector<std::vector<double>> mW; // First moment estimates for weights, m_t = beta1 m_{t-1} + (1-beta1) g_t
+            std::vector<std::vector<double>> vW; // Second moment estimates for weights, v_t = beta2 v_{t-1} + (1-beta2) g_t^2
+
+            std::vector<double> mb; // First moment estimates for biases
+            std::vector<double> vb; // Second moment estimates for biases
+        };
+
+        std::vector<Layer> network_; // Network layers in forward order
+
+        // Storage for activations and pre-activations during forward pass (used in back-propagation)
+        // During forward pass, preactivations_[l] = W[l] * activations_[l-1] + b[l],
+        //                     and activations_[l] = activationFunction(preactivations_[l])
+        // These are needed to compute gradients during the backward pass.
+        std::vector<std::vector<double>> activations_;
+        std::vector<std::vector<double>> preactivations_;
+
+        // Forward pass through the network.
+        // Computes the score function.
+        // input actually contains the state vector, optional condition vector, and the time embedding.
+        // dimension hence is dim_ + conditionDim_ + 1.
+        //
+        // Parameters:
+        //   x - Input vector of dimension dim_ + conditionDim_ + 1
+        //
+        // Returns: Output vector (dimension depends on network architecture)
+        std::vector<double> forward(
+            const std::vector<double>& x
+        );
+
+        // Backward pass for gradient computation.
+        // Computes gradients w.r.t. network parameters via chain rule.
+        //
+        // Parameters:
+        //   gradOutput - Gradient of loss w.r.t. network output
+        //
+        // Returns: Gradient of loss w.r.t. network parameters (gradW and gradb for each layer)
+        void backward(
+            const std::vector<double>& gradOutput
+        );
+
+        // Update network weights using computed gradients (Stochastic Gradient Descent SGD).
+        // Applied after backward pass. Alternately, an Adam optimizer step can be implemented
+        // in adamUpdate() for better convergence.
+        //
+        // Parameters:
+        //   lr - Learning rate for gradient descent step
+        void updateWeights(double lr);
+
+        // Update network weights using Adam optimization algorithm.
+        // This method would implement the Adam update rule using the stored gradients and moment estimates.
+        //
+        // Parameters:
+        //   lr - Learning rate for Adam update
+        void adamUpdate(double lr);
+
+        // ----- diffusion -----
+
+        // Noise schedule parameter beta(t) over diffusion time [0,1].
+        // Linear interpolation between betaMin_ and betaMax_.
+        //
+        // Parameters:
+        //   t - Diffusion time parameter in [0,1]
+        //
+        // Returns: Beta value for the given time step
+        double beta(double t) const;
+
+        // Standard deviation of noise at diffusion time t.
+        // Related to the noise schedule via sigma(t) = sqrt(1 - exp(-integral(beta(s) ds))).
+        //
+        // Parameters:
+        //   t - Diffusion time parameter in [0,1]
+        //
+        // Returns: Noise standard deviation at time t
+        double sigma(double t) const;
+
+        // Add Gaussian noise to a state vector at diffusion time t.
+        // Uses external engine (randGaussQ_) for reproducible noise generation.
+        // Noise sample is stored in eps for later use in training.
+        //
+        // Parameters:
+        //   x   - Original state vector
+        //   t   - Diffusion time parameter in [0,1]
+        //   eps - Output: Gaussian noise vector used for perturbation (size = dim_)
+        //
+        // Returns: Noisy state = x + sigma(t) * eps
+        std::vector<double> addNoise(
+            const std::vector<double>& x,
+            double t,
+            std::vector<double>& eps
+        );
+
+        // ----- loss -----
+
+        // Compute Mean Squared Error between predicted score and target score.
+        // Used during training to measure model performance.
+        //
+        // Parameters:
+        //   score  - Model's predicted score vector
+        //   target - Ground truth target score vector
+        //
+        // Returns: MSE loss value (scalar)
+        double computeLoss(
+            const std::vector<double>& score,
+            const std::vector<double>& target
+        ) const;
+
+        // Clip gradients to prevent exploding gradients during training.
+        //
+        // Parameters:
+        //   maxNorm - Maximum allowed norm for the gradients. If the total norm exceeds this threshold,
+        //             gradients are scaled down to have norm equal to maxNorm.
+        void clipGradients(double maxNorm);
+
+        // ----- internal vars -----
+
+        // The random engine is NOT owned by this class. It is injected externally
+        // by the framework. Below are CLHEP distribution wrappers for actual random number generation.
+        // These wrap the engine_ and provide specific probability distributions.
+        // - RandFlat:   Uniform distribution on [0,1)
+        // - RandGaussQ: Gaussian (normal) distribution with mean=0, sigma=1 (or custom)
+        // Both are bound in the constructor to externally managed wrappers.
+        CLHEP::RandFlat& randFlat_;       // Used for uniform sampling (e.g., batch selection)
+        CLHEP::RandGaussQ& randGaussQ_;   // Used for Gaussian noise in diffusion process
+
+        // Model hyperparameters
+        int dim_;           // Dimensionality of state space
+        int conditionDim_;  // Dimensionality of the optional conditioning vector
+        int hidden_;        // Hidden layer size
+        int layers_;        // Number of network layers
+
+        // Optimizer configuration
+        OptimizerType optimizerType_;  // Type of optimizer to use (SGD or ADAM)
+
+        // Adam optimizer parameters
+        double adamBeta1_;  // First moment exponential decay rate (default: 0.9)
+        double adamBeta2_;  // Second moment exponential decay rate (default: 0.999)
+        double adamEps_;    // Small constant for numerical stability (default: 1e-8)
+
+        // Noise schedule configuration
+        NoiseScheduleType noiseScheduleType_;  // Type of noise schedule (LINEAR or COSINE)
+
+        // Linear noise schedule parameters (beta(t) = betaMin + t*(betaMax - betaMin))
+        // default betaMin = 1e-4, betaMax = 0.02 are typical values used in diffusion models, but can be tuned for specific applications.
+        double betaMin_;    // Beta value at t=0
+        double betaMax_;    // Beta value at t=1
+
+        // Cosine noise schedule parameter (offset to avoid singularity at t=0 in cosine schedule, default: 0.008)
+        double cosineOffset_;
+
+        // Training configuration
+        int batchSize_;  // Batch size for vectorized training (default: 32)
+        double gradientClipThreshold_;  // Gradient clipping threshold (default: 1.0)
+        double learningRate_; // Learning rate for training (default: 1e-3)
+
+        // Diffusion process discretization
+        int diffusionSteps_;  // Number of time steps to generate a sample (default: 200)
+
+        // Training state
+        double runningLoss_;  // Accumulated loss for monitoring during training
+        int adamStep_; // Step counter for Adam optimizer (used to compute bias-corrected moment estimates)
+        int trainingSampleSize_; // Total number of training samples
+
+        // Container for tracking training loss over epochs
+        std::vector<double> epochLosses_;
+
+    };
+}
diff --git a/MachineLearningTools/src/SConscript b/MachineLearningTools/src/SConscript
new file mode 100644
index 0000000000..f74c4ad579
--- /dev/null
+++ b/MachineLearningTools/src/SConscript
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+#
+# Script to build the files found in this directory.
+#
+
+import os
+Import('env')
+Import('mu2e_helper')
+
+helper=mu2e_helper(env)
+
+mainlib = helper.make_mainlib ( [ 'CLHEP',
+                                  'cetlib',
+                                  'cetlib_except',
+                                  'MF_MessageLogger' ] )
+
+# This tells emacs to view this file in python mode.
+# Local Variables:
+# mode:python
+# End:
diff --git a/MachineLearningTools/src/ScoreBasedDiffusionModel.cc b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
new file mode 100644
index 0000000000..5cf8b2dc34
--- /dev/null
+++ b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
@@ -0,0 +1,1048 @@
+#include "Offline/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh"
+
+namespace mu2e {
+
+    // SiLU activation function (Sigmoid Linear Unit)
+    // Defined as: silu(x) = x / (1 + exp(-x))
+    static double silu(double x) {
+        return x / (1.0 + std::exp(-x));
+    }
+
+    // Derivative of SiLU activation function, needed for back-propagation.
+    // Defined as: silu'(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x)))
+    static double siluDeriv(double x) {
+        double s = 1.0 / (1.0 + std::exp(-x));
+        return s * (1.0 + x * (1.0 - s));
+    }
+
+    // Linear interpolation of beta(t) between betaMin_ and betaMax_.
+    // This is only used in the linear noise schedule.
+    double ScoreBasedDiffusionModel::beta(double t) const {
+        return betaMin_ + t * (betaMax_ - betaMin_);
+    }
+
+    // Standard deviation of noise at diffusion time t.
+    // For the linear schedule, use simply sqrt(beta(t)).
+    // For the cosine schedule, this is derived from the cumulative noise schedule.
+    double ScoreBasedDiffusionModel::sigma(double t) const {
+        if (noiseScheduleType_ == NoiseScheduleType::COSINE) {
+            // Cosine noise schedule
+            double f = (t + cosineOffset_) / (1.0 + cosineOffset_);
+            double alpha_bar = std::cos(f * M_PI * 0.5);
+            alpha_bar *= alpha_bar;
+            return std::sqrt(1.0 - alpha_bar);
+        } else {
+            // Linear noise schedule (default)
+            return std::sqrt(beta(t));
+        }
+    }
+
+
+    ScoreBasedDiffusionModel::ScoreBasedDiffusionModel(
+        CLHEP::RandFlat& randFlat,
+        CLHEP::RandGaussQ& randGaussQ,
+        int dim,
+        int conditionDim,
+        int hidden,
+        int layers,
+        OptimizerType optimizerType,
+        double adamBeta1,
+        double adamBeta2,
+        double adamEps,
+        NoiseScheduleType scheduleType,
+        double betaMin,
+        double betaMax,
+        double cosineOffset,
+        int batchSize,
+        double gradientClipThreshold,
+        double learningRate,
+        int diffusionSteps
+    ) : randFlat_(randFlat), randGaussQ_(randGaussQ),
+        dim_(dim), conditionDim_(conditionDim), hidden_(hidden), layers_(layers),
+        optimizerType_(optimizerType),
+        adamBeta1_(adamBeta1), adamBeta2_(adamBeta2), adamEps_(adamEps),
+        noiseScheduleType_(scheduleType),
+        betaMin_(betaMin), betaMax_(betaMax), cosineOffset_(cosineOffset),
+        batchSize_(batchSize), gradientClipThreshold_(gradientClipThreshold), learningRate_(learningRate),
+        diffusionSteps_(diffusionSteps),
+        runningLoss_(0.0), adamStep_(0), trainingSampleSize_(0), epochLosses_() {
+
+        // Validate model dimensions and parameters
+        if (dim <= 0 || conditionDim < 0 || hidden <= 0 || layers <= 0) {
+            throw cet::exception("ScoreBasedDiffusionModel::initialization") << "Invalid model dimensions";
+        }
+        if (batchSize <= 0) {
+            throw cet::exception("ScoreBasedDiffusionModel::initialization") << "Invalid batchSize";
+        }
+        if (diffusionSteps <= 0) {
+            throw cet::exception("ScoreBasedDiffusionModel::initialization") << "Invalid diffusionSteps";
+        }
+
+        // ------------------------------------------------------------
+        // Network architecture
+        //
+        // Input dimension = dim_ + conditionDim_ + 1
+        // (+1 because diffusion time t is appended to the input vector)
+        // ------------------------------------------------------------
+
+        int inputSize = dim_ + conditionDim_ + 1;
+        int in = inputSize;
+
+        // Weight initialization scale (local constant so it can be tuned easily)
+        // const double weightInitScale = 0.02; // not scalable with size, can lead to instability for larger models and too slow training for smaller models
+        // const double weightInitScale = std::sqrt(2.0 / in); // He initialization for ReLU activations, SiLU is similar to ReLU in terms of variance preservation
+        double reducedHe = 0.5 * std::sqrt(2.0 / in); // Scaled He initialization found to improve stability
+        const double weightInitScale = std::min(reducedHe, 0.3); // Cap the weight initialization scale to prevent instability for very small input sizes
+
+        for (int l = 0; l < layers_; ++l) {
+
+            // Last layer outputs the score vector (dimension = dim_)
+            int out = (l == layers_ - 1) ? dim_ : hidden_;
+
+            Layer layer;
+
+            // Allocate weights
+            layer.W.resize(out, std::vector<double>(in));
+
+            // Allocate biases
+            layer.b.resize(out);
+
+            // Allocate gradient buffers
+            layer.gradW.resize(out, std::vector<double>(in, 0.0));
+            layer.gradb.resize(out, 0.0);
+
+            // Allocate Adam buffers
+            layer.mW.resize(out, std::vector<double>(in, 0.0));
+            layer.vW.resize(out, std::vector<double>(in, 0.0));
+
+            layer.mb.resize(out, 0.0);
+            layer.vb.resize(out, 0.0);
+
+            // --------------------------------------------------------
+            // Weight initialization
+            //
+            // Small Gaussian initialization improves training stability
+            // --------------------------------------------------------
+
+            for (int i = 0; i < out; ++i) {
+                for (int j = 0; j < in; ++j) {
+                    layer.W[i][j] = weightInitScale * randGaussQ_.fire();
+                    // this generates a Gaussian random number with mean=0 and sigma=weightInitScale
+                }
+                layer.b[i] = 0.0;
+            }
+
+            if (layer.W.empty() || layer.W[0].size() != static_cast<size_t>(in) || //Check that weight matrix has correct input dimension
+                layer.W.size() != static_cast<size_t>(out) ||                      //Check that weight matrix has correct output dimension
+                layer.b.size() != static_cast<size_t>(out)) {                      //Check that bias vector has correct dimension
+                throw cet::exception("ScoreBasedDiffusionModel::initialization")
+                    << "Layer shape initialization mismatch";
+            }
+
+            network_.push_back(layer);
+
+            // Output becomes next layer input
+            in = out;
+        }
+
+        // Print layer and model configuration
+        std::ostringstream oss;
+        oss << "ScoreBasedDiffusionModel initialized\n"
+            << "Model configuration:\n"
+            << "  Network architecture:\n"
+            << "    | dim=" << dim_ << "\n"
+            << "    | conditionDim=" << conditionDim_ << "\n"
+            << "    | hidden=" << hidden_ << "\n"
+            << "    | layers=" << layers_ << "\n"
+            << "  Optimizer configuration:\n"
+            << "    | Optimizer=" << (optimizerType_ == OptimizerType::ADAM ? "Adam" : "SGD") << "\n";
+        if (optimizerType_ == OptimizerType::ADAM) {
+            oss << "      |- AdamBeta1=" << adamBeta1_ << "\n"
+                << "      |- AdamBeta2=" << adamBeta2_ << "\n"
+                << "      |- AdamEps=" << adamEps_ << "\n";
+        }
+        oss << "  Noise schedule configuration:\n"
+            << "    | NoiseSchedule=" << (noiseScheduleType_ == NoiseScheduleType::COSINE ? "Cosine" : "Linear") << "\n";
+        if (noiseScheduleType_ == NoiseScheduleType::COSINE) {
+            oss << "      |- CosineOffset=" << cosineOffset_ << "\n";
+        } else {
+            oss << "      |- BetaMin=" << betaMin_ << "\n"
+                << "      |- BetaMax=" << betaMax_ << "\n";
+        }
+        oss << "  Training configuration:\n"
+            << "    | BatchSize=" << batchSize_ << "\n"
+            << "    | GradientClipThreshold=" << gradientClipThreshold_ << "\n"
+            << "    | LearningRate=" << learningRate_ << "\n"
+            << "  Diffusion process configuration:\n"
+            << "    | DiffusionSteps=" << diffusionSteps_ << "\n";
+        mf::LogInfo("ScoreBasedDiffusionModel::initialize") << oss.str();
+
+        // Reserve space for forward-pass caches
+        activations_.reserve(layers_ + 1);
+        preactivations_.reserve(layers_);
+    }
+
+    // forward pass to compute the score function given input (state + time embedding)
+    std::vector<double> ScoreBasedDiffusionModel::forward(
+        const std::vector<double>& input
+    )
+    {
+        if (network_.empty() || network_[0].W.empty() || network_[0].W[0].empty()) {
+            throw cet::exception("ScoreBasedDiffusionModel::forward")
+                << "Network is not properly initialized";
+        }
+        if (input.size() != network_[0].W[0].size()) {
+            throw cet::exception("ScoreBasedDiffusionModel::forward")
+                << "Input dimension mismatch: got " << input.size()
+                << ", expected " << network_[0].W[0].size();
+        }
+
+        activations_.clear();
+        preactivations_.clear();
+
+        // Input vector contains both the state and the time embedding (e.g., concatenated together).
+        std::vector<double> x = input;
+
+        // Cache the input as the activation of layer 0 for use in back-propagation.
+        activations_.push_back(x);
+
+        // Forward pass through the network layers
+        for (size_t l = 0; l < network_.size(); ++l)
+        {
+            // Compute pre-activation z = W*x + b for the current layer
+            auto& layer = network_[l];
+            std::vector<double> z(layer.W.size());
+            for (size_t i = 0; i < layer.W.size(); ++i)
+            {
+                double v = layer.b[i];
+                for (size_t j = 0; j < layer.W[i].size(); ++j)
+                {
+                    v += layer.W[i][j]*x[j];
+                }
+                z[i] = v;
+            }
+
+            // Cache pre-activation for back-propagation
+            preactivations_.push_back(z);
+
+            // Apply activation function (SiLU) to get the output of the current layer.
+            std::vector<double> y(z.size());
+            if(l != network_.size()-1) // No activation on the last layer (output layer), as it predicts the score directly.
+            {
+                for (size_t i=0;i<z.size();++i)
+                    y[i] = silu(z[i]);
+            }
+            else
+                y = z;
+
+            // Cache activation for back-propagation
+            activations_.push_back(y);
+
+            // Output of current layer becomes input to the next layer
+            x = y;
+        }
+
+        return x;
+    }
+
+    // backward pass to compute gradients of the loss w.r.t. network parameters using chain rule
+    void ScoreBasedDiffusionModel::backward(
+        const std::vector<double>& gradOutput
+    )
+    {
+        std::vector<double> grad = gradOutput;
+
+        // Back-propagate through layers in reverse order
+        for (int l = static_cast<int>(network_.size()) - 1; l >= 0; l--)
+        {
+            auto& layer = network_[l];
+
+            auto& aPrev = activations_[l];
+            auto& z = preactivations_[l];
+
+            std::vector<double> gradZ(grad.size());
+
+            // For the output layer, the gradient is directly from the loss w.r.t. output (no activation function).
+            if(l == static_cast<int>(network_.size())-1)
+            {
+                gradZ = grad;
+            }
+            // For hidden layers, apply the derivative of the activation function (SiLU) to the gradient.
+            else
+            {
+                for (size_t i=0;i<grad.size();++i)
+                    gradZ[i] = grad[i]*siluDeriv(z[i]);
+            }
+
+            // Compute gradients w.r.t. weights and biases for the current layer
+            for (size_t i = 0; i < layer.W.size(); i++)
+            {
+                for (size_t j=0;j<layer.W[i].size();++j)
+                {
+                    // Gradient of loss w.r.t. weight W[i][j]
+                    layer.gradW[i][j] += gradZ[i]*aPrev[j];
+                }
+
+                // Gradient of loss w.r.t. bias b[i]
+                // Note: We accumulate gradients in gradW and gradb because we will apply the optimizer step
+                // after processing a batch of samples.
+                layer.gradb[i] += gradZ[i];
+            }
+
+            if (layer.W.empty() || layer.W[0].empty()) {
+                throw cet::exception("ScoreBasedDiffusionModel::backward")
+                    << "Encountered empty layer weights during backward pass";
+            }
+            // Compute gradient w.r.t. input of the current layer to propagate to the previous layer
+            std::vector<double> gradPrev(layer.W[0].size(),0.0);
+
+            for (size_t j=0;j<gradPrev.size();++j)
+                for (size_t i=0;i<layer.W.size();++i)
+                    gradPrev[j] += layer.W[i][j]*gradZ[i];
+
+            grad = gradPrev;
+        }
+    }
+
+    // Update network weights using computed gradients (Stochastic Gradient Descent).
+    void ScoreBasedDiffusionModel::updateWeights(
+        double lr
+    )
+    {
+        for (auto& layer : network_)
+        {
+            if (layer.W.empty() || layer.W[0].empty() || layer.b.empty()) {
+                throw cet::exception("ScoreBasedDiffusionModel::updateWeights")
+                    << "Encountered malformed layer during SGD update";
+            }
+
+            size_t outSize = layer.W.size();
+            size_t inSize  = layer.W[0].size();
+
+            // Update weights
+            for (size_t i = 0; i < outSize; ++i)
+            {
+                for (size_t j = 0; j < inSize; ++j)
+                {
+                    // Update weight using SGD: W = W - lr * gradW
+                    layer.W[i][j] -= lr * layer.gradW[i][j];
+
+                    // reset gradient
+                    layer.gradW[i][j] = 0.0;
+                }
+
+                // Update bias
+                layer.b[i] -= lr * layer.gradb[i];
+
+                // reset gradient
+                layer.gradb[i] = 0.0;
+            }
+        }
+    }
+
+    // Update network weights using Adam optimization algorithm.
+    // Adam optimizer maintains running estimates of the first and second moments of the gradients (m and v) for each parameter,
+    // and uses these to adapt the learning rate for each parameter individually.
+    // The update rule for a parameter theta with gradient g at time step t is:
+    // m_t = b1 * m_{t-1} + (1 - b1) * g
+    // v_t = b2 * v_{t-1} + (1 - b2) * g^2
+    // mhat = m_t / (1 - b1^t)  (bias-corrected first moment estimate)
+    // vhat = v_t / (1 - b2^t)  (bias-corrected second moment estimate)
+    // theta = theta - lr * mhat / (sqrt(vhat) + eps)
+    void ScoreBasedDiffusionModel::adamUpdate(
+        double lr
+    )
+    {
+        adamStep_++; // initialize to 0 in constructor,
+                     // increment first to avoid division by zero in the first step when computing bias-corrected estimates
+                     // (1 - b1^t) and (1 - b2^t). Do not modify this to increment after the update.
+
+        for (auto& layer : network_)
+        {
+            if (layer.W.empty() || layer.W[0].empty() || layer.b.empty()) {
+                throw cet::exception("ScoreBasedDiffusionModel::adamUpdate")
+                    << "Encountered malformed layer during Adam update";
+            }
+
+            for (size_t i=0;i<layer.W.size();++i)
+            {
+                // Update weights using Adam update rule
+                for (size_t j=0;j<layer.W[i].size();++j)
+                {
+                    double g = layer.gradW[i][j];
+
+                    layer.mW[i][j] = adamBeta1_*layer.mW[i][j] + (1-adamBeta1_)*g;
+                    layer.vW[i][j] = adamBeta2_*layer.vW[i][j] + (1-adamBeta2_)*g*g;
+
+                    double mhat = layer.mW[i][j]/(1-std::pow(adamBeta1_,adamStep_));
+                    double vhat = layer.vW[i][j]/(1-std::pow(adamBeta2_,adamStep_));
+
+                    layer.W[i][j] -= lr*mhat/(std::sqrt(vhat)+adamEps_);
+
+                    layer.gradW[i][j] = 0.0;
+                }
+
+                // Update bias using Adam update rule
+                double g = layer.gradb[i];
+
+                layer.mb[i] = adamBeta1_*layer.mb[i] + (1-adamBeta1_)*g;
+                layer.vb[i] = adamBeta2_*layer.vb[i] + (1-adamBeta2_)*g*g;
+
+                double mhat = layer.mb[i]/(1-std::pow(adamBeta1_,adamStep_));
+                double vhat = layer.vb[i]/(1-std::pow(adamBeta2_,adamStep_));
+
+                layer.b[i] -= lr*mhat/(std::sqrt(vhat)+adamEps_);
+
+                layer.gradb[i] = 0.0;
+            }
+        }
+    }
+
+    // Add noise to the input state x at diffusion time t to produce the noisy sample xt.
+    std::vector<double> ScoreBasedDiffusionModel::addNoise(
+        const std::vector<double>& x,
+        double t, // Diffusion time parameter in [0,1]
+        std::vector<double>& eps
+    ){
+        if (x.size() != static_cast<size_t>(dim_)) {
+            throw cet::exception("ScoreBasedDiffusionModel::addNoise")
+                << "Input x dimension mismatch: got " << x.size()
+                << ", expected " << dim_;
+        }
+
+        double s = sigma(t);
+
+        // Generate Gaussian noise vector eps of dimension dim_ using the external random engine.
+        eps.resize(dim_);
+
+        std::vector<double> xt(dim_);
+
+        for (int i = 0; i < dim_; ++i) {
+            eps[i] = randGaussQ_.fire();   // Gaussian N(0,1)
+            xt[i]  = x[i] + s * eps[i];
+        }
+
+        return xt;
+    }
+
+    // Compute Mean Squared Error per dimension between predicted score and target score.
+    double ScoreBasedDiffusionModel::computeLoss(
+        const std::vector<double>& score,
+        const std::vector<double>& target
+    ) const{
+
+        double loss = 0.0;
+
+        for (int i=0;i<dim_;++i){
+            double d = score[i]-target[i];
+            loss += d*d;
+        }
+
+        return loss/dim_;
+    }
+
+    // Clip gradients to prevent exploding gradients during training. This is done by scaling down
+    // the gradients if their L2 norm exceeds a specified threshold.
+    void ScoreBasedDiffusionModel::clipGradients(
+        double maxNorm
+    )
+    {
+        double norm = 0.0;
+
+        for (auto& layer : network_)
+        {
+            for (auto& row : layer.gradW)
+                for (double g : row)
+                    norm += g*g;
+            for (double g : layer.gradb)
+                norm += g*g;
+        }
+        norm = std::sqrt(norm);
+
+        // If the norm is below the threshold, no clipping is needed.
+        if(norm <= maxNorm)
+            return;
+
+        // Scale down the gradients to have the specified maximum norm.
+        double scale = maxNorm / norm;
+        for (auto& layer : network_)
+        {
+            for (auto& row : layer.gradW)
+                for (auto& g : row)
+                    g *= scale;
+            for (auto& g : layer.gradb)
+                g *= scale;
+        }
+    }
+
+    // Train the diffusion model using the provided training data. The training loop iterates over epochs and batches,
+    // applying noise to the input samples, computing the score predictions, calculating the loss, and performing
+    // back-propagation to update the model parameters.
+    // For a data sample of 5M 6-dimensional vectors of double precision (8 Byte), total memory for the data is 5M*6*8 = 240 MB,
+    // which is manageable for in-memory training.
+    // Much larger datasets may require streaming from disk or using mini-batches that do not fit entirely in memory.
+    void ScoreBasedDiffusionModel::train(
+        const std::vector<DiffusionTrainingSample>& data,
+        int epochs
+    )
+    {
+        // Check that the network is properly initialized before training.
+        if (network_.empty() || network_[0].W.empty() || network_[0].W[0].empty()) {
+            throw cet::exception("ScoreBasedDiffusionModel::train")
+                << "Network is not properly initialized";
+        }
+
+        const double eps_safe = 1e-12;
+        const size_t N = data.size();
+
+        trainingSampleSize_ = N; // Store the total number of training samples
+
+        if (N <= 1)
+        {
+            throw cet::exception("ScoreBasedDiffusionModel::train")
+                << "Insufficient data samples for training (samples must be > 1)";
+        }
+
+        for (size_t i = 0; i < N; ++i) {
+            if (data[i].x.size() != static_cast<size_t>(dim_)) {
+                throw cet::exception("ScoreBasedDiffusionModel::train")
+                    << "Training sample " << i << " has x dimension " << data[i].x.size()
+                    << ", expected " << dim_;
+            }
+            if (data[i].cond.size() != static_cast<size_t>(conditionDim_)) {
+                throw cet::exception("ScoreBasedDiffusionModel::train")
+                    << "Training sample " << i << " has conditioning dimension " << data[i].cond.size()
+                    << ", expected " << conditionDim_;
+            }
+        }
+
+        // Create index vector once
+        std::vector<size_t> indices(N);
+        for (size_t i = 0; i < N; ++i)
+            indices[i] = i;
+
+        // Data shuffling and batching is performed at the epoch level to ensure that each epoch sees the data
+        // in a different order, which can improve training convergence.
+        for (int e = 0; e < epochs; ++e)
+        {
+            // Shuffle indices using Fisher–Yates and RandFlat for reproducibility
+            for (size_t i = N - 1; i > 0; --i)
+            {
+                size_t j = static_cast<size_t>(randFlat_.fire() * (i + 1));
+                std::swap(indices[i], indices[j]);
+            }
+
+            // Counter for number of samples processed in the epoch (used for averaging loss). Avoid using N directly to
+            // allow for early stopping or partial epoch processing if needed.
+            int n = 0;
+
+            int batchCounter = 0;
+            double epochLoss = 0.0;
+
+            // iterate over the shuffled data samples
+            for (size_t idx = 0; idx < N; ++idx)
+            {
+                const auto& sample = data[indices[idx]];
+
+                // Sample diffusion time
+                double t = randFlat_.fire();
+                // container for noise vector eps
+                std::vector<double> eps;
+
+                auto xt = addNoise(sample.x,t,eps);
+                double s = sigma(t);
+
+                // The target score is the negative of the noise scaled by the noise standard deviation, i.e., -eps/sigma(t).
+                std::vector<double> target(dim_);
+                for (int i=0;i<dim_;++i)
+                    target[i] = -eps[i]/std::max(s, eps_safe); // add eps_safe to prevent division by zero in case of very small sigma
+
+                // Prepare input vector for the network by concatenating the noisy sample xt with the diffusion time t.
+                std::vector<double> input = xt;
+                input.insert(input.end(), sample.cond.begin(), sample.cond.end());
+                input.push_back(t);
+
+                // Check that input dimension matches expected dimension (dim_ + conditionDim_ + 1)
+                if (input.size() != network_[0].W[0].size()) {
+                    throw cet::exception("ScoreBasedDiffusionModel::train")
+                        << "Training input dimension mismatch: got " << input.size()
+                        << ", expected " << network_[0].W[0].size();
+                }
+
+                // Forward pass to compute the predicted score from the network given the input (noisy sample + time).
+                auto score = forward(input);
+
+                // Compute the loss (Mean Squared Error) between the predicted score and the target score.
+                std::vector<double> grad(dim_);
+                double loss=0.0;
+                for (int i=0;i<dim_;++i)
+                {
+                    double d = score[i]-target[i];
+                    loss += d*d;
+                    // Gradient of the loss w.r.t. the predicted score is 2*(score-target)/dim_ (the division by dim_ is for averaging the loss per dimension).
+                    grad[i] = 2*d/dim_;
+                }
+
+                // Backward pass to compute gradients of the loss w.r.t. network parameters using the computed gradient of the loss w.r.t. the predicted score.
+                backward(grad);
+
+                // Increment batch counter and apply optimizer step if batch size is reached. This allows for vectorized updates after processing a batch of
+                // samples, which can improve training efficiency and convergence.
+                batchCounter++;
+                if(batchCounter == batchSize_)
+                {
+                    // Average accumulated gradients so optimizer step size is independent of batch size.
+                    const double invBatch = 1.0 / static_cast<double>(batchCounter);
+                    for (auto& layer : network_) {
+                        for (auto& row : layer.gradW)
+                            for (auto& g : row)
+                                g *= invBatch;
+                        for (auto& g : layer.gradb)
+                            g *= invBatch;
+                    }
+
+                    clipGradients(gradientClipThreshold_); // Clip gradients to prevent exploding gradients
+
+                    // Apply optimizer based on configuration
+                    if (optimizerType_ == OptimizerType::ADAM) {
+                        adamUpdate(learningRate_);
+                    } else {
+                        updateWeights(learningRate_);
+                    }
+                    batchCounter = 0;
+                }
+
+                epochLoss += loss; // Accumulate loss for monitoring.
+                n++;
+            }
+
+            // Final flush: apply optimizer to remaining gradients
+            if(batchCounter > 0)
+            {
+                // Average by the true final mini-batch size (which may be < batchSize_).
+                const double invBatch = 1.0 / static_cast<double>(batchCounter);
+                for (auto& layer : network_) {
+                    for (auto& row : layer.gradW)
+                        for (auto& g : row)
+                            g *= invBatch;
+                    for (auto& g : layer.gradb)
+                        g *= invBatch;
+                }
+
+                clipGradients(gradientClipThreshold_); // Clip gradients before final update
+
+                if (optimizerType_ == OptimizerType::ADAM) {
+                    adamUpdate(learningRate_);
+                } else {
+                    updateWeights(learningRate_);
+                }
+            }
+
+            epochLoss /= n;
+            mf::LogInfo("ScoreBasedDiffusionModel::train") << "Epoch " << e << "  Loss=" << epochLoss;
+            epochLosses_.push_back(epochLoss);
+        }
+    }
+
+    void ScoreBasedDiffusionModel::saveModel(
+        const std::string& filename
+    )
+    {
+        std::ofstream out(filename);
+
+        if (!out) {
+            throw cet::exception("ScoreBasedDiffusionModel::saveModel")
+                << "Cannot open file " << filename;
+        }
+
+        // Write CSV header and model configuration parameters with annotations
+        out << "[MODEL_PARAMETERS]\n";
+        out << "Parameter,Value\n";
+        // Model architecture
+        out << "dim," << dim_ << "\n";
+        out << "conditionDim," << conditionDim_ << "\n";
+        out << "hidden," << hidden_ << "\n";
+        out << "layers," << layers_ << "\n";
+        // Optimizer configuration
+        out << "optimizerType," << (optimizerType_ == OptimizerType::ADAM ? "ADAM" : "SGD") << "\n";
+        out << "adamBeta1," << adamBeta1_ << "\n";
+        out << "adamBeta2," << adamBeta2_ << "\n";
+        out << "adamEps," << adamEps_ << "\n";
+        // Noise schedule configuration
+        out << "noiseScheduleType," << (noiseScheduleType_ == NoiseScheduleType::COSINE ? "COSINE" : "LINEAR") << "\n";
+        out << "betaMin," << betaMin_ << "\n";
+        out << "betaMax," << betaMax_ << "\n";
+        out << "cosineOffset," << cosineOffset_ << "\n";
+        // Training configuration
+        out << "batchSize," << batchSize_ << "\n";
+        out << "gradientClipThreshold," << gradientClipThreshold_ << "\n";
+        out << "learningRate," << learningRate_ << "\n";
+        // Diffusion process configuration
+        out << "diffusionSteps," << diffusionSteps_ << "\n";
+
+        // Write network architecture header
+        out << "\n[NETWORK_PARAMETERS]\n";
+        out << "numLayers," << network_.size() << "\n";
+        out << std::fixed << std::setprecision(17); // Use fixed-point notation with high precision for weights and biases
+
+        for (size_t layerIdx = 0; layerIdx < network_.size(); ++layerIdx) {
+            // Write layer dimensions
+            auto& layer = network_[layerIdx];
+            size_t outSize = layer.W.size();
+            size_t inSize  = layer.W[0].size();
+            out << "\nLayer" << layerIdx << "_OutSize," << outSize << "\n";
+            out << "Layer" << layerIdx << "_InSize," << inSize << "\n";
+
+            // Write weights in matrix format
+            out << "Layer" << layerIdx << "_Weights\n";
+            for (const auto& row : layer.W) {
+                for (size_t j = 0; j < row.size(); ++j) {
+                    out << row[j];
+                    if (j < row.size() - 1) out << ",";
+                }
+                out << "\n";
+            }
+
+            // Write biases
+            out << "Layer" << layerIdx << "_Biases\n";
+            for (size_t j = 0; j < layer.b.size(); ++j) {
+                out << layer.b[j];
+                if (j < layer.b.size() - 1) out << ",";
+            }
+            out << "\n";
+        }
+
+        // Write training history
+        out << "\n[TRAINING_HISTORY]\n";
+        out << "numEpochs," << epochLosses_.size() << "\n";
+        out << "trainingSampleSize," << trainingSampleSize_ << "\n";
+        out << "EpochNumber,Loss\n";
+        for (size_t i = 0; i < epochLosses_.size(); ++i) {
+            out << i << "," << epochLosses_[i] << "\n";
+        }
+    }
+
+    ScoreBasedDiffusionModel ScoreBasedDiffusionModel::loadModel(
+        CLHEP::RandFlat& randFlat,
+        CLHEP::RandGaussQ& randGaussQ,
+        const std::string& filename
+    )
+    {
+        std::ifstream in(filename);
+
+        if (!in) {
+            throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                << "Cannot open file " << filename;
+        }
+
+        std::string line;
+
+        // Temporary storage
+        int dim = 0, conditionDim = 0, hidden = 0, layers = 0;
+        OptimizerType optimizerType = OptimizerType::ADAM;
+        double adamBeta1 = 0.0, adamBeta2 = 0.0, adamEps = 0.0;
+        NoiseScheduleType scheduleType = NoiseScheduleType::COSINE;
+        double betaMin = 0.0, betaMax = 0.0, cosineOffset = 0.0;
+        int batchSize = 1, diffusionSteps = 1;
+        double gradientClipThreshold = 0.0, learningRate = 0.0;
+
+        std::vector<Layer> loadedNetwork;
+        std::vector<double> epochLosses;
+
+        // Helper lambda to split CSV
+        auto split = [](const std::string& s) {
+            std::vector<std::string> tokens;
+            std::stringstream ss(s);
+            std::string item;
+            while (std::getline(ss, item, ',')) {
+                // trim whitespace from item (beginning and end, should not be present if csv is generated by code, but just in case)
+                item.erase(item.begin(), std::find_if(item.begin(), item.end(), [](unsigned char ch) { return !std::isspace(ch); }));
+                item.erase(std::find_if(item.rbegin(), item.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(), item.end());
+                tokens.push_back(item);
+            }
+            return tokens;
+        };
+        // Helper lambda to extract layer index from strings like "Layer10_OutSize"
+        auto getLayerIdx = [](const std::string& s) {
+            size_t start = 5; // after "Layer"
+            size_t end = s.find('_', start);
+            if (end == std::string::npos)
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Malformed layer string: " << s;
+            return std::stoi(s.substr(start, end - start));
+        };
+
+        // Parse file
+        std::string section;
+
+        while (std::getline(in, line)) {
+            if (line.empty()) continue;
+
+            // Detect section headers
+            if (line[0] == '[') {
+                section = line;
+                continue;
+            }
+
+            // MODEL PARAMETERS
+            if (section == "[MODEL_PARAMETERS]") {
+                if (line == "Parameter,Value") continue;
+
+                auto tokens = split(line);
+                if (tokens.size() != 2) throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "ScoreBasedDiffusionModel::loadModel: invalid line in [MODEL_PARAMETERS] section: " << line;
+
+                const std::string& key = tokens[0];
+                const std::string& val = tokens[1];
+
+                if (key == "dim") dim = std::stoi(val);
+                else if (key == "conditionDim") conditionDim = std::stoi(val);
+                else if (key == "hidden") hidden = std::stoi(val);
+                else if (key == "layers") layers = std::stoi(val);
+                else if (key == "optimizerType")
+                    optimizerType = (val == "ADAM") ? OptimizerType::ADAM : OptimizerType::SGD;
+                else if (key == "adamBeta1") adamBeta1 = std::stod(val);
+                else if (key == "adamBeta2") adamBeta2 = std::stod(val);
+                else if (key == "adamEps") adamEps = std::stod(val);
+                else if (key == "noiseScheduleType")
+                    scheduleType = (val == "COSINE") ? NoiseScheduleType::COSINE : NoiseScheduleType::LINEAR;
+                else if (key == "betaMin") betaMin = std::stod(val);
+                else if (key == "betaMax") betaMax = std::stod(val);
+                else if (key == "cosineOffset") cosineOffset = std::stod(val);
+                else if (key == "batchSize") batchSize = std::stoi(val);
+                else if (key == "gradientClipThreshold") gradientClipThreshold = std::stod(val);
+                else if (key == "learningRate") learningRate = std::stod(val);
+                else if (key == "diffusionSteps") diffusionSteps = std::stoi(val);
+            }
+
+            // NETWORK PARAMETERS
+            else if (section == "[NETWORK_PARAMETERS]") {
+
+                if (line.rfind("numLayers", 0) == 0) {
+                    int numLayers = std::stoi(split(line)[1]);
+                    loadedNetwork.resize(numLayers);
+                    if (loadedNetwork.size() != (size_t)layers) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Layer count mismatch";
+                    }
+                    continue;
+                }
+
+                // Layer sizes
+                if (line.find("_OutSize") != std::string::npos) {
+                    auto tokens = split(line);
+                    int outSize = std::stoi(tokens[1]);
+                    int layerIdx = getLayerIdx(tokens[0]);
+                    loadedNetwork[layerIdx].W.resize(outSize);
+                    loadedNetwork[layerIdx].b.resize(outSize);
+                }
+                else if (line.find("_InSize") != std::string::npos) {
+                    auto tokens = split(line);
+                    int inSize = std::stoi(tokens[1]);
+                    int layerIdx = getLayerIdx(tokens[0]);
+                    if (layerIdx == 0 && inSize != dim + conditionDim + 1) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                            << "Input layer input size mismatch (expected "
+                            << (dim + conditionDim + 1) << ", got " << inSize << ")";
+                    }
+                    if (loadedNetwork[layerIdx].W.empty()) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "InSize encountered before OutSize for layer " << layerIdx;
+                    }
+                    for (auto& row : loadedNetwork[layerIdx].W)
+                        row.resize(inSize);
+                }
+                // Weights
+                else if (line.find("_Weights") != std::string::npos) {
+                    int layerIdx = getLayerIdx(line);
+                    if (layerIdx < 0 || layerIdx >= (int)loadedNetwork.size()) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Invalid layer index: " << layerIdx;
+                    }
+                    if (loadedNetwork[layerIdx].W.empty() || loadedNetwork[layerIdx].W[0].empty()) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Weights encountered before layer size definition";
+                    }
+                    for (auto& row : loadedNetwork[layerIdx].W) {
+                        if (!std::getline(in, line)) {
+                            throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Unexpected EOF while reading weights";
+                        }
+                        auto vals = split(line);
+                        if (vals.size() != row.size()) {
+                            throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                                << "Weight row size mismatch for layer " << layerIdx;
+                        }
+                        for (size_t j = 0; j < vals.size(); ++j)
+                            row[j] = std::stod(vals[j]);
+                    }
+                }
+                // Biases
+                else if (line.find("_Biases") != std::string::npos) {
+                    int layerIdx = getLayerIdx(line);
+                    if (layerIdx < 0 || layerIdx >= (int)loadedNetwork.size()) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Invalid layer index: " << layerIdx;
+                    }
+                    if (!std::getline(in, line)) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Unexpected EOF while reading biases";
+                    }
+                    auto vals = split(line);
+                    if (vals.size() != loadedNetwork[layerIdx].b.size()) {
+                        throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                            << "Bias size mismatch for layer " << layerIdx;
+                    }
+                    for (size_t j = 0; j < vals.size(); ++j)
+                        loadedNetwork[layerIdx].b[j] = std::stod(vals[j]);
+                }
+            }
+
+            // TRAINING HISTORY
+            else if (section == "[TRAINING_HISTORY]") {
+                if (line == "EpochNumber,Loss") continue;
+
+                auto tokens = split(line);
+                if (tokens.size() == 2) {
+                    if (tokens[0] == "numEpochs") {
+                        int numEpochs = std::stoi(tokens[1]);
+                        epochLosses.reserve(numEpochs);
+                    } else if (tokens[0] == "trainingSampleSize")
+                    {
+                        // We can store this if needed for analysis, but it is not used in model reconstruction, so we will ignore it for now.
+                    }else {
+                        epochLosses.push_back(std::stod(tokens[1]));
+                    }
+                }
+            }
+        }
+
+        if (dim <= 0 || conditionDim < 0 || hidden <= 0 || layers <= 0) {
+            throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Invalid model parameters in file";
+        }
+        for (size_t l = 0; l < loadedNetwork.size(); ++l) {
+            if (loadedNetwork[l].W.empty() || loadedNetwork[l].b.empty()) {
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Incomplete layer data in file";
+            }
+            for (const auto& row : loadedNetwork[l].W) {
+                if (row.size() != loadedNetwork[l].W[0].size()) {
+                    throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Inconsistent row size in layer";
+                }
+            }
+        }
+        for (size_t l = 1; l < loadedNetwork.size(); ++l) {
+            if (loadedNetwork[l].W[0].size() != loadedNetwork[l-1].W.size()) {
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel") << "Layer size mismatch between layers";
+            }
+        }
+
+        std::ostringstream logMsg;
+        logMsg << "Model parameters loaded successfully from " << filename << "\n"
+               << "Warning: optimizer state (e.g., Adam moments) is not saved/loaded.\n"
+               << "The loaded model is suitable for inference, or for retraining with a fresh optimizer state,\n"
+               << "but does NOT resume training from the original state.";
+        mf::LogInfo("ScoreBasedDiffusionModel::loadModel") << logMsg.str();
+
+        // Reconstruct model
+        ScoreBasedDiffusionModel model(
+            randFlat,
+            randGaussQ,
+            dim,
+            conditionDim,
+            hidden,
+            layers,
+            optimizerType,
+            adamBeta1,
+            adamBeta2,
+            adamEps,
+            scheduleType,
+            betaMin,
+            betaMax,
+            cosineOffset,
+            batchSize,
+            gradientClipThreshold,
+            learningRate,
+            diffusionSteps
+        );
+
+        // Overwrite initialized weights with loaded values
+        for (size_t l = 0; l < model.network_.size(); ++l) {
+            model.network_[l].W = loadedNetwork[l].W;
+            model.network_[l].b = loadedNetwork[l].b;
+        }
+        model.epochLosses_ = epochLosses;
+
+        return model;
+    }
+
+    std::vector<double> ScoreBasedDiffusionModel::generateSample(
+        const std::vector<double>& condition,
+        bool useHeun,
+        int diffusionSteps
+    )
+    {
+        if (condition.size() != static_cast<size_t>(conditionDim_)) {
+            throw cet::exception("ScoreBasedDiffusionModel::generateSample")
+                << "Conditioning dimension mismatch: got " << condition.size()
+                << ", expected " << conditionDim_;
+        }
+
+        // Use provided diffusionSteps if positive, otherwise use the model's configured value
+        int steps = (diffusionSteps > 0) ? diffusionSteps : diffusionSteps_;
+
+        // Start from pure noise
+        std::vector<double> x(dim_);
+
+        for (int i=0;i<dim_;++i)
+            x[i] = randGaussQ_.fire();
+
+        // Reverse diffusion process
+        for (int step = steps - 1; step >= 0; --step) {
+
+            double t = (double)step/steps;
+            double dt = 1.0/steps;
+            double s = sigma(t); // as long as diffusionSteps_ is not too large, s should not become too small to cause numerical issues.
+
+            if (!useHeun) {
+                // Euler method (1st order)
+                std::vector<double> input = x;
+                input.insert(input.end(), condition.begin(), condition.end());
+                input.push_back(t);
+
+                auto score = forward(input);
+
+                for (int i=0;i<dim_;++i){
+                    x[i] += -s*s*score[i]*dt;
+                }
+            } else {
+                // Heun's method (2nd order)
+                // k1 = f(x,t)
+                std::vector<double> input = x;
+                input.insert(input.end(), condition.begin(), condition.end());
+                input.push_back(t);
+                auto score_k1 = forward(input);
+
+                std::vector<double> k1(dim_);
+                for (int i=0;i<dim_;++i)
+                    k1[i] = -s*s*score_k1[i];
+
+                // predictor
+                std::vector<double> x_pred(dim_);
+                for (int i=0;i<dim_;++i)
+                    x_pred[i] = x[i] + k1[i]*dt;
+
+                // next time
+                double t_next = std::max(0.0, t - dt);
+                double s_next = sigma(t_next);
+
+                // k2 = f(x_pred,t_next)
+                std::vector<double> input_next = x_pred;
+                input_next.insert(input_next.end(), condition.begin(), condition.end());
+                input_next.push_back(t_next);
+                auto score_k2 = forward(input_next);
+
+                std::vector<double> k2(dim_);
+                for (int i=0;i<dim_;++i)
+                    k2[i] = -s_next*s_next*score_k2[i];
+
+                // trapezoidal update
+                for (int i=0;i<dim_;++i)
+                    x[i] += 0.5*(k1[i]+k2[i])*dt;
+            }
+        }
+
+        return x;
+    }
+
+} // namespace mu2e
diff --git a/STMMC/CMakeLists.txt b/STMMC/CMakeLists.txt
index 7e719143d7..e48d57a37e 100644
--- a/STMMC/CMakeLists.txt
+++ b/STMMC/CMakeLists.txt
@@ -83,6 +83,45 @@ cet_build_plugin(VirtualDetectorTree art::module
       Offline::MCDataProducts
 )
 
+cet_build_plugin(VDResamplerConfigure art::module
+    REG_SOURCE src/VDResamplerConfigure_module.cc
+    LIBRARIES REG
+      art_root_io::TFileService_service
+      Offline::GlobalConstantsService
+      Offline::MCDataProducts
+)
+
+cet_build_plugin(VDResamplerTrain art::module
+    REG_SOURCE src/VDResamplerTrain_module.cc
+    LIBRARIES REG
+      art_root_io::TFileService_service
+      Offline::GlobalConstantsService
+      Offline::MachineLearningTools
+      Offline::MCDataProducts
+      Offline::SeedService
+)
+
+cet_build_plugin(VDResamplerGenerateFromModel art::module
+    REG_SOURCE src/VDResamplerGenerateFromModel_module.cc
+    LIBRARIES REG
+      art_root_io::TFileService_service
+      Offline::DataProducts
+      Offline::GlobalConstantsService
+      Offline::MachineLearningTools
+      Offline::MCDataProducts
+      Offline::SeedService
+)
+
+cet_build_plugin(VDResamplerGenerateMix art::module
+    REG_SOURCE src/VDResamplerGenerateMix_module.cc
+    LIBRARIES REG
+      art_root_io::TFileService_service
+      Offline::DataProducts
+      Offline::GlobalConstantsService
+      Offline::MachineLearningTools
+      Offline::MCDataProducts
+      Offline::SeedService
+)
 
 install_source(SUBDIRS src)
 install_fhicl(SUBDIRS fcl SUBDIRNAME Offline/STMMC/fcl)
diff --git a/STMMC/inc/VDResamplerTransformDefaults.hh b/STMMC/inc/VDResamplerTransformDefaults.hh
new file mode 100644
index 0000000000..ada0d1d7f1
--- /dev/null
+++ b/STMMC/inc/VDResamplerTransformDefaults.hh
@@ -0,0 +1,13 @@
+#pragma once
+
+namespace mu2e {
+  namespace vdresampler {
+
+    constexpr double kX0 = -3904.0;
+    constexpr double kY0 = 0.0;
+    constexpr double kT0 = 1700.0;    // ns
+    constexpr double kTScale = 1.0;
+    constexpr double kP0 = 1.0;
+
+  }  // namespace vdresampler
+}  // namespace mu2e
diff --git a/STMMC/src/SConscript b/STMMC/src/SConscript
index 6278047c86..812af72a9f 100644
--- a/STMMC/src/SConscript
+++ b/STMMC/src/SConscript
@@ -48,6 +48,8 @@ helper.make_plugins( [ mainlib,
                        'mu2e_MCDataProducts',
                        'mu2e_GlobalConstantsService',
                        'mu2e_GeometryService',
+                       'mu2e_MachineLearningTools',
+                       'mu2e_SeedService',
                        rootlibs
                      ] )
 
diff --git a/STMMC/src/VDResamplerConfigure_module.cc b/STMMC/src/VDResamplerConfigure_module.cc
new file mode 100644
index 0000000000..82db041969
--- /dev/null
+++ b/STMMC/src/VDResamplerConfigure_module.cc
@@ -0,0 +1,271 @@
+// VDResamplerConfigure_module.cc
+// Makes one pass of the data set and determines which particles will need training for the
+// resampler, and generates a fcl file for training. "dataSourceTag" specifies the tag of the
+// data source, and will be appended to the generated fcl file and then csv files to
+// distinguish between different data sources. The generated fcl files will be stored and
+// named "VDResamplerTrain_[dataSourceTag].fcl".
+// On the other hand, a breakdown of the number of hits for each particle type will be stored
+// in "VDResamplerConfigure_[dataSourceTag]_HitSummary.csv" for reference.
+// Later when generating new samples using the resampler, the program will look for the
+// generated fcl files to determine which particle source to use, which particle types to
+// generate and which model parameters to load for each particle type.
+// Yongyi Wu, Mar. 2026
+
+// stdlib includes
+#include <cmath>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+// art includes
+#include "art/Framework/Core/EDAnalyzer.h"
+#include "art/Framework/Principal/Event.h"
+#include "art/Framework/Principal/Handle.h"
+#include "art/Framework/Principal/Run.h"
+
+// exception handling
+#include "cetlib_except/exception.h"
+
+// fhicl includes
+#include "canvas/Utilities/InputTag.h"
+#include "fhiclcpp/types/Atom.h"
+
+// message handling
+#include "messagefacility/MessageLogger/MessageLogger.h"
+
+// Offline includes
+#include "Offline/GlobalConstantsService/inc/GlobalConstantsHandle.hh"
+#include "Offline/GlobalConstantsService/inc/ParticleDataList.hh"
+#include "Offline/MCDataProducts/inc/SimParticle.hh"
+#include "Offline/MCDataProducts/inc/StepPointMC.hh"
+
+// ROOT includes
+#include "art_root_io/TFileService.h"
+#include "TTree.h"
+
+typedef cet::map_vector_key key_type;
+typedef unsigned long VolumeId_type;
+
+namespace mu2e {
+  class VDResamplerConfigure : public art::EDAnalyzer {
+    public:
+      using Name=fhicl::Name;
+      using Comment=fhicl::Comment;
+      struct Config {
+        fhicl::Atom<art::InputTag> StepPointMCsTag{Name("StepPointMCsTag"), Comment("Tag identifying the StepPointMCs")};
+        fhicl::Atom<art::InputTag> SimParticlemvTag{Name("SimParticlemvTag"), Comment("Tag identifying the SimParticlemv")};
+        fhicl::Atom<int> VirtualDetectorID{Name("VirtualDetectorID"), Comment("ID of the virtual detector to train on"), 116};
+        fhicl::Atom<std::string> VDResamplerDir{Name("VDResamplerDir"), Comment("Directory to store the generated csv files")};
+        fhicl::Atom<std::string> fclDir{Name("fclDir"), Comment("Directory to store the generated fhicl files"), ""};
+        fhicl::Atom<std::string> dataSourceTag{Name("dataSourceTag"), Comment("A tag to distinguish different data sources, will be appended to the generated fcl and csv files")};
+        fhicl::Atom<int> trainingThreshold{Name("trainingThreshold"), Comment("Minimum number of hits for a particle type to be included in the training"), 100};
+        fhicl::Atom<bool> doROOTDump{Name("doROOTDump"), Comment("Whether to dump the VD hit info into a ROOT file for debugging and analysis"), false};
+      };
+      using Parameters = art::EDAnalyzer::Table<Config>;
+      explicit VDResamplerConfigure(const Parameters& conf);
+      void analyze(const art::Event& e);
+      void endJob();
+    private:
+      art::ProductToken<StepPointMCCollection> StepPointMCsToken;
+      art::ProductToken<SimParticleCollection> SimParticlemvToken;
+      VolumeId_type VirtualDetectorID = 0;
+      std::string VDResamplerDir;
+      std::string fclDir;
+      std::string dataSourceTag;
+      int trainingThreshold;
+      bool doROOTDump;
+      GlobalConstantsHandle<ParticleDataList> pdt;
+      int pdgId = 0;
+      double x = 0.0, y = 0.0, z = 0.0, px = 0.0, py = 0.0, pz = 0.0, mass = 0.0, E = 0.0, time = 0.0;
+      VolumeId_type virtualdetectorId = 0;
+      int nThrownEvents = 0;
+      TTree* ttree;
+      std::map<int, int> pdgIds; // <id, count>
+  };
+
+  VDResamplerConfigure::VDResamplerConfigure(const Parameters& conf) :
+    art::EDAnalyzer(conf),
+    StepPointMCsToken(consumes<StepPointMCCollection>(conf().StepPointMCsTag())),
+    SimParticlemvToken(consumes<SimParticleCollection>(conf().SimParticlemvTag())),
+    VirtualDetectorID(conf().VirtualDetectorID()),
+    VDResamplerDir(conf().VDResamplerDir()),
+    fclDir(conf().fclDir()),
+    dataSourceTag(conf().dataSourceTag()),
+    trainingThreshold(conf().trainingThreshold()),
+    doROOTDump(conf().doROOTDump()) {
+    if (doROOTDump) {
+        art::ServiceHandle<art::TFileService> tfs;
+        ttree = tfs->make<TTree>( "ttree", "Virtual Detectors Hit Summary");
+        ttree->Branch("time", &time, "time/D"); // ns
+        ttree->Branch("virtualdetectorId", &virtualdetectorId, "virtualdetectorId/l");
+        ttree->Branch("pdgId", &pdgId, "pdgId/I");
+        ttree->Branch("x", &x, "x/D"); // mm
+        ttree->Branch("y", &y, "y/D"); // mm
+        ttree->Branch("z", &z, "z/D"); // mm
+        ttree->Branch("px", &px, "px/D"); // MeV
+        ttree->Branch("py", &py, "py/D"); // MeV
+        ttree->Branch("pz", &pz, "pz/D"); // MeV
+        ttree->Branch("E", &E, "E/D"); // MeV
+    }
+  };
+
+  void VDResamplerConfigure::analyze(const art::Event& event) {
+    // Get the data products from the event
+    auto const& StepPointMCs = event.getProduct(StepPointMCsToken);
+    if (StepPointMCs.empty())
+      return;
+    auto const& SimParticles = event.getProduct(SimParticlemvToken);
+    if (SimParticles.empty())
+      return;
+
+    // Loop over all VD hits
+    for (const StepPointMC& step : StepPointMCs) {
+      // Get the associated particle
+      const SimParticle& particle = SimParticles.at(step.trackId());
+      pdgId = particle.pdgId();
+
+      virtualdetectorId = step.virtualDetectorId();
+      pz = step.momentum().z();
+      if (virtualdetectorId != VirtualDetectorID || pz <= 0)
+      {
+        // mf::LogWarning("VDResamplerConfigure") << "Thrown event\n"
+        //                                        << "PDG ID = " << pdgId << ", VDID = " << virtualdetectorId << ", z = " << step.position().z() << ", pz = " << pz;
+        nThrownEvents += 1;
+        continue; // Filter hits based on the virtual detector ID and pz
+      }
+
+      if (doROOTDump) {
+          x = step.position().x();
+          y = step.position().y();
+          z = step.position().z();
+          px = step.momentum().x();
+          py = step.momentum().y();
+          pz = step.momentum().z();
+          mass = pdt->particle(pdgId).mass();
+          E = std::sqrt(step.momentum().mag2()+mass*mass)-mass; // Subtract the rest mass
+          if (E < 0)
+            throw cet::exception("LogicError", "Energy is negative");
+
+          ttree->Fill();
+      }
+
+      // Count the number of hits for each particle type for the summary
+      if (pdgIds.find(pdgId) != pdgIds.end())
+        pdgIds[pdgId] += 1;
+      else
+        pdgIds.emplace(std::make_pair(pdgId, 1));
+    };
+    return;
+  };
+
+  void VDResamplerConfigure::endJob() {
+    mf::LogInfo log("Virtual Detector Resampler Training Configuration Summary");
+    log << "========= Particle Summary =========\n";
+    for (auto part : pdgIds)
+      log << "PDGID " << part.first << ": " << part.second << "\n";
+    log << "====================================\n";
+    log << "Number of thrown events: " << nThrownEvents << "\n";
+
+    // store a summary of the number of hits for each particle type in a csv file for reference,
+    // all particles are included, but the particle types with hits below the training threshold are
+    // marked with an asterisk to indicate that they will not be included in the training.
+    std::string summaryFile = VDResamplerDir + "/VDResamplerConfigure_" + dataSourceTag + "_HitSummary.csv";
+    std::string fclFilePath = fclDir.empty() ? VDResamplerDir : fclDir;
+    std::string fclFile = fclFilePath + "/VDResamplerTrain_" + dataSourceTag + ".fcl";
+    std::ofstream sumOutFile(summaryFile);
+    std::ofstream fclOutFile(fclFile);
+    if (!sumOutFile) {
+        throw cet::exception("VDResamplerConfigure::endJob") << "Cannot open file " << summaryFile;
+    }
+    if (!fclOutFile) {
+        throw cet::exception("VDResamplerConfigure::endJob") << "Cannot open file " << fclFile;
+    }
+
+    std::string trainingPathNames = "";
+    std::vector<std::pair<std::string, std::string>> trainingPaths; // pairs of (moduleName, pathName)
+
+    sumOutFile << "PDGID,HitCount,TrainingMode\n";
+
+    fclOutFile << "# This fcl is generated by VDResamplerConfigure_module.cc\n";
+    fclOutFile << "# Training configuration for the VD resampler is generated for each particle type\n\n";
+    fclOutFile << "#include \"Offline/fcl/standardServices.fcl\"\n";
+    fclOutFile << "#include \"Production/JobConfig/pileup/STM/prolog.fcl\"\n\n";
+    fclOutFile << "process_name: VDResamplerTrain\n\n";
+    fclOutFile << "source : @local::STMPileup.VDResamplerSource\n\n";
+    fclOutFile << "services : @local::Services.Sim\n\n";
+    fclOutFile << "physics: {\n  analyzers : {\n";
+
+    for (const auto& part : pdgIds) {
+      sumOutFile << part.first << "," << part.second;
+      bool useTwoStageTraining = false;
+      if (part.second < 100000) {
+        // if data is limited, use two-stage training to improve performance and reduce the required training data size for each model.
+        useTwoStageTraining = true;
+      }
+      if (part.second < trainingThreshold) {
+        sumOutFile << ",*\n";
+        mf::LogWarning("VDResamplerConfigure::endJob") << "Particle type with PDGID " << part.first
+            << " has only " << part.second << " hits, which is below the training threshold of "
+            << trainingThreshold << ". This particle type will NOT be included in the training.";
+      } else {
+        // mark how many models will be trained for this particle type (1 for all-at-once, 2 for two-stage)
+        sumOutFile << "," << (useTwoStageTraining ? "2" : "1") << "\n";
+
+        // Generate the fcl file for training for this particle type
+        // if pdgID is negative, we will use "m" instead of "-" in the filename to avoid issues with file naming
+        std::string pdgIdstr = (part.first < 0) ? "m" + std::to_string(-part.first) : std::to_string(part.first);
+        std::string moduleName = "VDResamplerTrainVD"+ std::to_string(VirtualDetectorID) + dataSourceTag + "pdg" + pdgIdstr;
+        std::string pathName = "trainPathVD" + std::to_string(VirtualDetectorID) + "pdg" + pdgIdstr;
+        trainingPaths.push_back({moduleName, pathName});
+        fclOutFile << "    " << moduleName << " : {\n"
+                    << "      module_type : VDResamplerTrain\n"
+                    << "      StepPointMCsTag : @local::SimplifyStage1Data.StepPointMCsTag\n"
+                    << "      SimParticlemvTag : @local::SimplifyStage1Data.SimParticlemvTag\n"
+                    << "      SBDMuseTwoStageTraining : " << (useTwoStageTraining ? "true" : "false") << "\n";
+        if (useTwoStageTraining) {
+          fclOutFile << "      SBDMstage1ModelFile : \"" << VDResamplerDir << "/SBDMmodel_stage1_VD" << VirtualDetectorID << "_pdg" << pdgIdstr << ".csv\"\n"
+                     << "      SBDMstage2ModelFile : \"" << VDResamplerDir << "/SBDMmodel_stage2_VD" << VirtualDetectorID << "_pdg" << pdgIdstr << ".csv\"\n";
+        } else {
+          fclOutFile << "      SBDMallAtOnceModelFile : \"" << VDResamplerDir << "/SBDMmodel_allAtOnce_VD" << VirtualDetectorID << "_pdg" << pdgIdstr << ".csv\"\n";
+        }
+        fclOutFile << "      VirtualDetectorID : " << VirtualDetectorID << "\n"
+                    // << "      VDz0 : " << std::to_string(<value>) << "\n" // include if not VD116
+                    // << "      VDr : " << std::to_string(<value>) << "\n" // include if not VD116
+                    << "      pdgID : " << part.first << "\n"
+                    // << "      SBDMhidden : 128\n"
+                    // << "      SBDMlayers : 4\n"
+                    // << "      SBDMoptimizer : \"ADAM\"\n"
+                    // << "      SBDMadamBeta1 : 0.9\n"
+                    // << "      SBDMadamBeta2 : 0.999\n"
+                    // << "      SBDMadamEps : 1e-8\n"
+                    // << "      SBDMnoiseSchedule : \"COSINE\"\n"
+                    // << "      SBDMbetaMin : 1e-4\n"
+                    // << "      SBDMbetaMax : 0.02\n"
+                    // << "      SBDMcosineOffset : 0.008\n"
+                    // << "      SBDMbatchSize : 32\n"
+                    // << "      SBDMgradientClip : 1.0\n"
+                    // << "      SBDMlearningRate : 1e-3\n"
+                    // << "      SBDMdiffusionSteps : 200\n"
+                    << "      SBDMtrainingSize : " << part.second << "\n"
+                    // << "      SBDMtrainingEpochs : 10\n"
+                    << "    }\n";
+      }
+    }
+    fclOutFile << "  }\n";
+
+    // Create trigger paths for each training module
+    for (size_t i = 0; i < trainingPaths.size(); ++i) {
+      fclOutFile << "  " << trainingPaths[i].second << " : [" << trainingPaths[i].first << "]\n";
+      if (i > 0) trainingPathNames += ", ";
+      trainingPathNames += trainingPaths[i].second;
+    }
+
+    fclOutFile << "  end_paths: [" + trainingPathNames + "]\n";
+    fclOutFile << "}\n\n";
+    if (doROOTDump) fclOutFile << "services.TFileService.fileName : @nil\n";
+
+    return;
+  };
+}; // end namespace mu2e
+
+DEFINE_ART_MODULE(mu2e::VDResamplerConfigure)
diff --git a/STMMC/src/VDResamplerGenerateFromModel_module.cc b/STMMC/src/VDResamplerGenerateFromModel_module.cc
new file mode 100644
index 0000000000..9f72ccbea0
--- /dev/null
+++ b/STMMC/src/VDResamplerGenerateFromModel_module.cc
@@ -0,0 +1,342 @@
+// VDResamplerGenerateFromModel_module.cc
+// This module takes either a single all-at-once model file or a pair of stage-1/stage-2
+// model files and produces new virtual-detector-like GenParticles from the trained
+// score-based diffusion model(s).
+// Yongyi Wu, Mar. 2026
+
+// stdlib includes
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <utility>
+
+#include "Offline/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh"
+
+// art includes
+#include "art/Framework/Core/EDProducer.h"
+#include "art/Framework/Principal/Event.h"
+#include "art/Framework/Services/Optional/RandomNumberGenerator.h"
+#include "art/Framework/Services/Registry/ServiceHandle.h"
+
+// CLHEP includes
+#include "CLHEP/Vector/LorentzVector.h"
+#include "CLHEP/Vector/ThreeVector.h"
+#include "CLHEP/Random/RandFlat.h"
+#include "CLHEP/Random/RandGaussQ.h"
+
+// exception handling
+#include "cetlib_except/exception.h"
+
+// fhicl includes
+#include "fhiclcpp/types/Atom.h"
+
+// message handling
+#include "messagefacility/MessageLogger/MessageLogger.h"
+
+// Offline includes
+#include "Offline/DataProducts/inc/PDGCode.hh"
+#include "Offline/GlobalConstantsService/inc/GlobalConstantsHandle.hh"
+#include "Offline/GlobalConstantsService/inc/ParticleDataList.hh"
+#include "Offline/MCDataProducts/inc/GenId.hh"
+#include "Offline/MCDataProducts/inc/GenParticle.hh"
+#include "Offline/SeedService/inc/SeedService.hh"
+#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+
+// ROOT includes
+#include "art_root_io/TFileService.h"
+#include "TTree.h"
+
+namespace mu2e {
+
+  namespace {
+    // Utility function to extract PDG ID from the model filename, which is expected to contain a substring like "pdg[optional m][digits]", e.g. "pdg13" for muons, "pdgm13" for muon pluses.
+    int loadPDGIdFromFileName(const std::string& fileName) {
+      const size_t pdgPos = fileName.find("pdg");
+      if (pdgPos == std::string::npos) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "Cannot infer PDG ID from model filename: " << fileName;
+      }
+      size_t pos = pdgPos + 3;
+      bool negative = false;
+      if (pos < fileName.size() && (fileName[pos] == 'm' || fileName[pos] == '-')) {
+        negative = true;
+        ++pos;
+      }
+      const size_t startDigits = pos;
+      while (pos < fileName.size() && std::isdigit(static_cast<unsigned char>(fileName[pos]))) {
+        ++pos;
+      }
+      if (startDigits == pos) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "Cannot infer PDG ID from model filename: " << fileName;
+      }
+      const int magnitude = std::stoi(fileName.substr(startDigits, pos - startDigits));
+      return negative ? -magnitude : magnitude;
+    }
+  }
+
+  class VDResamplerGenerateFromModel : public art::EDProducer {
+    public:
+      using Name = fhicl::Name;
+      using Comment = fhicl::Comment;
+
+      struct Config {
+        fhicl::Atom<bool> useTwoStageModel{
+          Name("useTwoStageModel"),
+          Comment("If true, use stage-1/stage-2 models. If false, use the all-at-once 6D model."),
+          true
+        };
+        fhicl::Atom<std::string> stage1ModelFile{
+          Name("stage1ModelFile"),
+          Comment("CSV filename for the stage-1 model parameters"),
+          ""
+        };
+        fhicl::Atom<std::string> stage2ModelFile{
+          Name("stage2ModelFile"),
+          Comment("CSV filename for the stage-2 model parameters"),
+          ""
+        };
+        fhicl::Atom<std::string> allAtOnceModelFile{
+          Name("allAtOnceModelFile"),
+          Comment("CSV filename for the all-at-once 6D model parameters"),
+          ""
+        };
+        fhicl::Atom<bool> useHeun{
+          Name("useHeun"),
+          Comment("If true, use Heun's method for reverse diffusion. Otherwise use Euler."),
+          true
+        };
+        fhicl::Atom<int> diffusionSteps{
+          Name("diffusionSteps"),
+          Comment("Number of reverse-diffusion steps used for sampling"),
+          200
+        };
+        fhicl::Atom<double> VDz0{
+          Name("VDz0"),
+          Comment("Nominal z coordinate of the virtual detector"),
+          37700.39
+        };
+        fhicl::Atom<double> VDr{
+          Name("VDr"),
+          Comment("Virtual detector radius used in the training transform"),
+          2000.0
+        };
+        fhicl::Atom<bool> doROOTDump{
+          Name("doROOTDump"),
+          Comment("Whether to dump generated samples into a ROOT tree"),
+          false
+        };
+      };
+
+      using Parameters = art::EDProducer::Table<Config>;
+
+      explicit VDResamplerGenerateFromModel(const Parameters& conf);
+      void produce(art::Event& event) override;
+
+    private:
+      art::RandomNumberGenerator::base_engine_t& engine_;
+      CLHEP::RandFlat randFlat_;
+      CLHEP::RandGaussQ randGaussQ_;
+      bool useTwoStageModel_;
+      std::string stage1ModelFile_;
+      std::string stage2ModelFile_;
+      std::string allAtOnceModelFile_;
+      bool useHeun_;
+      int diffusionSteps_;
+      double VDz0_;
+      double VDr_;
+      bool doROOTDump_;
+      GlobalConstantsHandle<ParticleDataList> pdt_;
+
+      std::unique_ptr<ScoreBasedDiffusionModel> allAtOnceModel_;
+      std::unique_ptr<ScoreBasedDiffusionModel> stage1Model_;
+      std::unique_ptr<ScoreBasedDiffusionModel> stage2Model_;
+
+      int pdgId_ = 0;
+
+      // Detector-center parameters used in the same transform as training.
+      double x0_ = vdresampler::kX0;
+      double y0_ = vdresampler::kY0;
+      double t0_ = vdresampler::kT0;
+      double tScale_ = vdresampler::kTScale;
+      double p0_ = vdresampler::kP0;
+
+      // Variables for optional ROOT dump.
+      TTree* outTree_ = nullptr;
+      double x_gen_ = 0.0;
+      double y_gen_ = 0.0;
+      double z_gen_ = 0.0;
+      double t_gen_ = 0.0;
+      double px_gen_ = 0.0;
+      double py_gen_ = 0.0;
+      double pz_gen_ = 0.0;
+      double mass_gen_ = 0.0;
+      double E_gen_ = 0.0;
+    };
+
+  VDResamplerGenerateFromModel::VDResamplerGenerateFromModel(const Parameters& conf)
+    : art::EDProducer{conf},
+      engine_(createEngine(art::ServiceHandle<SeedService>()->getSeed())),
+      randFlat_(engine_),
+      randGaussQ_(engine_),
+      useTwoStageModel_(conf().useTwoStageModel()),
+      stage1ModelFile_(conf().stage1ModelFile()),
+      stage2ModelFile_(conf().stage2ModelFile()),
+      allAtOnceModelFile_(conf().allAtOnceModelFile()),
+      useHeun_(conf().useHeun()),
+      diffusionSteps_(conf().diffusionSteps()),
+      VDz0_(conf().VDz0()),
+      VDr_(conf().VDr()),
+      doROOTDump_(conf().doROOTDump()) {
+
+    produces<GenParticleCollection>();
+
+    if (useTwoStageModel_) {
+      if (stage1ModelFile_.empty() || stage2ModelFile_.empty()) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "Two-stage generation requires both stage1ModelFile and stage2ModelFile.";
+      }
+
+      stage1Model_ = std::make_unique<ScoreBasedDiffusionModel>(
+        ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, stage1ModelFile_)
+      );
+      stage2Model_ = std::make_unique<ScoreBasedDiffusionModel>(
+        ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, stage2ModelFile_)
+      );
+      pdgId_ = loadPDGIdFromFileName(stage1ModelFile_);
+    } else {
+      if (allAtOnceModelFile_.empty()) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "All-at-once generation requires allAtOnceModelFile.";
+      }
+
+      allAtOnceModel_ = std::make_unique<ScoreBasedDiffusionModel>(
+        ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, allAtOnceModelFile_)
+      );
+      pdgId_ = loadPDGIdFromFileName(allAtOnceModelFile_);
+    }
+
+    z_gen_ = VDz0_;
+
+    if (doROOTDump_) {
+      art::ServiceHandle<art::TFileService> tfs;
+      outTree_ = tfs->make<TTree>("ttree", "Generated samples from VD resampler model");
+      outTree_->Branch("pdgId", &pdgId_, "pdgId/I");
+      outTree_->Branch("x", &x_gen_, "x/D");
+      outTree_->Branch("y", &y_gen_, "y/D");
+      outTree_->Branch("z", &z_gen_, "z/D");
+      outTree_->Branch("time", &t_gen_, "time/D");
+      outTree_->Branch("px", &px_gen_, "px/D");
+      outTree_->Branch("py", &py_gen_, "py/D");
+      outTree_->Branch("pz", &pz_gen_, "pz/D");
+      outTree_->Branch("E", &E_gen_, "E/D");
+    }
+  }
+
+  void VDResamplerGenerateFromModel::produce(art::Event& event) {
+    auto output = std::make_unique<GenParticleCollection>();
+
+    double x_trans = 0.0;
+    double y_trans = 0.0;
+    double t_trans = 0.0;
+    double pr_t = 0.0;
+    double pphi_t = 0.0;
+    double pz_t = 0.0;
+
+    // Generate a new sample using the loaded model(s)
+    // note the values here are transformed and need to be inverted back to the original coordinates after sampling.
+    if (useTwoStageModel_) {
+      const std::vector<double> stage1Sample = stage1Model_->generateSample({}, useHeun_, diffusionSteps_);
+      if (stage1Sample.size() != 3u) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "Stage-1 model returned " << stage1Sample.size() << " values, expected 3.";
+      }
+
+      t_trans = stage1Sample[0];
+      x_trans = stage1Sample[1];
+      y_trans = stage1Sample[2];
+
+      const std::vector<double> stage2Condition = {t_trans, x_trans, y_trans};
+      const std::vector<double> stage2Sample = stage2Model_->generateSample(stage2Condition, useHeun_, diffusionSteps_);
+      if (stage2Sample.size() != 3u) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "Stage-2 model returned " << stage2Sample.size() << " values, expected 3.";
+      }
+
+      pr_t = stage2Sample[0];
+      pphi_t = stage2Sample[1];
+      pz_t = stage2Sample[2];
+    } else {
+      const std::vector<double> sample = allAtOnceModel_->generateSample({}, useHeun_, diffusionSteps_);
+      if (sample.size() != 6u) {
+        throw cet::exception("VDResamplerGenerateFromModel")
+          << "All-at-once model returned " << sample.size() << " values, expected 6.";
+      }
+
+      t_trans = sample[0];
+      x_trans = sample[1];
+      y_trans = sample[2];
+      pr_t = sample[3];
+      pphi_t = sample[4];
+      pz_t = sample[5];
+    }
+
+    // Recover detector coordinates from the transformed (x', y').
+    const double u = std::sqrt(x_trans * x_trans + y_trans * y_trans);
+    const double theta = std::atan2(y_trans, x_trans);
+    const double rho = std::tanh(u);
+    const double r = rho * VDr_;
+    const double dx = r * std::cos(theta);
+    const double dy = r * std::sin(theta);
+    x_gen_ = dx + x0_;
+    y_gen_ = dy + y0_;
+    z_gen_ = VDz0_;
+
+    // Invert the time transform.
+    t_gen_ = t0_ * std::exp(t_trans * tScale_);
+
+    // Invert the momentum transform.
+    const double pr = p0_ * std::sinh(pr_t);
+    const double pphi = p0_ * std::sinh(pphi_t);
+    pz_gen_ = p0_ * std::sinh(pz_t);
+
+    if (r > 1e-6) {
+      const double rx = dx / r;
+      const double ry = dy / r;
+      const double phix = -ry;
+      const double phiy = rx;
+      px_gen_ = pr * rx + pphi * phix;
+      py_gen_ = pr * ry + pphi * phiy;
+    } else {
+      px_gen_ = pr;
+      py_gen_ = pphi;
+    }
+
+    mass_gen_ = pdt_->particle(pdgId_).mass();
+    const CLHEP::Hep3Vector momParticle(px_gen_, py_gen_, pz_gen_);
+    const CLHEP::Hep3Vector posParticle(x_gen_, y_gen_, z_gen_);
+    const double eTotal = std::sqrt(momParticle.mag2() + mass_gen_ * mass_gen_);
+    E_gen_ = eTotal - mass_gen_;
+    const CLHEP::HepLorentzVector fourMomParticle(eTotal, momParticle);
+
+    output->emplace_back(
+      PDGCode::type(pdgId_),
+      GenId::STMDownStreamGenTool,
+      posParticle,
+      fourMomParticle,
+      t_gen_
+    );
+
+    event.put(std::move(output));
+
+    if (doROOTDump_) {
+      outTree_->Fill();
+    }
+  }
+
+} // namespace mu2e
+
+DEFINE_ART_MODULE(mu2e::VDResamplerGenerateFromModel)
diff --git a/STMMC/src/VDResamplerGenerateMix_module.cc b/STMMC/src/VDResamplerGenerateMix_module.cc
new file mode 100644
index 0000000000..a0c477b375
--- /dev/null
+++ b/STMMC/src/VDResamplerGenerateMix_module.cc
@@ -0,0 +1,500 @@
+// VDResamplerGenerateMix_module.cc
+// This module takes a list of VDResamplerConfigure_[dataSourceTag]_HitSummary.csv files
+// generated by VDResamplerConfigure_module.cc, plus a matching list of POT counts.
+// It chooses which source summary to use according to the trained hit yield per POT,
+// then chooses the particle type according to the per-summary particle ratios, loads
+// either a one-stage or two-stage model for that particle, and generates a GenParticle.
+// Yongyi Wu, Mar. 2026
+
+// stdlib includes
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "Offline/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh"
+
+// art includes
+#include "art/Framework/Core/EDProducer.h"
+#include "art/Framework/Principal/Event.h"
+#include "art/Framework/Services/Optional/RandomNumberGenerator.h"
+#include "art/Framework/Services/Registry/ServiceHandle.h"
+
+// CLHEP includes
+#include "CLHEP/Random/RandFlat.h"
+#include "CLHEP/Random/RandGaussQ.h"
+#include "CLHEP/Vector/LorentzVector.h"
+#include "CLHEP/Vector/ThreeVector.h"
+
+// exception handling
+#include "cetlib_except/exception.h"
+
+// fhicl includes
+#include "fhiclcpp/types/Atom.h"
+#include "fhiclcpp/types/Sequence.h"
+
+// message handling
+#include "messagefacility/MessageLogger/MessageLogger.h"
+
+// Offline includes
+#include "Offline/DataProducts/inc/PDGCode.hh"
+#include "Offline/GlobalConstantsService/inc/GlobalConstantsHandle.hh"
+#include "Offline/GlobalConstantsService/inc/ParticleDataList.hh"
+#include "Offline/MCDataProducts/inc/GenId.hh"
+#include "Offline/MCDataProducts/inc/GenParticle.hh"
+#include "Offline/SeedService/inc/SeedService.hh"
+#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+
+// ROOT includes
+#include "art_root_io/TFileService.h"
+#include "TTree.h"
+
+namespace mu2e {
+
+  namespace {
+
+    // Utility function to trim whitespace from a string
+    std::string trim(const std::string& s) {
+      const auto begin = std::find_if_not(s.begin(), s.end(), [](unsigned char ch) { return std::isspace(ch); });
+      if (begin == s.end()) {
+        return "";
+      }
+      const auto end = std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) { return std::isspace(ch); }).base();
+      return std::string(begin, end);
+    }
+
+    // Utility function to split a CSV line into fields, trimming whitespace
+    std::vector<std::string> splitCSVLine(const std::string& line) {
+      std::vector<std::string> fields;
+      std::stringstream ss(line);
+      std::string item;
+      while (std::getline(ss, item, ',')) {
+        fields.push_back(trim(item));
+      }
+      return fields;
+    }
+
+    std::string pdgIdToFileToken(const int pdgId) {
+      return (pdgId < 0) ? ("m" + std::to_string(-pdgId)) : std::to_string(pdgId);
+    }
+  }
+
+  class VDResamplerGenerateMix : public art::EDProducer {
+    public:
+      using Name = fhicl::Name;
+      using Comment = fhicl::Comment;
+
+      struct Config {
+        fhicl::Sequence<std::string> hitSummaryFiles{
+          Name("hitSummaryFiles"),
+          Comment("List of VDResamplerConfigure_[dataSourceTag]_HitSummary.csv files")
+        };
+        fhicl::Sequence<double> potsPerFile{
+          Name("potsPerFile"),
+          Comment("List of POT counts corresponding to hitSummaryFiles")
+        };
+        fhicl::Atom<std::string> ModelFileDir{
+          Name("ModelFileDir"),
+          Comment("Directory where the trained SBDM model files are stored")
+        };
+        fhicl::Atom<int> VirtualDetectorID{
+          Name("VirtualDetectorID"),
+          Comment("Virtual detector ID encoded in the model filenames"),
+          116
+        };
+        fhicl::Atom<bool> useHeun{
+          Name("useHeun"),
+          Comment("Whether to use Heun's method for reverse diffusion"),
+          true
+        };
+        fhicl::Atom<int> diffusionSteps{
+          Name("diffusionSteps"),
+          Comment("Number of diffusion steps for sampling"),
+          200
+        };
+        fhicl::Atom<double> VDz0{
+          Name("VDz0"),
+          Comment("Nominal z coordinate of the virtual detector"),
+          37700.39
+        };
+        fhicl::Atom<double> VDr{
+          Name("VDr"),
+          Comment("Virtual detector radius used in the training transform"),
+          2000.0
+        };
+        fhicl::Atom<bool> doROOTDump{
+          Name("doROOTDump"),
+          Comment("Whether to dump generated samples into a ROOT tree"),
+          false
+        };
+      };
+
+      using Parameters = art::EDProducer::Table<Config>;
+
+      explicit VDResamplerGenerateMix(const Parameters& conf);
+      void produce(art::Event& event) override;
+
+    private:
+      struct ParticleModelSet {
+        int pdgId = 0;
+        int hitCount = 0;
+        bool useTwoStageModel = false;
+        std::unique_ptr<ScoreBasedDiffusionModel> allAtOnceModel;
+        std::unique_ptr<ScoreBasedDiffusionModel> stage1Model;
+        std::unique_ptr<ScoreBasedDiffusionModel> stage2Model;
+      };
+
+      struct SourceSummary {
+        std::string hitSummaryFile;
+        double pots = 0.0;
+        double sourceWeight = 0.0;
+        int trainedHitCount = 0;
+        std::vector<ParticleModelSet> particles;
+      };
+
+      art::RandomNumberGenerator::base_engine_t& engine_;
+      CLHEP::RandFlat randFlat_;
+      CLHEP::RandGaussQ randGaussQ_;
+      std::string modelFileDir_;
+      int virtualDetectorID_ = 0;
+      bool useHeun_ = true;
+      int diffusionSteps_ = 0;
+      double VDz0_ = 0.0;
+      double VDr_ = 0.0;
+      bool doROOTDump_ = false;
+      GlobalConstantsHandle<ParticleDataList> pdt_;
+
+      std::vector<SourceSummary> sources_;
+      double totalSourceWeight_ = 0.0;
+
+      double x0_ = vdresampler::kX0;
+      double y0_ = vdresampler::kY0;
+      double t0_ = vdresampler::kT0;
+      double tScale_ = vdresampler::kTScale;
+      double p0_ = vdresampler::kP0;
+
+      TTree* outTree_ = nullptr;
+      int summaryIndex_ = -1;
+      int pdgId_ = 0;
+      int generationMode_ = 0; // 1 = all-at-once, 2 = two-stage
+      double x_gen_ = 0.0;
+      double y_gen_ = 0.0;
+      double z_gen_ = 0.0;
+      double t_gen_ = 0.0;
+      double px_gen_ = 0.0;
+      double py_gen_ = 0.0;
+      double pz_gen_ = 0.0;
+      double mass_gen_ = 0.0;
+      double E_gen_ = 0.0;
+
+      SourceSummary loadSourceSummary(const std::string& summaryFile, double pots);
+      void generateFromModels(
+        const ParticleModelSet& modelSet,
+        double& xTrans,
+        double& yTrans,
+        double& tTrans,
+        double& prTrans,
+        double& pphiTrans,
+        double& pzTrans
+      ) const;
+    };
+
+  VDResamplerGenerateMix::VDResamplerGenerateMix(const Parameters& conf)
+    : art::EDProducer{conf},
+      engine_(createEngine(art::ServiceHandle<SeedService>()->getSeed())),
+      randFlat_(engine_),
+      randGaussQ_(engine_),
+      modelFileDir_(conf().ModelFileDir()),
+      virtualDetectorID_(conf().VirtualDetectorID()),
+      useHeun_(conf().useHeun()),
+      diffusionSteps_(conf().diffusionSteps()),
+      VDz0_(conf().VDz0()),
+      VDr_(conf().VDr()),
+      doROOTDump_(conf().doROOTDump()) {
+
+    produces<GenParticleCollection>();
+
+    const auto& hitSummaryFiles = conf().hitSummaryFiles();
+    const auto& potsPerFile = conf().potsPerFile();
+    if (hitSummaryFiles.size() != potsPerFile.size()) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "hitSummaryFiles and potsPerFile must have the same length.";
+    }
+    if (hitSummaryFiles.empty()) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "At least one hit summary file must be provided.";
+    }
+
+    double referencePots = 0.0;
+    for (size_t i = 0; i < hitSummaryFiles.size(); ++i) {
+      sources_.push_back(loadSourceSummary(hitSummaryFiles[i], potsPerFile[i]));
+      referencePots = std::max(referencePots, sources_.back().pots);
+    }
+
+    for (auto& source : sources_) {
+      // Use a common POT reference scale so we preserve the desired
+      // proportionality trainedHitCount / pots without ever storing tiny
+      // per-source weights like O(1e-11).
+      source.sourceWeight = static_cast<double>(source.trainedHitCount) * (referencePots / source.pots);
+      totalSourceWeight_ += source.sourceWeight;
+    }
+
+    if (totalSourceWeight_ <= 0.0) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "No trained particles with positive weight were found across the provided summaries.";
+    }
+
+    if (doROOTDump_) {
+      art::ServiceHandle<art::TFileService> tfs;
+      outTree_ = tfs->make<TTree>("ttree", "Generated samples from mixed VD resampler models");
+      outTree_->Branch("summaryIndex", &summaryIndex_, "summaryIndex/I");
+      outTree_->Branch("pdgId", &pdgId_, "pdgId/I");
+      outTree_->Branch("generationMode", &generationMode_, "generationMode/I");
+      outTree_->Branch("x", &x_gen_, "x/D");
+      outTree_->Branch("y", &y_gen_, "y/D");
+      outTree_->Branch("z", &z_gen_, "z/D");
+      outTree_->Branch("time", &t_gen_, "time/D");
+      outTree_->Branch("px", &px_gen_, "px/D");
+      outTree_->Branch("py", &py_gen_, "py/D");
+      outTree_->Branch("pz", &pz_gen_, "pz/D");
+      outTree_->Branch("E", &E_gen_, "E/D");
+    }
+  }
+
+  VDResamplerGenerateMix::SourceSummary VDResamplerGenerateMix::loadSourceSummary(
+    const std::string& summaryFile,
+    const double pots
+  ) {
+    if (pots <= 0.0) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "potsPerFile must be positive for " << summaryFile;
+    }
+
+    std::ifstream in(summaryFile);
+    if (!in) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "Cannot open hit summary file " << summaryFile;
+    }
+
+    SourceSummary source;
+    source.hitSummaryFile = summaryFile;
+    source.pots = pots;
+
+    std::string line;
+    bool firstLine = true;
+    while (std::getline(in, line)) {
+      line = trim(line);
+      if (line.empty()) {
+        continue;
+      }
+      if (firstLine) {
+        firstLine = false;
+        continue; // header
+      }
+
+      const std::vector<std::string> fields = splitCSVLine(line);
+      if (fields.size() < 3) {
+        throw cet::exception("VDResamplerGenerateMix")
+          << "Malformed summary line in " << summaryFile << ": " << line;
+      }
+
+      const int pdgId = std::stoi(fields[0]);
+      const int hitCount = std::stoi(fields[1]);
+      const std::string modeToken = fields[2];
+      if (modeToken == "*") {
+        continue;
+      }
+
+      ParticleModelSet particle;
+      particle.pdgId = pdgId;
+      particle.hitCount = hitCount;
+      particle.useTwoStageModel = (modeToken == "2");
+      if (modeToken != "1" && modeToken != "2") {
+        throw cet::exception("VDResamplerGenerateMix")
+          << "Unexpected model mode token '" << modeToken << "' in " << summaryFile;
+      }
+
+      const std::string pdgToken = pdgIdToFileToken(pdgId);
+      if (particle.useTwoStageModel) {
+        const std::string stage1ModelFile =
+          modelFileDir_ + "/SBDMmodel_stage1_VD" + std::to_string(virtualDetectorID_) + "_pdg" + pdgToken + ".csv";
+        const std::string stage2ModelFile =
+          modelFileDir_ + "/SBDMmodel_stage2_VD" + std::to_string(virtualDetectorID_) + "_pdg" + pdgToken + ".csv";
+
+        particle.stage1Model = std::make_unique<ScoreBasedDiffusionModel>(
+          ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, stage1ModelFile)
+        );
+        particle.stage2Model = std::make_unique<ScoreBasedDiffusionModel>(
+          ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, stage2ModelFile)
+        );
+      } else {
+        const std::string modelFile =
+          modelFileDir_ + "/SBDMmodel_allAtOnce_VD" + std::to_string(virtualDetectorID_) + "_pdg" + pdgToken + ".csv";
+        particle.allAtOnceModel = std::make_unique<ScoreBasedDiffusionModel>(
+          ScoreBasedDiffusionModel::loadModel(randFlat_, randGaussQ_, modelFile)
+        );
+      }
+
+      source.trainedHitCount += hitCount;
+      source.particles.push_back(std::move(particle));
+    }
+
+    if (source.particles.empty()) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "Summary file " << summaryFile << " contains no trained particle entries.";
+    }
+
+    source.sourceWeight = 0.0;
+    return source;
+  }
+
+  void VDResamplerGenerateMix::generateFromModels(
+    const ParticleModelSet& modelSet,
+    double& xTrans,
+    double& yTrans,
+    double& tTrans,
+    double& prTrans,
+    double& pphiTrans,
+    double& pzTrans
+  ) const {
+    if (modelSet.useTwoStageModel) {
+      const std::vector<double> stage1Sample = modelSet.stage1Model->generateSample({}, useHeun_, diffusionSteps_);
+      if (stage1Sample.size() != 3u) {
+        throw cet::exception("VDResamplerGenerateMix")
+          << "Stage-1 model returned " << stage1Sample.size() << " values, expected 3.";
+      }
+
+      tTrans = stage1Sample[0];
+      xTrans = stage1Sample[1];
+      yTrans = stage1Sample[2];
+
+      const std::vector<double> condition = {tTrans, xTrans, yTrans};
+      const std::vector<double> stage2Sample = modelSet.stage2Model->generateSample(condition, useHeun_, diffusionSteps_);
+      if (stage2Sample.size() != 3u) {
+        throw cet::exception("VDResamplerGenerateMix")
+          << "Stage-2 model returned " << stage2Sample.size() << " values, expected 3.";
+      }
+
+      prTrans = stage2Sample[0];
+      pphiTrans = stage2Sample[1];
+      pzTrans = stage2Sample[2];
+    } else {
+      const std::vector<double> sample = modelSet.allAtOnceModel->generateSample({}, useHeun_, diffusionSteps_);
+      if (sample.size() != 6u) {
+        throw cet::exception("VDResamplerGenerateMix")
+          << "All-at-once model returned " << sample.size() << " values, expected 6.";
+      }
+
+      tTrans = sample[0];
+      xTrans = sample[1];
+      yTrans = sample[2];
+      prTrans = sample[3];
+      pphiTrans = sample[4];
+      pzTrans = sample[5];
+    }
+  }
+
+  void VDResamplerGenerateMix::produce(art::Event& event) {
+    auto output = std::make_unique<GenParticleCollection>();
+
+    // Choose which source summary to use according to trained hit yield per POT.
+    double draw = randFlat_.fire() * totalSourceWeight_;
+    size_t chosenSourceIndex = 0;
+    for (; chosenSourceIndex + 1 < sources_.size(); ++chosenSourceIndex) {
+      draw -= sources_[chosenSourceIndex].sourceWeight;
+      if (draw < 0.0) {
+        break;
+      }
+    }
+    const SourceSummary& source = sources_[chosenSourceIndex];
+
+    // Choose particle type according to the hit ratios within the selected source.
+    double totalParticleWeight = 0.0;
+    for (const auto& particle : source.particles) {
+      totalParticleWeight += particle.hitCount;
+    }
+    if (totalParticleWeight <= 0.0) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "Selected source " << source.hitSummaryFile << " has non-positive particle weight.";
+    }
+
+    draw = randFlat_.fire() * totalParticleWeight;
+    size_t chosenParticleIndex = 0;
+    for (; chosenParticleIndex + 1 < source.particles.size(); ++chosenParticleIndex) {
+      draw -= source.particles[chosenParticleIndex].hitCount;
+      if (draw < 0.0) {
+        break;
+      }
+    }
+    const ParticleModelSet& modelSet = source.particles[chosenParticleIndex];
+
+    double xTrans = 0.0;
+    double yTrans = 0.0;
+    double tTrans = 0.0;
+    double prTrans = 0.0;
+    double pphiTrans = 0.0;
+    double pzTrans = 0.0;
+    generateFromModels(modelSet, xTrans, yTrans, tTrans, prTrans, pphiTrans, pzTrans);
+
+    const double u = std::sqrt(xTrans * xTrans + yTrans * yTrans);
+    const double theta = std::atan2(yTrans, xTrans);
+    const double rho = std::tanh(u);
+    const double r = rho * VDr_;
+    const double dx = r * std::cos(theta);
+    const double dy = r * std::sin(theta);
+    x_gen_ = dx + x0_;
+    y_gen_ = dy + y0_;
+    z_gen_ = VDz0_;
+
+    t_gen_ = t0_ * std::exp(tTrans * tScale_);
+
+    const double pr = p0_ * std::sinh(prTrans);
+    const double pphi = p0_ * std::sinh(pphiTrans);
+    pz_gen_ = p0_ * std::sinh(pzTrans);
+
+    if (r > 1e-6) {
+      const double rx = dx / r;
+      const double ry = dy / r;
+      const double phix = -ry;
+      const double phiy = rx;
+      px_gen_ = pr * rx + pphi * phix;
+      py_gen_ = pr * ry + pphi * phiy;
+    } else {
+      px_gen_ = pr;
+      py_gen_ = pphi;
+    }
+
+    pdgId_ = modelSet.pdgId;
+    generationMode_ = modelSet.useTwoStageModel ? 2 : 1;
+    summaryIndex_ = static_cast<int>(chosenSourceIndex);
+
+    mass_gen_ = pdt_->particle(pdgId_).mass();
+    const CLHEP::Hep3Vector momParticle(px_gen_, py_gen_, pz_gen_);
+    const CLHEP::Hep3Vector posParticle(x_gen_, y_gen_, z_gen_);
+    const double eTotal = std::sqrt(momParticle.mag2() + mass_gen_ * mass_gen_);
+    E_gen_ = eTotal - mass_gen_;
+    const CLHEP::HepLorentzVector fourMomParticle(eTotal, momParticle);
+
+    output->emplace_back(
+      PDGCode::type(pdgId_),
+      GenId::STMDownStreamGenTool,
+      posParticle,
+      fourMomParticle,
+      t_gen_
+    );
+
+    event.put(std::move(output));
+
+    if (doROOTDump_) {
+      outTree_->Fill();
+    }
+  }
+
+} // namespace mu2e
+
+DEFINE_ART_MODULE(mu2e::VDResamplerGenerateMix)
diff --git a/STMMC/src/VDResamplerTrain_module.cc b/STMMC/src/VDResamplerTrain_module.cc
new file mode 100644
index 0000000000..14c0f6d843
--- /dev/null
+++ b/STMMC/src/VDResamplerTrain_module.cc
@@ -0,0 +1,397 @@
+// VDResamplerTrain_module.cc
+// For StepPointMCs on the designated virtual detectors, train either:
+//   1) a two-stage score-based diffusion model with
+//      Stage 1: unconditional model for (t', x', y')
+//      Stage 2: conditional model for (p_r', p_phi', p_z' | t', x', y')
+//   2) a single unconditional score-based diffusion model for
+//      (t', x', y', p_r', p_phi', p_z')
+// and store the trained model parameters in CSV files.
+// note that p_z are filtered and only hits with positive p_z are kept
+// Yongyi Wu, Mar. 2026
+
+// stdlib includes
+#include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include "Offline/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh"
+
+#include "CLHEP/Random/RandFlat.h"
+#include "CLHEP/Random/RandGaussQ.h"
+
+// art includes
+#include "art/Framework/Core/EDAnalyzer.h"
+#include "art/Framework/Principal/Event.h"
+#include "art/Framework/Principal/Handle.h"
+#include "art/Framework/Principal/Run.h"
+#include "art/Framework/Services/Optional/RandomNumberGenerator.h"
+#include "art/Framework/Services/Registry/ServiceHandle.h"
+
+// exception handling
+#include "cetlib_except/exception.h"
+
+// fhicl includes
+#include "canvas/Utilities/InputTag.h"
+#include "fhiclcpp/ParameterSet.h"
+#include "fhiclcpp/types/Atom.h"
+
+// message handling
+#include "messagefacility/MessageLogger/MessageLogger.h"
+
+// Offline includes
+#include "Offline/GlobalConstantsService/inc/GlobalConstantsHandle.hh"
+#include "Offline/GlobalConstantsService/inc/ParticleDataList.hh"
+#include "Offline/MCDataProducts/inc/SimParticle.hh"
+#include "Offline/MCDataProducts/inc/StepPointMC.hh"
+#include "Offline/SeedService/inc/SeedService.hh"
+#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+
+typedef cet::map_vector_key key_type;
+typedef unsigned long VolumeId_type;
+
+namespace mu2e {
+  class VDResamplerTrain : public art::EDAnalyzer {
+    public:
+      using Name=fhicl::Name;
+      using Comment=fhicl::Comment;
+      struct Config {
+        fhicl::Atom<art::InputTag> StepPointMCsTag{   Name("StepPointMCsTag"),         Comment("Tag identifying the StepPointMCs")};
+        fhicl::Atom<art::InputTag> SimParticlemvTag{  Name("SimParticlemvTag"),        Comment("Tag identifying the SimParticlemv")};
+        fhicl::Atom<bool> SBDMuseTwoStageTraining{    Name("SBDMuseTwoStageTraining"), Comment("If true, train the two-stage factorized model. If false, train all 6 dimensions at once."), true};
+        fhicl::Atom<std::string> SBDMallAtOnceModelFile{ Name("SBDMallAtOnceModelFile"), Comment("CSV filename for the all-at-once 6D SBDM model parameters"), ""};
+        fhicl::Atom<std::string> SBDMstage1ModelFile{ Name("SBDMstage1ModelFile"),     Comment("CSV filename for the trained stage-1 SBDM model parameters"), ""};
+        fhicl::Atom<std::string> SBDMstage2ModelFile{ Name("SBDMstage2ModelFile"),     Comment("CSV filename for the trained stage-2 SBDM model parameters"), ""};
+        fhicl::Atom<int> VirtualDetectorID{           Name("VirtualDetectorID"),       Comment("ID of the virtual detector to train on"),                        116};
+        fhicl::Atom<double> VDz0{                     Name("VDz0"),                    Comment("z coordinate of the virtual detector"),                          37700.39};
+        fhicl::Atom<double> VDr{                      Name("VDr"),                     Comment("VD radius"),                                                     2000.0 };
+        fhicl::Atom<int> pdgID{                       Name("pdgID"),                   Comment("pdgID of the particle to train on"),                             22};
+        fhicl::Atom<int> SBDMhidden{                  Name("SBDMhidden"),              Comment("Size of hidden layers in the SBDM neural network"),              128};
+        fhicl::Atom<int> SBDMlayers{                  Name("SBDMlayers"),              Comment("Number of layers in the SBDM neural network"),                   4};
+        fhicl::Atom<std::string> SBDMoptimizer{       Name("SBDMoptimizer"),           Comment("Optimizer for training the SBDM neural network (SGD or ADAM)"), "ADAM"};
+        fhicl::Atom<double> SBDMadamBeta1{            Name("SBDMadamBeta1"),           Comment("Adam optimizer beta1 parameter"),                                0.9};
+        fhicl::Atom<double> SBDMadamBeta2{            Name("SBDMadamBeta2"),           Comment("Adam optimizer beta2 parameter"),                                0.999};
+        fhicl::Atom<double> SBDMadamEps{              Name("SBDMadamEps"),             Comment("Adam optimizer epsilon parameter"),                              1e-8};
+        fhicl::Atom<std::string> SBDMnoiseSchedule{   Name("SBDMnoiseSchedule"),       Comment("Noise schedule for the SBDM (LINEAR or COSINE)"),                "COSINE"};
+        fhicl::Atom<double> SBDMbetaMin{              Name("SBDMbetaMin"),             Comment("Minimum noise schedule parameter (for LINEAR schedule)") ,       1e-4};
+        fhicl::Atom<double> SBDMbetaMax{              Name("SBDMbetaMax"),             Comment("Maximum noise schedule parameter (for LINEAR schedule)"),        0.02};
+        fhicl::Atom<double> SBDMcosineOffset{         Name("SBDMcosineOffset"),        Comment("Offset parameter (for cosine schedule)"),                        0.008};
+        fhicl::Atom<int> SBDMbatchSize{               Name("SBDMbatchSize"),           Comment("Batch size for training the SBDM"),                              32};
+        fhicl::Atom<double> SBDMgradientClip{         Name("SBDMgradientClip"),        Comment("Gradient clipping threshold for training the SBDM"),             1.0};
+        fhicl::Atom<double> SBDMlearningRate{         Name("SBDMlearningRate"),        Comment("Learning rate for training the SBDM"),                           1e-3};
+        fhicl::Atom<int> SBDMdiffusionSteps{          Name("SBDMdiffusionSteps"),      Comment("Number of steps in the diffusion process for the SBDM"),         200};
+        fhicl::Atom<int> SBDMtrainingSize{            Name("SBDMtrainingSize"),        Comment("Size of the training data for the SBDM"),                        -1}; // -1 means use all available data
+        fhicl::Atom<int> SBDMtrainingEpochs{          Name("SBDMtrainingEpochs"),      Comment("Number of epochs to train the selected SBDM mode"),              10};
+      };
+      using Parameters = art::EDAnalyzer::Table<Config>;
+      explicit VDResamplerTrain(const Parameters& conf);
+      void analyze(const art::Event& e);
+      void endJob();
+    private:
+      art::RandomNumberGenerator::base_engine_t& engine;
+      CLHEP::RandFlat randFlat_;
+      CLHEP::RandGaussQ randGaussQ_;
+      art::ProductToken<StepPointMCCollection> StepPointMCsToken;
+      art::ProductToken<SimParticleCollection> SimParticlemvToken;
+
+      // SBDM model + data
+      bool useTwoStageTraining = true;
+      std::unique_ptr<ScoreBasedDiffusionModel> allAtOnceModel;
+      std::unique_ptr<ScoreBasedDiffusionModel> stage1Model;
+      std::unique_ptr<ScoreBasedDiffusionModel> stage2Model;
+      std::vector<DiffusionTrainingSample> allAtOnceTrainingData;
+      std::vector<DiffusionTrainingSample> stage1TrainingData;
+      std::vector<DiffusionTrainingSample> stage2TrainingData;
+      std::string SBDMallAtOnceModelFile;
+      std::string SBDMstage1ModelFile;
+      std::string SBDMstage2ModelFile;
+      VolumeId_type VirtualDetectorID = 0;
+      double VDz0 = 0.0;
+      double VDr = 0.0;
+      int pdgID = 0;
+      int trainingEpochs = 0;
+      int trainingSize = -1;
+
+      // variables to read from the art event
+      int stepPdgId = 0;
+      double x = 0.0, y = 0.0, z = 0.0, time = 0.0;
+      double px = 0.0, py = 0.0, pz = 0.0;
+      VolumeId_type virtualdetectorId = 0;
+
+      // transform variables for training data preparation
+      double x0 = vdresampler::kX0;
+      double y0 = vdresampler::kY0;
+      // time scaling
+      double t0 = vdresampler::kT0;
+      double tScale = vdresampler::kTScale;
+  };
+
+  VDResamplerTrain::VDResamplerTrain(const Parameters& conf) :
+    art::EDAnalyzer(conf),
+    engine(createEngine( art::ServiceHandle<SeedService>()->getSeed())),
+    randFlat_(engine),
+    randGaussQ_(engine),
+    StepPointMCsToken(consumes<StepPointMCCollection>(conf().StepPointMCsTag())),
+    SimParticlemvToken(consumes<SimParticleCollection>(conf().SimParticlemvTag())),
+    useTwoStageTraining(conf().SBDMuseTwoStageTraining()),
+    SBDMallAtOnceModelFile(conf().SBDMallAtOnceModelFile()),
+    SBDMstage1ModelFile(conf().SBDMstage1ModelFile()),
+    SBDMstage2ModelFile(conf().SBDMstage2ModelFile()),
+    VirtualDetectorID(conf().VirtualDetectorID()),
+    VDz0(conf().VDz0()),
+    VDr(conf().VDr()),
+    pdgID(conf().pdgID()),
+    trainingEpochs(conf().SBDMtrainingEpochs()),
+    trainingSize(conf().SBDMtrainingSize())
+  {
+    // optimizer selection
+    ScoreBasedDiffusionModel::OptimizerType opt =
+        (conf().SBDMoptimizer() == "SGD") ?
+        ScoreBasedDiffusionModel::OptimizerType::SGD :
+        ScoreBasedDiffusionModel::OptimizerType::ADAM;
+
+    // noise schedule
+    ScoreBasedDiffusionModel::NoiseScheduleType sched =
+        (conf().SBDMnoiseSchedule() == "LINEAR") ?
+        ScoreBasedDiffusionModel::NoiseScheduleType::LINEAR :
+        ScoreBasedDiffusionModel::NoiseScheduleType::COSINE;
+
+    if (useTwoStageTraining) {
+      // create stage-1 model for (t', x', y')
+      stage1Model = std::make_unique<ScoreBasedDiffusionModel>(
+          randFlat_,
+          randGaussQ_,
+          3, // dim
+          0, // conditionDim
+          conf().SBDMhidden(),
+          conf().SBDMlayers(),
+          opt,
+          conf().SBDMadamBeta1(),
+          conf().SBDMadamBeta2(),
+          conf().SBDMadamEps(),
+          sched,
+          conf().SBDMbetaMin(),
+          conf().SBDMbetaMax(),
+          conf().SBDMcosineOffset(),
+          conf().SBDMbatchSize(),
+          conf().SBDMgradientClip(),
+          conf().SBDMlearningRate(),
+          conf().SBDMdiffusionSteps()
+      );
+
+      // create stage-2 model for (p_r', p_phi', p_z' | t', x', y')
+      stage2Model = std::make_unique<ScoreBasedDiffusionModel>(
+          randFlat_,
+          randGaussQ_,
+          3, // dim
+          3, // conditionDim
+          conf().SBDMhidden(),
+          conf().SBDMlayers(),
+          opt,
+          conf().SBDMadamBeta1(),
+          conf().SBDMadamBeta2(),
+          conf().SBDMadamEps(),
+          sched,
+          conf().SBDMbetaMin(),
+          conf().SBDMbetaMax(),
+          conf().SBDMcosineOffset(),
+          conf().SBDMbatchSize(),
+          conf().SBDMgradientClip(),
+          conf().SBDMlearningRate(),
+          conf().SBDMdiffusionSteps()
+      );
+      // allocate memory according to the training size (if specified, otherwise will grow dynamically)
+      if (trainingSize > 0) {
+        stage1TrainingData.reserve(trainingSize);
+        stage2TrainingData.reserve(trainingSize);
+      } else {
+        stage1TrainingData.reserve(1000);
+        stage2TrainingData.reserve(1000);
+      }
+    } else {
+      allAtOnceModel = std::make_unique<ScoreBasedDiffusionModel>(
+          randFlat_,
+          randGaussQ_,
+          6, // dim
+          0, // conditionDim
+          conf().SBDMhidden(),
+          conf().SBDMlayers(),
+          opt,
+          conf().SBDMadamBeta1(),
+          conf().SBDMadamBeta2(),
+          conf().SBDMadamEps(),
+          sched,
+          conf().SBDMbetaMin(),
+          conf().SBDMbetaMax(),
+          conf().SBDMcosineOffset(),
+          conf().SBDMbatchSize(),
+          conf().SBDMgradientClip(),
+          conf().SBDMlearningRate(),
+          conf().SBDMdiffusionSteps()
+      );
+      // allocate memory according to the training size (if specified, otherwise will grow dynamically)
+      if (trainingSize > 0) {
+        allAtOnceTrainingData.reserve(trainingSize);
+      } else {
+        allAtOnceTrainingData.reserve(1000);
+      }
+    }
+  };
+
+  void VDResamplerTrain::analyze(const art::Event& event) {
+    // Get the data products from the event
+    auto const& StepPointMCs = event.getProduct(StepPointMCsToken);
+    if (StepPointMCs.empty())
+      return;
+    auto const& SimParticles = event.getProduct(SimParticlemvToken);
+    if (SimParticles.empty())
+      return;
+
+    // Loop over all VD hits
+    for (const StepPointMC& step : StepPointMCs) {
+      // Get the associated particle
+      const SimParticle& particle = SimParticles.at(step.trackId());
+
+      // Extract the parameters
+      stepPdgId = particle.pdgId();
+      virtualdetectorId = step.virtualDetectorId();
+      time = step.time();
+      x = step.position().x(); // This coordinate is in Mu2e frame, will be shifted to relative x w.r.t. the beamline
+      y = step.position().y();
+      z = step.position().z();
+      px = step.momentum().x();
+      py = step.momentum().y();
+      pz = step.momentum().z();
+
+      if (virtualdetectorId != VirtualDetectorID || (stepPdgId != pdgID && pdgID != 0) || pz <= 0)
+        continue; // Filter hits based on the virtual detector ID, particle type, and pz
+
+      // as z maybe slightly different from the nominal VDz0 due to the step size, we will extrapolate the (x, y) coordinates
+      // to the nominal VDz0 for all hits to compute the training parameters to be fed into the SBDM.
+      double extrapolationFactor = (VDz0 - z) / pz; // Assuming linear motion, this is the factor to extrapolate from current z to z0
+      double x_extrapolated = x + extrapolationFactor * px;
+      double y_extrapolated = y + extrapolationFactor * py;
+
+      // Now we have the extrapolated (x, y) at the nominal VDz0, we can compute the training parameters for the SBDM.
+      // We convert (t, x_extrapolated, y_extrapolated, px, py, pz) to (t', x_extrapolated, y_extrapolated, p_r', p_phi', p_z')
+
+      // shift to detector-centered coordinates
+      double dx = x_extrapolated - x0;
+      double dy = y_extrapolated - y0;
+      // polar coordinates
+      double r = std::sqrt(dx*dx + dy*dy);
+      double rho = r / VDr;
+      // numerical safety (avoid rho >= 1)
+      const double eps = 1e-6;
+      rho = std::min(rho, 1.0 - eps);
+      // boundary-removing transform
+      // u = atanh(r/R)
+      double u = 0.5 * std::log((1.0 + rho) / (1.0 - rho));
+      // angle
+      double theta = std::atan2(dy, dx);
+      // map back to Cartesian-like coordinates
+      double x_trans = u * std::cos(theta);
+      double y_trans = u * std::sin(theta);
+
+      // compute momentum components in the local polar coordinate system (r, phi, z)
+      double pr = 0.0;
+      double pphi = 0.0;
+      if (r > 1e-6) { // avoid division by zero, if r is very small, we can approximate pr ~ px and pphi ~ py
+        double rx = dx / r; // unit vector in the radial direction
+        double ry = dy / r;
+        double phix = -ry; // unit vector in the angular direction)
+        double phiy =  rx;
+        pr   = px*rx + py*ry; // radial momentum component
+        pphi = px*phix + py*phiy;
+      }
+      else {
+        pr = px;
+        pphi = py;
+      }
+      // momentum scaling
+      double p0 = vdresampler::kP0; // tunable scale where I want best resolution
+      double pr_t   = std::asinh(pr   / p0);
+      double pphi_t = std::asinh(pphi / p0);
+      double pz_t   = std::asinh(pz   / p0);
+
+      // time transform
+      double t_safe = std::max(time, 0.1); // avoid log(0)
+      double t_trans = std::log(t_safe / t0) / tScale;
+
+      if (useTwoStageTraining) {
+        DiffusionTrainingSample stage1Sample;
+        stage1Sample.x = {t_trans, x_trans, y_trans};
+        stage1TrainingData.push_back(std::move(stage1Sample));
+
+        DiffusionTrainingSample stage2Sample;
+        stage2Sample.x = {pr_t, pphi_t, pz_t};
+        stage2Sample.cond = {t_trans, x_trans, y_trans};
+        stage2TrainingData.push_back(std::move(stage2Sample));
+      } else {
+        DiffusionTrainingSample allAtOnceSample;
+        allAtOnceSample.x = {t_trans, x_trans, y_trans, pr_t, pphi_t, pz_t};
+        allAtOnceTrainingData.push_back(std::move(allAtOnceSample));
+      }
+    };
+    return;
+  };
+
+  void VDResamplerTrain::endJob() {
+
+      if (useTwoStageTraining && (stage1TrainingData.empty() || stage2TrainingData.empty())) {
+        mf::LogWarning("VDResamplerTrain") << "No training data collected.";
+        return;
+      }
+      if (!useTwoStageTraining && allAtOnceTrainingData.empty()) {
+        mf::LogWarning("VDResamplerTrain") << "No training data collected.";
+        return;
+      }
+
+      // if SBDMtrainingSize is set and smaller than the collected training data,
+      // truncate the training data to the specified size.
+      if (useTwoStageTraining) {
+        if(trainingSize > 0 && (int)stage1TrainingData.size() > trainingSize)
+          stage1TrainingData.resize(trainingSize);
+        if(trainingSize > 0 && (int)stage2TrainingData.size() > trainingSize)
+          stage2TrainingData.resize(trainingSize);
+
+        if (SBDMstage1ModelFile.empty() || SBDMstage2ModelFile.empty()) {
+          throw cet::exception("VDResamplerTrain") << "Two-stage training requires both SBDMstage1ModelFile and SBDMstage2ModelFile.";
+        }
+
+        mf::LogInfo("VDResamplerTrain")
+            << "Training stage-1 diffusion model with " << stage1TrainingData.size()
+            << " samples and " << trainingEpochs << " epochs...";
+        stage1Model->train(stage1TrainingData, trainingEpochs);
+        stage1Model->saveModel(SBDMstage1ModelFile);
+        mf::LogInfo("VDResamplerTrain") << "Stage-1 model saved to " << SBDMstage1ModelFile;
+
+        mf::LogInfo("VDResamplerTrain")
+            << "Training stage-2 diffusion model with " << stage2TrainingData.size()
+            << " samples and " << trainingEpochs << " epochs...";
+        stage2Model->train(stage2TrainingData, trainingEpochs);
+        stage2Model->saveModel(SBDMstage2ModelFile);
+        mf::LogInfo("VDResamplerTrain") << "Stage-2 model saved to " << SBDMstage2ModelFile;
+
+      } else {
+        if(trainingSize > 0 && (int)allAtOnceTrainingData.size() > trainingSize)
+          allAtOnceTrainingData.resize(trainingSize);
+
+        if (SBDMallAtOnceModelFile.empty()) {
+          throw cet::exception("VDResamplerTrain") << "All-at-once training requires SBDMallAtOnceModelFile.";
+        }
+
+        mf::LogInfo("VDResamplerTrain")
+            << "Training all-at-once diffusion model with " << allAtOnceTrainingData.size()
+            << " samples and " << trainingEpochs << " epochs...";
+        allAtOnceModel->train(allAtOnceTrainingData, trainingEpochs);
+        allAtOnceModel->saveModel(SBDMallAtOnceModelFile);
+
+        mf::LogInfo("VDResamplerTrain") << "All-at-once model saved to " << SBDMallAtOnceModelFile;
+      }
+
+     return;
+  };
+
+}; // end namespace mu2e
+
+DEFINE_ART_MODULE(mu2e::VDResamplerTrain)

From 9a255ebd5b640999a9ab4b6b5d73bf3bce28dd28 Mon Sep 17 00:00:00 2001
From: YongyiBWu <47263079+YongyiBWu@users.noreply.github.com>
Date: Sun, 12 Apr 2026 14:16:43 -0400
Subject: [PATCH 2/4] Update STMMC/src/VDResamplerConfigure_module.cc

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 STMMC/src/VDResamplerConfigure_module.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/STMMC/src/VDResamplerConfigure_module.cc b/STMMC/src/VDResamplerConfigure_module.cc
index 82db041969..97e40f14df 100644
--- a/STMMC/src/VDResamplerConfigure_module.cc
+++ b/STMMC/src/VDResamplerConfigure_module.cc
@@ -214,7 +214,14 @@ namespace mu2e {
         // Generate the fcl file for training for this particle type
         // if pdgID is negative, we will use "m" instead of "-" in the filename to avoid issues with file naming
         std::string pdgIdstr = (part.first < 0) ? "m" + std::to_string(-part.first) : std::to_string(part.first);
-        std::string moduleName = "VDResamplerTrainVD"+ std::to_string(VirtualDetectorID) + dataSourceTag + "pdg" + pdgIdstr;
+        std::string sanitizedDataSourceTag = dataSourceTag;
+        for (char& c : sanitizedDataSourceTag) {
+          const bool isAlphaNum = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
+          if (!isAlphaNum && c != '_') {
+            c = '_';
+          }
+        }
+        std::string moduleName = "VDResamplerTrainVD"+ std::to_string(VirtualDetectorID) + sanitizedDataSourceTag + "pdg" + pdgIdstr;
         std::string pathName = "trainPathVD" + std::to_string(VirtualDetectorID) + "pdg" + pdgIdstr;
         trainingPaths.push_back({moduleName, pathName});
         fclOutFile << "    " << moduleName << " : {\n"

From 92e3c8cb47c9d0a479bc223ce36245909b61064b Mon Sep 17 00:00:00 2001
From: YongyiBWu <47263079+YongyiBWu@users.noreply.github.com>
Date: Mon, 13 Apr 2026 00:49:18 -0400
Subject: [PATCH 3/4] Fixes in response to PR #1798 code reviews

Introduce VDResamplerTransforms (new header) and remove legacy VDResamplerTransformDefaults; centralize forward/inverse sample transforms and numeric safety constants. Harden ScoreBasedDiffusionModel by: adding initializeRandomWeights ctor flag, using size_t for trainingSampleSize_, initializing layer buffers safely, guarding weight initialization, adding dimension checks in computeLoss, improving gradient/Adam math/formatting, fixing gradient clipping early-return, validating and reconstructing loaded network shapes (loadModel uses initializeRandomWeights=false), and various style/robustness fixes. Update STMMC modules (Configure/GenerateFromModel/GenerateMix/Train) to use the new transforms, add runtime validation for VDr/VDz0, improve config parsing with warnings, initialize pointers, replace inlined transform code with calls into VDResampler, and minor API/constructor fixes (HepLorentzVector usage). Misc: remove unused typedefs, add small constants (kTwoStageTrainingHitThreshold), includes, and other small cleanups to improve stability and correctness.
---
 .../inc/ScoreBasedDiffusionModel.hh           |  43 ++--
 .../src/ScoreBasedDiffusionModel.cc           | 216 +++++++++++-------
 STMMC/inc/VDResamplerTransformDefaults.hh     |  13 --
 STMMC/inc/VDResamplerTransforms.hh            | 148 ++++++++++++
 STMMC/src/VDResamplerConfigure_module.cc      |   9 +-
 .../VDResamplerGenerateFromModel_module.cc    |  77 ++++---
 STMMC/src/VDResamplerGenerateMix_module.cc    |  73 +++---
 STMMC/src/VDResamplerTrain_module.cc          | 134 +++++------
 8 files changed, 456 insertions(+), 257 deletions(-)
 delete mode 100644 STMMC/inc/VDResamplerTransformDefaults.hh
 create mode 100644 STMMC/inc/VDResamplerTransforms.hh

diff --git a/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh b/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
index 2bf4c7c697..a7cfd78659 100644
--- a/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
+++ b/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh
@@ -45,24 +45,26 @@ namespace mu2e{
         // Constructor: Initialize diffusion model with CLHEP random distributions.
         //
         // Parameters:
-        //   randFlat              - Reference to CLHEP RandFlat for uniform sampling (externally managed)
-        //   randGaussQ            - Reference to CLHEP RandGaussQ for Gaussian noise (externally managed)
-        //   dim                   - Dimensionality of the state space
-        //   conditionDim          - Dimensionality of the optional conditioning vector (default: 0 for unconditional model)
-        //   hidden                - Size of hidden layers in the neural network
-        //   layers                - Number of layers in the network
-        //   optimizerType         - Type of optimizer to use (SGD or ADAM, default: ADAM)
-        //   adamBeta1             - Adam optimizer beta1 parameter (default: 0.9)
-        //   adamBeta2             - Adam optimizer beta2 parameter (default: 0.999)
-        //   adamEps               - Adam optimizer epsilon parameter (default: 1e-8)
-        //   scheduleType          - Type of noise schedule (LINEAR or COSINE, default: COSINE)
-        //   betaMin               - Minimum noise schedule parameter (for LINEAR schedule, default: 1e-4)
-        //   betaMax               - Maximum noise schedule parameter (for LINEAR schedule, default: 0.02)
-        //   cosineOffset          - Offset parameter (for cosine schedule, default: 0.008)
-        //   batchSize             - Batch size for training (default: 32)
-        //   gradientClipThreshold - Threshold for gradient clipping (default: 1.0)
-        //   learningRate          - Learning rate for training (default: 1e-3)
-        //   diffusionSteps        - Number of steps in the diffusion process (default: 200)
+        //   randFlat                - Reference to CLHEP RandFlat for uniform sampling (externally managed)
+        //   randGaussQ              - Reference to CLHEP RandGaussQ for Gaussian noise (externally managed)
+        //   dim                     - Dimensionality of the state space
+        //   conditionDim            - Dimensionality of the optional conditioning vector (default: 0 for unconditional model)
+        //   hidden                  - Size of hidden layers in the neural network
+        //   layers                  - Number of layers in the network
+        //   optimizerType           - Type of optimizer to use (SGD or ADAM, default: ADAM)
+        //   adamBeta1               - Adam optimizer beta1 parameter (default: 0.9)
+        //   adamBeta2               - Adam optimizer beta2 parameter (default: 0.999)
+        //   adamEps                 - Adam optimizer epsilon parameter (default: 1e-8)
+        //   scheduleType            - Type of noise schedule (LINEAR or COSINE, default: COSINE)
+        //   betaMin                 - Minimum noise schedule parameter (for LINEAR schedule, default: 1e-4)
+        //   betaMax                 - Maximum noise schedule parameter (for LINEAR schedule, default: 0.02)
+        //   cosineOffset            - Offset parameter (for cosine schedule, default: 0.008)
+        //   batchSize               - Batch size for training (default: 32)
+        //   gradientClipThreshold   - Threshold for gradient clipping (default: 1.0)
+        //   learningRate            - Learning rate for training (default: 1e-3)
+        //   diffusionSteps          - Number of steps in the diffusion process (default: 200)
+        //   initializeRandomWeights - If true, initialize network weights from random Gaussian draws.
+        //                             Set false when constructing (loading) from a saved model.
         ScoreBasedDiffusionModel(
             // Network architecture parameters
             CLHEP::RandFlat& randFlat,
@@ -86,7 +88,8 @@ namespace mu2e{
             double gradientClipThreshold = 1.0,
             double learningRate = 1e-3,
             // Diffusion process configuration
-            int diffusionSteps = 200
+            int diffusionSteps = 200,
+            bool initializeRandomWeights = true
         );
 
         // Train the score network on a batch of samples.
@@ -312,7 +315,7 @@ namespace mu2e{
         // Training state
         double runningLoss_;  // Accumulated loss for monitoring during training
         int adamStep_; // Step counter for Adam optimizer (used to compute bias-corrected moment estimates)
-        int trainingSampleSize_; // Total number of training samples
+        size_t trainingSampleSize_; // Total number of training samples
 
         // Container for tracking training loss over epochs
         std::vector<double> epochLosses_;
diff --git a/MachineLearningTools/src/ScoreBasedDiffusionModel.cc b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
index 5cf8b2dc34..31ed371129 100644
--- a/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
+++ b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
@@ -56,7 +56,8 @@ namespace mu2e {
         int batchSize,
         double gradientClipThreshold,
         double learningRate,
-        int diffusionSteps
+        int diffusionSteps,
+        bool initializeRandomWeights
     ) : randFlat_(randFlat), randGaussQ_(randGaussQ),
         dim_(dim), conditionDim_(conditionDim), hidden_(hidden), layers_(layers),
         optimizerType_(optimizerType),
@@ -102,10 +103,10 @@ namespace mu2e {
             Layer layer;
 
             // Allocate weights
-            layer.W.resize(out, std::vector<double>(in));
+            layer.W.resize(out, std::vector<double>(in, 0.0));
 
             // Allocate biases
-            layer.b.resize(out);
+            layer.b.resize(out, 0.0);
 
             // Allocate gradient buffers
             layer.gradW.resize(out, std::vector<double>(in, 0.0));
@@ -118,18 +119,19 @@ namespace mu2e {
             layer.mb.resize(out, 0.0);
             layer.vb.resize(out, 0.0);
 
-            // --------------------------------------------------------
-            // Weight initialization
-            //
-            // Small Gaussian initialization improves training stability
-            // --------------------------------------------------------
-
-            for (int i = 0; i < out; ++i) {
-                for (int j = 0; j < in; ++j) {
-                    layer.W[i][j] = weightInitScale * randGaussQ_.fire();
-                    // this generates a Gaussian random number with mean=0 and sigma=weightInitScale
+            if (initializeRandomWeights) {
+                // --------------------------------------------------------
+                // Weight initialization
+                //
+                // Small Gaussian initialization improves training stability
+                // --------------------------------------------------------
+                for (int i = 0; i < out; ++i) {
+                    for (int j = 0; j < in; ++j) {
+                        layer.W[i][j] = weightInitScale * randGaussQ_.fire();
+                        // this generates a Gaussian random number with mean=0 and sigma=weightInitScale
+                    }
+                    layer.b[i] = 0.0;
                 }
-                layer.b[i] = 0.0;
             }
 
             if (layer.W.empty() || layer.W[0].size() != static_cast<size_t>(in) || //Check that weight matrix has correct input dimension
@@ -227,10 +229,10 @@ namespace mu2e {
 
             // Apply activation function (SiLU) to get the output of the current layer.
             std::vector<double> y(z.size());
-            if(l != network_.size()-1) // No activation on the last layer (output layer), as it predicts the score directly.
-            {
-                for (size_t i=0;i<z.size();++i)
+            if (l != network_.size() - 1) { // No activation on the last layer (output layer), as it predicts the score directly.
+                for (size_t i = 0; i < z.size(); ++i) {
                     y[i] = silu(z[i]);
+                }
             }
             else
                 y = z;
@@ -263,24 +265,19 @@ namespace mu2e {
             std::vector<double> gradZ(grad.size());
 
             // For the output layer, the gradient is directly from the loss w.r.t. output (no activation function).
-            if(l == static_cast<int>(network_.size())-1)
-            {
+            if (l == static_cast<int>(network_.size()) - 1) {
                 gradZ = grad;
-            }
-            // For hidden layers, apply the derivative of the activation function (SiLU) to the gradient.
-            else
-            {
-                for (size_t i=0;i<grad.size();++i)
-                    gradZ[i] = grad[i]*siluDeriv(z[i]);
+            } else { // For hidden layers, apply the derivative of the activation function (SiLU) to the gradient.
+                for (size_t i = 0; i < grad.size(); ++i) {
+                    gradZ[i] = grad[i] * siluDeriv(z[i]);
+                }
             }
 
             // Compute gradients w.r.t. weights and biases for the current layer
-            for (size_t i = 0; i < layer.W.size(); i++)
-            {
-                for (size_t j=0;j<layer.W[i].size();++j)
-                {
+            for (size_t i = 0; i < layer.W.size(); i++) {
+                for (size_t j = 0; j < layer.W[i].size(); ++j) {
                     // Gradient of loss w.r.t. weight W[i][j]
-                    layer.gradW[i][j] += gradZ[i]*aPrev[j];
+                    layer.gradW[i][j] += gradZ[i] * aPrev[j];
                 }
 
                 // Gradient of loss w.r.t. bias b[i]
@@ -294,11 +291,13 @@ namespace mu2e {
                     << "Encountered empty layer weights during backward pass";
             }
             // Compute gradient w.r.t. input of the current layer to propagate to the previous layer
-            std::vector<double> gradPrev(layer.W[0].size(),0.0);
+            std::vector<double> gradPrev(layer.W[0].size(), 0.0);
 
-            for (size_t j=0;j<gradPrev.size();++j)
-                for (size_t i=0;i<layer.W.size();++i)
-                    gradPrev[j] += layer.W[i][j]*gradZ[i];
+            for (size_t j = 0; j < gradPrev.size(); ++j) {
+                for (size_t i = 0; i < layer.W.size(); ++i) {
+                    gradPrev[j] += layer.W[i][j] * gradZ[i];
+                }
+            }
 
             grad = gradPrev;
         }
@@ -364,20 +363,18 @@ namespace mu2e {
                     << "Encountered malformed layer during Adam update";
             }
 
-            for (size_t i=0;i<layer.W.size();++i)
-            {
+            for (size_t i = 0; i < layer.W.size(); ++i) {
                 // Update weights using Adam update rule
-                for (size_t j=0;j<layer.W[i].size();++j)
-                {
+                for (size_t j = 0; j < layer.W[i].size(); ++j) {
                     double g = layer.gradW[i][j];
 
-                    layer.mW[i][j] = adamBeta1_*layer.mW[i][j] + (1-adamBeta1_)*g;
-                    layer.vW[i][j] = adamBeta2_*layer.vW[i][j] + (1-adamBeta2_)*g*g;
+                    layer.mW[i][j] = adamBeta1_ * layer.mW[i][j] + (1 - adamBeta1_) * g;
+                    layer.vW[i][j] = adamBeta2_ * layer.vW[i][j] + (1 - adamBeta2_) * g * g;
 
-                    double mhat = layer.mW[i][j]/(1-std::pow(adamBeta1_,adamStep_));
-                    double vhat = layer.vW[i][j]/(1-std::pow(adamBeta2_,adamStep_));
+                    double mhat = layer.mW[i][j] / (1 - std::pow(adamBeta1_, adamStep_));
+                    double vhat = layer.vW[i][j] / (1 - std::pow(adamBeta2_, adamStep_));
 
-                    layer.W[i][j] -= lr*mhat/(std::sqrt(vhat)+adamEps_);
+                    layer.W[i][j] -= lr * mhat / (std::sqrt(vhat) + adamEps_);
 
                     layer.gradW[i][j] = 0.0;
                 }
@@ -385,13 +382,13 @@ namespace mu2e {
                 // Update bias using Adam update rule
                 double g = layer.gradb[i];
 
-                layer.mb[i] = adamBeta1_*layer.mb[i] + (1-adamBeta1_)*g;
-                layer.vb[i] = adamBeta2_*layer.vb[i] + (1-adamBeta2_)*g*g;
+                layer.mb[i] = adamBeta1_ * layer.mb[i] + (1 - adamBeta1_) * g;
+                layer.vb[i] = adamBeta2_ * layer.vb[i] + (1 - adamBeta2_) * g * g;
 
-                double mhat = layer.mb[i]/(1-std::pow(adamBeta1_,adamStep_));
-                double vhat = layer.vb[i]/(1-std::pow(adamBeta2_,adamStep_));
+                double mhat = layer.mb[i] / (1 - std::pow(adamBeta1_, adamStep_));
+                double vhat = layer.vb[i] / (1 - std::pow(adamBeta2_, adamStep_));
 
-                layer.b[i] -= lr*mhat/(std::sqrt(vhat)+adamEps_);
+                layer.b[i] -= lr * mhat / (std::sqrt(vhat) + adamEps_);
 
                 layer.gradb[i] = 0.0;
             }
@@ -418,8 +415,8 @@ namespace mu2e {
         std::vector<double> xt(dim_);
 
         for (int i = 0; i < dim_; ++i) {
-            eps[i] = randGaussQ_.fire();   // Gaussian N(0,1)
-            xt[i]  = x[i] + s * eps[i];
+            eps[i] = randGaussQ_.fire(); // Gaussian N(0,1)
+            xt[i] = x[i] + s * eps[i];
         }
 
         return xt;
@@ -429,16 +426,27 @@ namespace mu2e {
     double ScoreBasedDiffusionModel::computeLoss(
         const std::vector<double>& score,
         const std::vector<double>& target
-    ) const{
+    ) const {
+
+        if (score.size() != static_cast<size_t>(dim_)) {
+            throw cet::exception("ScoreBasedDiffusionModel::computeLoss")
+                << "Score dimension mismatch: got " << score.size()
+                << ", expected " << dim_;
+        }
+        if (target.size() != static_cast<size_t>(dim_)) {
+            throw cet::exception("ScoreBasedDiffusionModel::computeLoss")
+                << "Target dimension mismatch: got " << target.size()
+                << ", expected " << dim_;
+        }
 
         double loss = 0.0;
 
-        for (int i=0;i<dim_;++i){
-            double d = score[i]-target[i];
-            loss += d*d;
+        for (int i = 0; i < dim_; ++i) {
+            double d = score[i] - target[i];
+            loss += d * d;
         }
 
-        return loss/dim_;
+        return loss / dim_;
     }
 
     // Clip gradients to prevent exploding gradients during training. This is done by scaling down
@@ -460,8 +468,9 @@ namespace mu2e {
         norm = std::sqrt(norm);
 
         // If the norm is below the threshold, no clipping is needed.
-        if(norm <= maxNorm)
+        if (norm <= maxNorm) {
             return;
+        }
 
         // Scale down the gradients to have the specified maximum norm.
         double scale = maxNorm / norm;
@@ -554,8 +563,9 @@ namespace mu2e {
 
                 // The target score is the negative of the noise scaled by the noise standard deviation, i.e., -eps/sigma(t).
                 std::vector<double> target(dim_);
-                for (int i=0;i<dim_;++i)
-                    target[i] = -eps[i]/std::max(s, eps_safe); // add eps_safe to prevent division by zero in case of very small sigma
+                for (int i = 0; i < dim_; ++i) {
+                    target[i] = -eps[i] / std::max(s, eps_safe); // add eps_safe to prevent division by zero in case of very small sigma
+                }
 
                 // Prepare input vector for the network by concatenating the noisy sample xt with the diffusion time t.
                 std::vector<double> input = xt;
@@ -572,15 +582,14 @@ namespace mu2e {
                 // Forward pass to compute the predicted score from the network given the input (noisy sample + time).
                 auto score = forward(input);
 
-                // Compute the loss (Mean Squared Error) between the predicted score and the target score.
+                // Compute the loss (Mean Squared Error) per dimension between the predicted score and the target score.
+                double loss = computeLoss(score, target);
+
+                // Compute the gradient of the loss w.r.t. the predicted score
+                // Gradient of the loss w.r.t. the predicted score is 2*(score-target)/dim_ (the division by dim_ is for averaging the loss per dimension).
                 std::vector<double> grad(dim_);
-                double loss=0.0;
-                for (int i=0;i<dim_;++i)
-                {
-                    double d = score[i]-target[i];
-                    loss += d*d;
-                    // Gradient of the loss w.r.t. the predicted score is 2*(score-target)/dim_ (the division by dim_ is for averaging the loss per dimension).
-                    grad[i] = 2*d/dim_;
+                for (int i = 0; i < dim_; ++i) {
+                    grad[i] = 2.0 * (score[i] - target[i]) / dim_;
                 }
 
                 // Backward pass to compute gradients of the loss w.r.t. network parameters using the computed gradient of the loss w.r.t. the predicted score.
@@ -589,8 +598,7 @@ namespace mu2e {
                 // Increment batch counter and apply optimizer step if batch size is reached. This allows for vectorized updates after processing a batch of
                 // samples, which can improve training efficiency and convergence.
                 batchCounter++;
-                if(batchCounter == batchSize_)
-                {
+                if (batchCounter == batchSize_) {
                     // Average accumulated gradients so optimizer step size is independent of batch size.
                     const double invBatch = 1.0 / static_cast<double>(batchCounter);
                     for (auto& layer : network_) {
@@ -617,8 +625,7 @@ namespace mu2e {
             }
 
             // Final flush: apply optimizer to remaining gradients
-            if(batchCounter > 0)
-            {
+            if (batchCounter > 0) {
                 // Average by the true final mini-batch size (which may be < batchSize_).
                 const double invBatch = 1.0 / static_cast<double>(batchCounter);
                 for (auto& layer : network_) {
@@ -900,7 +907,9 @@ namespace mu2e {
                         epochLosses.reserve(numEpochs);
                     } else if (tokens[0] == "trainingSampleSize")
                     {
-                        // We can store this if needed for analysis, but it is not used in model reconstruction, so we will ignore it for now.
+                        // We can store this if needed for analysis, but it is not used in model reconstruction.
+                        // If needed later, parse as size_t to avoid truncation, e.g. size_t trainingSampleSize = std::stoull(tokens[1]);
+                        // and apply checked conversion only when assigning to narrower types.
                     }else {
                         epochLosses.push_back(std::stod(tokens[1]));
                     }
@@ -934,7 +943,7 @@ namespace mu2e {
                << "but does NOT resume training from the original state.";
         mf::LogInfo("ScoreBasedDiffusionModel::loadModel") << logMsg.str();
 
-        // Reconstruct model
+        // Reconstruct model without random weight initialization
         ScoreBasedDiffusionModel model(
             randFlat,
             randGaussQ,
@@ -953,10 +962,42 @@ namespace mu2e {
             batchSize,
             gradientClipThreshold,
             learningRate,
-            diffusionSteps
+            diffusionSteps,
+            false
         );
 
-        // Overwrite initialized weights with loaded values
+        // Validate loaded network shape and dimensions match the initialized model before overwriting.
+        if (loadedNetwork.size() != model.network_.size()) {
+            throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                << "Layer count mismatch: loaded " << loadedNetwork.size()
+                << " layers, expected " << model.network_.size()
+                << ". The model file may be missing or malformed in [NETWORK_PARAMETERS] (e.g. numLayers).";
+        }
+        for (size_t l = 0; l < loadedNetwork.size(); ++l) {
+            if (loadedNetwork[l].W.empty() || loadedNetwork[l].W[0].empty() || loadedNetwork[l].b.empty()) {
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                    << "Loaded layer " << l
+                    << " is not fully sized before overwrite (empty weights/biases).";
+            }
+            // Check that loaded layer dimensions match the model's initialized dimensions
+            if (loadedNetwork[l].W.size() != model.network_[l].W.size() ||
+                loadedNetwork[l].W[0].size() != model.network_[l].W[0].size()) {
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                    << "Layer " << l << " weight dimension mismatch: loaded W["
+                    << loadedNetwork[l].W.size() << "]["
+                    << loadedNetwork[l].W[0].size() << "], expected W["
+                    << model.network_[l].W.size() << "]["
+                    << model.network_[l].W[0].size() << "].";
+            }
+            if (loadedNetwork[l].b.size() != model.network_[l].b.size()) {
+                throw cet::exception("ScoreBasedDiffusionModel::loadModel")
+                    << "Layer " << l << " bias dimension mismatch: loaded b["
+                    << loadedNetwork[l].b.size() << "], expected b["
+                    << model.network_[l].b.size() << "].";
+            }
+        }
+
+        // Allocate weights with loaded values
         for (size_t l = 0; l < model.network_.size(); ++l) {
             model.network_[l].W = loadedNetwork[l].W;
             model.network_[l].b = loadedNetwork[l].b;
@@ -984,8 +1025,9 @@ namespace mu2e {
         // Start from pure noise
         std::vector<double> x(dim_);
 
-        for (int i=0;i<dim_;++i)
+        for (int i = 0; i < dim_; ++i) {
             x[i] = randGaussQ_.fire();
+        }
 
         // Reverse diffusion process
         for (int step = steps - 1; step >= 0; --step) {
@@ -1002,8 +1044,8 @@ namespace mu2e {
 
                 auto score = forward(input);
 
-                for (int i=0;i<dim_;++i){
-                    x[i] += -s*s*score[i]*dt;
+                for (int i = 0; i < dim_; ++i) {
+                    x[i] += -s * s * score[i] * dt;
                 }
             } else {
                 // Heun's method (2nd order)
@@ -1014,13 +1056,15 @@ namespace mu2e {
                 auto score_k1 = forward(input);
 
                 std::vector<double> k1(dim_);
-                for (int i=0;i<dim_;++i)
-                    k1[i] = -s*s*score_k1[i];
+                for (int i = 0; i < dim_; ++i) {
+                    k1[i] = -s * s * score_k1[i];
+                }
 
                 // predictor
                 std::vector<double> x_pred(dim_);
-                for (int i=0;i<dim_;++i)
-                    x_pred[i] = x[i] + k1[i]*dt;
+                for (int i = 0; i < dim_; ++i) {
+                    x_pred[i] = x[i] + k1[i] * dt;
+                }
 
                 // next time
                 double t_next = std::max(0.0, t - dt);
@@ -1033,12 +1077,14 @@ namespace mu2e {
                 auto score_k2 = forward(input_next);
 
                 std::vector<double> k2(dim_);
-                for (int i=0;i<dim_;++i)
-                    k2[i] = -s_next*s_next*score_k2[i];
+                for (int i = 0; i < dim_; ++i) {
+                    k2[i] = -s_next * s_next * score_k2[i];
+                }
 
                 // trapezoidal update
-                for (int i=0;i<dim_;++i)
-                    x[i] += 0.5*(k1[i]+k2[i])*dt;
+                for (int i = 0; i < dim_; ++i) {
+                    x[i] += 0.5 * (k1[i] + k2[i]) * dt;
+                }
             }
         }
 
diff --git a/STMMC/inc/VDResamplerTransformDefaults.hh b/STMMC/inc/VDResamplerTransformDefaults.hh
deleted file mode 100644
index ada0d1d7f1..0000000000
--- a/STMMC/inc/VDResamplerTransformDefaults.hh
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-namespace mu2e {
-  namespace vdresampler {
-
-    constexpr double kX0 = -3904.0;
-    constexpr double kY0 = 0.0;
-    constexpr double kT0 = 1700.0;    // ns
-    constexpr double kTScale = 1.0;
-    constexpr double kP0 = 1.0;
-
-  }  // namespace vdresampler
-}  // namespace mu2e
diff --git a/STMMC/inc/VDResamplerTransforms.hh b/STMMC/inc/VDResamplerTransforms.hh
new file mode 100644
index 0000000000..b6e4178b07
--- /dev/null
+++ b/STMMC/inc/VDResamplerTransforms.hh
@@ -0,0 +1,148 @@
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+
+namespace mu2e {
+  namespace VDResampler {
+
+    // nominal center of the detector
+    constexpr double kX0 = -3904.0;
+    constexpr double kY0 = 0.0;
+    // sensitive time scale
+    constexpr double kT0 = 1700.0; // ns
+    constexpr double kTScale = 1.0;
+    // tunable momentum scale
+    constexpr double kP0 = 1.0; // MeV/c
+
+    // safety constants for numerical stability in the forward and inverse transforms
+    constexpr double kRadiusSafetyEpsilon = 1e-6;
+    constexpr double kMinSafeTime = 0.1;
+
+    // Transform detector-space quantities to model-space variables used for training.
+    inline void forwardTransformSample(
+      const double x,
+      const double y,
+      const double z,
+      const double t,
+      const double px,
+      const double py,
+      const double pz,
+      const double x0,
+      const double y0,
+      const double t0,
+      const double tScale,
+      const double p0,
+      const double VDr,
+      const double VDz0,
+      double& xTrans,
+      double& yTrans,
+      double& tTrans,
+      double& prTrans,
+      double& pphiTrans,
+      double& pzTrans
+    ) {
+      // as z maybe slightly different from the nominal VDz0 due to the step size, we will extrapolate the (x, y) coordinates
+      // to the nominal VDz0 for all hits to compute the training parameters to be fed into the SBDM.
+      const double extrapolationFactor = (VDz0 - z) / pz;
+      const double xExtrapolated = x + extrapolationFactor * px;
+      const double yExtrapolated = y + extrapolationFactor * py;
+
+      // Now we have the extrapolated (x, y) at the nominal VDz0, we can compute the training parameters for the SBDM.
+      // We convert (t, x_extrapolated, y_extrapolated, px, py, pz) to (t', x_extrapolated, y_extrapolated, p_r', p_phi', p_z')
+
+      // Shift to detector-centered coordinates and map to transformed position variables.
+      const double dx = xExtrapolated - x0;
+      const double dy = yExtrapolated - y0;
+      // polar coordinates
+      const double r = std::sqrt(dx * dx + dy * dy);
+      double rho = r / VDr;
+      // numerical safety (avoid rho >= 1)
+      rho = std::min(rho, 1.0 - kRadiusSafetyEpsilon);
+      // boundary-removing transform
+      // u = atanh(r/R)
+      const double u = 0.5 * std::log((1.0 + rho) / (1.0 - rho));
+      // angle
+      const double theta = std::atan2(dy, dx);
+      // map back to Cartesian-like coordinates
+      xTrans = u * std::cos(theta);
+      yTrans = u * std::sin(theta);
+
+      // compute momentum components in the local polar coordinate system (r, phi, z)
+      double pr = 0.0;
+      double pphi = 0.0;
+      if (r > kRadiusSafetyEpsilon) { // avoid division by zero, if r is very small, we can approximate pr ~ px and pphi ~ py
+        const double rx = dx / r; // unit vector in the radial direction
+        const double ry = dy / r;
+        const double phix = -ry; // unit vector in the angular direction
+        const double phiy = rx;
+        pr = px * rx + py * ry; // radial momentum component
+        pphi = px * phix + py * phiy; // angular momentum component
+      } else {
+        pr = px;
+        pphi = py;
+      }
+      // momentum scaling
+      prTrans = std::asinh(pr / p0); // tunable scale where I want best resolution
+      pphiTrans = std::asinh(pphi / p0);
+      pzTrans = std::asinh(pz / p0);
+
+      // time transform
+      const double tSafe = (t > kMinSafeTime) ? t : kMinSafeTime; // avoid log(0)
+      tTrans = std::log(tSafe / t0) / tScale;
+    }
+
+    // Invert transformed sample coordinates/momenta back to detector-space quantities.
+    inline void invertGeneratedSample(
+      const double xTrans,
+      const double yTrans,
+      const double tTrans,
+      const double prTrans,
+      const double pphiTrans,
+      const double pzTrans,
+      const double x0,
+      const double y0,
+      const double t0,
+      const double tScale,
+      const double p0,
+      const double VDr,
+      const double VDz0,
+      double& x,
+      double& y,
+      double& z,
+      double& t,
+      double& px,
+      double& py,
+      double& pz
+    ) {
+      const double u = std::sqrt(xTrans * xTrans + yTrans * yTrans);
+      const double theta = std::atan2(yTrans, xTrans);
+      const double rho = std::tanh(u);
+      const double r = rho * VDr;
+      const double dx = r * std::cos(theta);
+      const double dy = r * std::sin(theta);
+      x = dx + x0;
+      y = dy + y0;
+      z = VDz0;
+
+      t = t0 * std::exp(tTrans * tScale);
+
+      const double pr = p0 * std::sinh(prTrans);
+      const double pphi = p0 * std::sinh(pphiTrans);
+      pz = p0 * std::sinh(pzTrans);
+
+      if (r > kRadiusSafetyEpsilon) {
+        const double rx = dx / r;
+        const double ry = dy / r;
+        const double phix = -ry;
+        const double phiy = rx;
+        px = pr * rx + pphi * phix;
+        py = pr * ry + pphi * phiy;
+      } else {
+        px = pr;
+        py = pphi;
+      }
+    }
+
+  } // namespace VDResampler
+} // namespace mu2e
diff --git a/STMMC/src/VDResamplerConfigure_module.cc b/STMMC/src/VDResamplerConfigure_module.cc
index 97e40f14df..3fb0776da0 100644
--- a/STMMC/src/VDResamplerConfigure_module.cc
+++ b/STMMC/src/VDResamplerConfigure_module.cc
@@ -43,10 +43,13 @@
 #include "art_root_io/TFileService.h"
 #include "TTree.h"
 
-typedef cet::map_vector_key key_type;
 typedef unsigned long VolumeId_type;
 
 namespace mu2e {
+  namespace {
+    constexpr int kTwoStageTrainingHitThreshold = 100000;
+  }
+
   class VDResamplerConfigure : public art::EDAnalyzer {
     public:
       using Name=fhicl::Name;
@@ -79,7 +82,7 @@ namespace mu2e {
       double x = 0.0, y = 0.0, z = 0.0, px = 0.0, py = 0.0, pz = 0.0, mass = 0.0, E = 0.0, time = 0.0;
       VolumeId_type virtualdetectorId = 0;
       int nThrownEvents = 0;
-      TTree* ttree;
+      TTree* ttree = nullptr;
       std::map<int, int> pdgIds; // <id, count>
   };
 
@@ -198,7 +201,7 @@ namespace mu2e {
     for (const auto& part : pdgIds) {
       sumOutFile << part.first << "," << part.second;
       bool useTwoStageTraining = false;
-      if (part.second < 100000) {
+      if (part.second < kTwoStageTrainingHitThreshold) {
         // if data is limited, use two-stage training to improve performance and reduce the required training data size for each model.
         useTwoStageTraining = true;
       }
diff --git a/STMMC/src/VDResamplerGenerateFromModel_module.cc b/STMMC/src/VDResamplerGenerateFromModel_module.cc
index 9f72ccbea0..5967634c87 100644
--- a/STMMC/src/VDResamplerGenerateFromModel_module.cc
+++ b/STMMC/src/VDResamplerGenerateFromModel_module.cc
@@ -10,6 +10,7 @@
 #include <cmath>
 #include <fstream>
 #include <iostream>
+#include <memory>
 #include <string>
 #include <utility>
 
@@ -43,7 +44,7 @@
 #include "Offline/MCDataProducts/inc/GenId.hh"
 #include "Offline/MCDataProducts/inc/GenParticle.hh"
 #include "Offline/SeedService/inc/SeedService.hh"
-#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+#include "Offline/STMMC/inc/VDResamplerTransforms.hh"
 
 // ROOT includes
 #include "art_root_io/TFileService.h"
@@ -158,11 +159,11 @@ namespace mu2e {
       int pdgId_ = 0;
 
       // Detector-center parameters used in the same transform as training.
-      double x0_ = vdresampler::kX0;
-      double y0_ = vdresampler::kY0;
-      double t0_ = vdresampler::kT0;
-      double tScale_ = vdresampler::kTScale;
-      double p0_ = vdresampler::kP0;
+      double x0_ = VDResampler::kX0;
+      double y0_ = VDResampler::kY0;
+      double t0_ = VDResampler::kT0;
+      double tScale_ = VDResampler::kTScale;
+      double p0_ = VDResampler::kP0;
 
       // Variables for optional ROOT dump.
       TTree* outTree_ = nullptr;
@@ -194,6 +195,16 @@ namespace mu2e {
 
     produces<GenParticleCollection>();
 
+    if (VDr_ <= 0.0) {
+      throw cet::exception("VDResamplerGenerateFromModel")
+        << "VDr must be positive (got " << VDr_ << "); "
+        << "rho = r/VDr would produce inf/NaN in generated samples.";
+    }
+    if (!std::isfinite(VDz0_)) {
+      throw cet::exception("VDResamplerGenerateFromModel")
+        << "VDz0 must be finite (got " << VDz0_ << ").";
+    }
+
     if (useTwoStageModel_) {
       if (stage1ModelFile_.empty() || stage2ModelFile_.empty()) {
         throw cet::exception("VDResamplerGenerateFromModel")
@@ -284,43 +295,35 @@ namespace mu2e {
       pz_t = sample[5];
     }
 
-    // Recover detector coordinates from the transformed (x', y').
-    const double u = std::sqrt(x_trans * x_trans + y_trans * y_trans);
-    const double theta = std::atan2(y_trans, x_trans);
-    const double rho = std::tanh(u);
-    const double r = rho * VDr_;
-    const double dx = r * std::cos(theta);
-    const double dy = r * std::sin(theta);
-    x_gen_ = dx + x0_;
-    y_gen_ = dy + y0_;
-    z_gen_ = VDz0_;
-
-    // Invert the time transform.
-    t_gen_ = t0_ * std::exp(t_trans * tScale_);
-
-    // Invert the momentum transform.
-    const double pr = p0_ * std::sinh(pr_t);
-    const double pphi = p0_ * std::sinh(pphi_t);
-    pz_gen_ = p0_ * std::sinh(pz_t);
-
-    if (r > 1e-6) {
-      const double rx = dx / r;
-      const double ry = dy / r;
-      const double phix = -ry;
-      const double phiy = rx;
-      px_gen_ = pr * rx + pphi * phix;
-      py_gen_ = pr * ry + pphi * phiy;
-    } else {
-      px_gen_ = pr;
-      py_gen_ = pphi;
-    }
+    VDResampler::invertGeneratedSample(
+      x_trans,
+      y_trans,
+      t_trans,
+      pr_t,
+      pphi_t,
+      pz_t,
+      x0_,
+      y0_,
+      t0_,
+      tScale_,
+      p0_,
+      VDr_,
+      VDz0_,
+      x_gen_,
+      y_gen_,
+      z_gen_,
+      t_gen_,
+      px_gen_,
+      py_gen_,
+      pz_gen_
+    );
 
     mass_gen_ = pdt_->particle(pdgId_).mass();
     const CLHEP::Hep3Vector momParticle(px_gen_, py_gen_, pz_gen_);
     const CLHEP::Hep3Vector posParticle(x_gen_, y_gen_, z_gen_);
     const double eTotal = std::sqrt(momParticle.mag2() + mass_gen_ * mass_gen_);
     E_gen_ = eTotal - mass_gen_;
-    const CLHEP::HepLorentzVector fourMomParticle(eTotal, momParticle);
+    const CLHEP::HepLorentzVector fourMomParticle(momParticle, eTotal);
 
     output->emplace_back(
       PDGCode::type(pdgId_),
diff --git a/STMMC/src/VDResamplerGenerateMix_module.cc b/STMMC/src/VDResamplerGenerateMix_module.cc
index a0c477b375..fceedae421 100644
--- a/STMMC/src/VDResamplerGenerateMix_module.cc
+++ b/STMMC/src/VDResamplerGenerateMix_module.cc
@@ -48,7 +48,7 @@
 #include "Offline/MCDataProducts/inc/GenId.hh"
 #include "Offline/MCDataProducts/inc/GenParticle.hh"
 #include "Offline/SeedService/inc/SeedService.hh"
-#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+#include "Offline/STMMC/inc/VDResamplerTransforms.hh"
 
 // ROOT includes
 #include "art_root_io/TFileService.h"
@@ -172,11 +172,11 @@ namespace mu2e {
       std::vector<SourceSummary> sources_;
       double totalSourceWeight_ = 0.0;
 
-      double x0_ = vdresampler::kX0;
-      double y0_ = vdresampler::kY0;
-      double t0_ = vdresampler::kT0;
-      double tScale_ = vdresampler::kTScale;
-      double p0_ = vdresampler::kP0;
+      double x0_ = VDResampler::kX0;
+      double y0_ = VDResampler::kY0;
+      double t0_ = VDResampler::kT0;
+      double tScale_ = VDResampler::kTScale;
+      double p0_ = VDResampler::kP0;
 
       TTree* outTree_ = nullptr;
       int summaryIndex_ = -1;
@@ -219,6 +219,16 @@ namespace mu2e {
 
     produces<GenParticleCollection>();
 
+    if (VDr_ <= 0.0) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "VDr must be positive (got " << VDr_ << "); "
+        << "rho = r/VDr would produce inf/NaN in generated samples.";
+    }
+    if (!std::isfinite(VDz0_)) {
+      throw cet::exception("VDResamplerGenerateMix")
+        << "VDz0 must be finite (got " << VDz0_ << ").";
+    }
+
     const auto& hitSummaryFiles = conf().hitSummaryFiles();
     const auto& potsPerFile = conf().potsPerFile();
     if (hitSummaryFiles.size() != potsPerFile.size()) {
@@ -441,33 +451,28 @@ namespace mu2e {
     double pzTrans = 0.0;
     generateFromModels(modelSet, xTrans, yTrans, tTrans, prTrans, pphiTrans, pzTrans);
 
-    const double u = std::sqrt(xTrans * xTrans + yTrans * yTrans);
-    const double theta = std::atan2(yTrans, xTrans);
-    const double rho = std::tanh(u);
-    const double r = rho * VDr_;
-    const double dx = r * std::cos(theta);
-    const double dy = r * std::sin(theta);
-    x_gen_ = dx + x0_;
-    y_gen_ = dy + y0_;
-    z_gen_ = VDz0_;
-
-    t_gen_ = t0_ * std::exp(tTrans * tScale_);
-
-    const double pr = p0_ * std::sinh(prTrans);
-    const double pphi = p0_ * std::sinh(pphiTrans);
-    pz_gen_ = p0_ * std::sinh(pzTrans);
-
-    if (r > 1e-6) {
-      const double rx = dx / r;
-      const double ry = dy / r;
-      const double phix = -ry;
-      const double phiy = rx;
-      px_gen_ = pr * rx + pphi * phix;
-      py_gen_ = pr * ry + pphi * phiy;
-    } else {
-      px_gen_ = pr;
-      py_gen_ = pphi;
-    }
+    VDResampler::invertGeneratedSample(
+      xTrans,
+      yTrans,
+      tTrans,
+      prTrans,
+      pphiTrans,
+      pzTrans,
+      x0_,
+      y0_,
+      t0_,
+      tScale_,
+      p0_,
+      VDr_,
+      VDz0_,
+      x_gen_,
+      y_gen_,
+      z_gen_,
+      t_gen_,
+      px_gen_,
+      py_gen_,
+      pz_gen_
+    );
 
     pdgId_ = modelSet.pdgId;
     generationMode_ = modelSet.useTwoStageModel ? 2 : 1;
@@ -478,7 +483,7 @@ namespace mu2e {
     const CLHEP::Hep3Vector posParticle(x_gen_, y_gen_, z_gen_);
     const double eTotal = std::sqrt(momParticle.mag2() + mass_gen_ * mass_gen_);
     E_gen_ = eTotal - mass_gen_;
-    const CLHEP::HepLorentzVector fourMomParticle(eTotal, momParticle);
+    const CLHEP::HepLorentzVector fourMomParticle(momParticle, eTotal);
 
     output->emplace_back(
       PDGCode::type(pdgId_),
diff --git a/STMMC/src/VDResamplerTrain_module.cc b/STMMC/src/VDResamplerTrain_module.cc
index 14c0f6d843..3a4464c923 100644
--- a/STMMC/src/VDResamplerTrain_module.cc
+++ b/STMMC/src/VDResamplerTrain_module.cc
@@ -13,6 +13,7 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>
+#include <memory>
 
 #include "Offline/MachineLearningTools/inc/ScoreBasedDiffusionModel.hh"
 
@@ -44,9 +45,8 @@
 #include "Offline/MCDataProducts/inc/SimParticle.hh"
 #include "Offline/MCDataProducts/inc/StepPointMC.hh"
 #include "Offline/SeedService/inc/SeedService.hh"
-#include "Offline/STMMC/inc/VDResamplerTransformDefaults.hh"
+#include "Offline/STMMC/inc/VDResamplerTransforms.hh"
 
-typedef cet::map_vector_key key_type;
 typedef unsigned long VolumeId_type;
 
 namespace mu2e {
@@ -118,11 +118,13 @@ namespace mu2e {
       VolumeId_type virtualdetectorId = 0;
 
       // transform variables for training data preparation
-      double x0 = vdresampler::kX0;
-      double y0 = vdresampler::kY0;
+      double x0 = VDResampler::kX0;
+      double y0 = VDResampler::kY0;
       // time scaling
-      double t0 = vdresampler::kT0;
-      double tScale = vdresampler::kTScale;
+      double t0 = VDResampler::kT0;
+      double tScale = VDResampler::kTScale;
+      // momentum scaling
+      double p0 = VDResampler::kP0;
   };
 
   VDResamplerTrain::VDResamplerTrain(const Parameters& conf) :
@@ -143,17 +145,42 @@ namespace mu2e {
     trainingEpochs(conf().SBDMtrainingEpochs()),
     trainingSize(conf().SBDMtrainingSize())
   {
+    // Validate geometry configuration
+    if (VDr <= 0.0) {
+        throw cet::exception("VDResamplerTrain")
+            << "VDr must be positive (got " << VDr << "); "
+            << "rho = r/VDr would produce inf/NaN in training data.";
+    }
+    if (!std::isfinite(VDz0)) {
+        throw cet::exception("VDResamplerTrain")
+            << "VDz0 must be finite (got " << VDz0 << ").";
+    }
+
     // optimizer selection
-    ScoreBasedDiffusionModel::OptimizerType opt =
-        (conf().SBDMoptimizer() == "SGD") ?
-        ScoreBasedDiffusionModel::OptimizerType::SGD :
-        ScoreBasedDiffusionModel::OptimizerType::ADAM;
+    ScoreBasedDiffusionModel::OptimizerType opt;
+    if (conf().SBDMoptimizer() == "SGD") {
+        opt = ScoreBasedDiffusionModel::OptimizerType::SGD;
+    } else {
+        if (conf().SBDMoptimizer() != "ADAM") {
+            mf::LogWarning("VDResamplerTrain")
+                << "Unrecognized SBDMoptimizer value \"" << conf().SBDMoptimizer()
+                << "\"; falling back to ADAM.";
+        }
+        opt = ScoreBasedDiffusionModel::OptimizerType::ADAM;
+    }
 
     // noise schedule
-    ScoreBasedDiffusionModel::NoiseScheduleType sched =
-        (conf().SBDMnoiseSchedule() == "LINEAR") ?
-        ScoreBasedDiffusionModel::NoiseScheduleType::LINEAR :
-        ScoreBasedDiffusionModel::NoiseScheduleType::COSINE;
+    ScoreBasedDiffusionModel::NoiseScheduleType sched;
+    if (conf().SBDMnoiseSchedule() == "LINEAR") {
+        sched = ScoreBasedDiffusionModel::NoiseScheduleType::LINEAR;
+    } else {
+        if (conf().SBDMnoiseSchedule() != "COSINE") {
+            mf::LogWarning("VDResamplerTrain")
+                << "Unrecognized SBDMnoiseSchedule value \"" << conf().SBDMnoiseSchedule()
+                << "\"; falling back to COSINE.";
+        }
+        sched = ScoreBasedDiffusionModel::NoiseScheduleType::COSINE;
+    }
 
     if (useTwoStageTraining) {
       // create stage-1 model for (t', x', y')
@@ -265,57 +292,34 @@ namespace mu2e {
       if (virtualdetectorId != VirtualDetectorID || (stepPdgId != pdgID && pdgID != 0) || pz <= 0)
         continue; // Filter hits based on the virtual detector ID, particle type, and pz
 
-      // as z maybe slightly different from the nominal VDz0 due to the step size, we will extrapolate the (x, y) coordinates
-      // to the nominal VDz0 for all hits to compute the training parameters to be fed into the SBDM.
-      double extrapolationFactor = (VDz0 - z) / pz; // Assuming linear motion, this is the factor to extrapolate from current z to z0
-      double x_extrapolated = x + extrapolationFactor * px;
-      double y_extrapolated = y + extrapolationFactor * py;
-
-      // Now we have the extrapolated (x, y) at the nominal VDz0, we can compute the training parameters for the SBDM.
-      // We convert (t, x_extrapolated, y_extrapolated, px, py, pz) to (t', x_extrapolated, y_extrapolated, p_r', p_phi', p_z')
-
-      // shift to detector-centered coordinates
-      double dx = x_extrapolated - x0;
-      double dy = y_extrapolated - y0;
-      // polar coordinates
-      double r = std::sqrt(dx*dx + dy*dy);
-      double rho = r / VDr;
-      // numerical safety (avoid rho >= 1)
-      const double eps = 1e-6;
-      rho = std::min(rho, 1.0 - eps);
-      // boundary-removing transform
-      // u = atanh(r/R)
-      double u = 0.5 * std::log((1.0 + rho) / (1.0 - rho));
-      // angle
-      double theta = std::atan2(dy, dx);
-      // map back to Cartesian-like coordinates
-      double x_trans = u * std::cos(theta);
-      double y_trans = u * std::sin(theta);
-
-      // compute momentum components in the local polar coordinate system (r, phi, z)
-      double pr = 0.0;
-      double pphi = 0.0;
-      if (r > 1e-6) { // avoid division by zero, if r is very small, we can approximate pr ~ px and pphi ~ py
-        double rx = dx / r; // unit vector in the radial direction
-        double ry = dy / r;
-        double phix = -ry; // unit vector in the angular direction)
-        double phiy =  rx;
-        pr   = px*rx + py*ry; // radial momentum component
-        pphi = px*phix + py*phiy;
-      }
-      else {
-        pr = px;
-        pphi = py;
-      }
-      // momentum scaling
-      double p0 = vdresampler::kP0; // tunable scale where I want best resolution
-      double pr_t   = std::asinh(pr   / p0);
-      double pphi_t = std::asinh(pphi / p0);
-      double pz_t   = std::asinh(pz   / p0);
-
-      // time transform
-      double t_safe = std::max(time, 0.1); // avoid log(0)
-      double t_trans = std::log(t_safe / t0) / tScale;
+      double x_trans = 0.0;
+      double y_trans = 0.0;
+      double t_trans = 0.0;
+      double pr_t = 0.0;
+      double pphi_t = 0.0;
+      double pz_t = 0.0;
+      VDResampler::forwardTransformSample(
+        x,
+        y,
+        z,
+        time,
+        px,
+        py,
+        pz,
+        x0,
+        y0,
+        t0,
+        tScale,
+        p0,
+        VDr,
+        VDz0,
+        x_trans,
+        y_trans,
+        t_trans,
+        pr_t,
+        pphi_t,
+        pz_t
+      );
 
       if (useTwoStageTraining) {
         DiffusionTrainingSample stage1Sample;

From a774f0afe094662f611fdbad669fc088360e9a6b Mon Sep 17 00:00:00 2001
From: YongyiBWu <47263079+YongyiBWu@users.noreply.github.com>
Date: Mon, 13 Apr 2026 00:50:06 -0400
Subject: [PATCH 4/4] Add comment

---
 MachineLearningTools/src/ScoreBasedDiffusionModel.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MachineLearningTools/src/ScoreBasedDiffusionModel.cc b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
index 31ed371129..9e0fd318a4 100644
--- a/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
+++ b/MachineLearningTools/src/ScoreBasedDiffusionModel.cc
@@ -538,6 +538,7 @@ namespace mu2e {
             for (size_t i = N - 1; i > 0; --i)
             {
                 size_t j = static_cast<size_t>(randFlat_.fire() * (i + 1));
+                j = std::min(j, i); // Ensure j is within bounds
                 std::swap(indices[i], indices[j]);
             }