maize-genetics · btmonier · May 25, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml
@@ -115,6 +115,22 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Free runner disk space
+        # ubuntu-latest ships with only ~14 GB free, which is not enough to
+        # build the seq-sim-dev image (pixi env + micromamba env + PHGv2 +
+        # JDK 21 + buildx layers all materialize on disk simultaneously).
+        # This reclaims ~25-30 GB by removing pre-installed Android SDK,
+        # .NET, GHC, and large apt packages we don't use.
+        uses: jlumbroso/free-disk-space@v1.3.1  # pinned release; @main is a mutable ref
+        with:
+          tool-cache: false       # keep JDKs/node we use elsewhere
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: false    # keep buildx daemon images
+          swap-storage: true
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 

diff --git a/build.gradle.kts b/build.gradle.kts
@@ -4,7 +4,7 @@ plugins {
 }
 
 group = "net.maizegenetics"
-version = "0.2.9"
+version = "0.3.0"
 
 repositories {
     mavenCentral()
@@ -46,17 +46,41 @@ tasks.test {
     }
 }
 
+// Heavy test tiers spawn full pipelines that shell out to subprocess
+// Gradle daemons (MLImpute) and native tools (AnchorWave / minimap2 /
+// python). The kernel OOM killer (exit 137 / SIGKILL) fires when total
+// CONTAINER memory is exceeded -- not the JVM heap. We fork per test
+// class so memory is released between classes; the heap itself is left
+// at Gradle's default (which is enough for smallseq).
+fun Test.applyHeavyTestConfig() {
+    // Each heavy-tier test class runs a full pipeline; give each class a
+    // fresh JVM so process-wide resources (classloaders, thread pools,
+    // native handles) are released when that JVM exits.
+    setForkEvery(1L)
+}
+
 val integrationTest = tasks.register<Test>("integrationTest") {
     description = "Runs per-step integration tests against real external binaries (requires seq-sim-dev container)."
     group = "verification"
 
     testClassesDirs = sourceSets.test.get().output.classesDirs
     classpath = sourceSets.test.get().runtimeClasspath
 
+    // Restrict Gradle's class-file scan so `forkEvery = 1` only spawns
+    // JVMs for the integration test classes. Without this, the heavy
+    // test config forks one JVM per test class in the entire test
+    // source set (~20 classes), which is both wasteful and flaky --
+    // any single fork's startup failure surfaces as
+    // "Gradle Test Executor N finished with non-zero exit value 1"
+    // and aborts the whole task.
+    include("**/*IntegrationTest.class")
+
     useJUnitPlatform {
         includeTags("integration")
     }
 
+    applyHeavyTestConfig()
+
     shouldRunAfter(tasks.test)
     outputs.upToDateWhen { false }
 }
@@ -68,10 +92,17 @@ val e2eTest = tasks.register<Test>("e2eTest") {
     testClassesDirs = sourceSets.test.get().output.classesDirs
     classpath = sourceSets.test.get().runtimeClasspath
 
+    // See the comment on `integrationTest` above -- restrict scanning so
+    // `forkEvery = 1` only forks JVMs for `*E2ETest` classes instead of
+    // the entire test source set.
+    include("**/*E2ETest.class")
+
     useJUnitPlatform {
         includeTags("e2e")
     }
 
+    applyHeavyTestConfig()
+
     shouldRunAfter(integrationTest)
     outputs.upToDateWhen { false }
 }

diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
@@ -82,16 +82,56 @@ RUN mkdir -p "${MAMBA_ROOT_PREFIX}" && \
 # are directly callable without `pixi run` or `conda run` inside the container.
 ENV PATH="${MAMBA_ROOT_PREFIX}/envs/phgv2-conda/bin:${PATH}"
 
+# ---------------------------------------------------------------------------
+# `conda` shim that delegates to micromamba.
+#
+# PHGv2's AlignAssemblies hard-codes `conda run -n phgv2-conda <tool> ...`
+# (anchorwave, minimap2, samtools) and spawns it via Java's ProcessBuilder
+# instead of going through a shell. Since this image only ships micromamba
+# (lighter weight and what we use to manage the phgv2-conda env above),
+# `conda` is not normally available and PHG fails with
+# "Cannot run program 'conda': No such file or directory".
+#
+# A tiny exec-shim is enough: micromamba accepts the same `run -n NAME --
+# CMD ARGS` form so we just forward everything.
+# ---------------------------------------------------------------------------
+RUN printf '%s\n' \
+        '#!/bin/bash' \
+        'exec micromamba "$@"' \
+        > /usr/local/bin/conda && \
+    chmod +x /usr/local/bin/conda
+
 # ---------------------------------------------------------------------------
 # Pre-install the seq-sim pixi env. We do this against a scratch directory so
 # the image layer is stable regardless of the user's host-side seq_sim_work.
 # At runtime the pipeline's own pixi env (seq_sim_work/.pixi) will be created
 # on first use, but the solver cache is already populated.
 # ---------------------------------------------------------------------------
 COPY src/main/resources/pixi.toml /opt/seq-sim-prebuilt/pixi.toml
-RUN cd /opt/seq-sim-prebuilt && \
-    pixi install --manifest-path pixi.toml || \
-    echo "pixi install warmup failed; runtime will retry"
+# The pixi warmup MUST succeed -- if it fails silently, every runtime test
+# in a fresh workdir does a cold-cache pixi install whose solver+download
+# peak can exceed the container memory limit and get SIGKILL'd (exit 137).
+# Fail the image build loudly instead of hiding the regression.
+#
+# We point PIXI/RATTLER_CACHE_DIR at the same path the runtime container
+# uses (see docker-compose.yml). That way the downloaded .conda packages
+# end up in /var/cache/pixi during the image build, get baked into the
+# image layer, and seed the named volume on first mount -- so the
+# runtime `pixi install` is just hardlinks/copies from a warm cache.
+ENV PIXI_CACHE_DIR=/var/cache/pixi
+ENV RATTLER_CACHE_DIR=/var/cache/pixi
+# The realized env at /opt/seq-sim-prebuilt/.pixi/envs/default is only used
+# to warm /var/cache/pixi -- it is never executed at runtime because the
+# container sets SEQ_SIM_SKIP_PIXI_PREFIX=1 (see ProcessRunner.kt), which
+# routes all pipeline tool calls through the micromamba phgv2-conda env on
+# PATH instead. Delete it after the install so we don't ship ~2-3 GB of
+# duplicated python/anchorwave/minimap2/ropebwt3 binaries in the image
+# layer. The /var/cache/pixi cache is what seeds the runtime workdir.
+RUN mkdir -p /var/cache/pixi && \
+    cd /opt/seq-sim-prebuilt && \
+    pixi install --manifest-path pixi.toml && \
+    test -d /opt/seq-sim-prebuilt/.pixi/envs/default && \
+    rm -rf /opt/seq-sim-prebuilt/.pixi
 
 # ---------------------------------------------------------------------------
 # Pre-download PHGv2 latest release so `setup-environment` can skip the
@@ -130,7 +170,23 @@ ENV SEQ_SIM_IN_CONTAINER=1 \
 WORKDIR /workspace
 
 # Allow non-root operation by default (uid is overridden from docker-compose).
+# `scripts/dev.sh` sets the container user to the *host* uid (e.g. 501/503
+# on macOS) so files written into the bind-mounted repo are owned by the
+# host user. That means we can't pin the pixi cache to any specific uid:
+# the dev user (uid 1000) is just the in-image owner of the warmup
+# artifacts, but at runtime the cache must be writable by whatever uid
+# docker-compose decides to use.
+#
+# We therefore make /var/cache/pixi recursively world-readable/writable
+# (a+rwX). When the named docker volume for /var/cache/pixi (see
+# docker-compose.yml) is first mounted, docker seeds it from the image's
+# contents at that path, so the world-writable mode propagates to the
+# fresh volume and any uid can acquire pixi's lock file. New files pixi
+# writes inherit the container user's umask (typically 0022), which is
+# fine as long as a single host uid uses a given volume -- and the
+# volume gets recreated whenever the host user changes.
 RUN useradd -m -u 1000 -s /bin/bash dev && \
-    chown -R dev:dev /opt/seq-sim-prebuilt
+    chown -R dev:dev /opt/seq-sim-prebuilt /var/cache/pixi && \
+    chmod -R a+rwX /var/cache/pixi
 
 CMD ["/bin/bash"]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
@@ -10,13 +10,37 @@ services:
     # On macOS Docker Desktop the bind mount translates these ids for you;
     # on Linux this matches the invoker's uid (see scripts/dev.sh).
     user: "${SEQ_SIM_UID:-1000}:${SEQ_SIM_GID:-1000}"
+    # End-to-end tests run a full pipeline: the test JVM forks MLImpute's
+    # Gradle daemon, biokotlin-tools' JVM, AnchorWave/minimap2 natives,
+    # python (pysam/CrossMap), and pixi shims. Combined RSS comfortably
+    # exceeds 4 GB during the heaviest steps. On macOS Docker Desktop the
+    # container also inherits whatever VM memory the user set in
+    # Preferences -> Resources, so we explicitly raise the per-container
+    # limit here to surface a clear error (the kernel OOM killer otherwise
+    # SIGKILLs the test JVM and reports it as `exit 137`).
+    #
+    # Defaults to 16g; on a memory-constrained host, override it with
+    # SEQ_SIM_MEM_LIMIT=8g (or larger) when running `scripts/dev.sh e2e`.
+    mem_limit: ${SEQ_SIM_MEM_LIMIT:-16g}
+    memswap_limit: ${SEQ_SIM_MEM_LIMIT:-16g}
     environment:
       - SEQ_SIM_IN_CONTAINER=1
       - SEQ_SIM_SKIP_PHG_SETUP=1
       - SEQ_SIM_PHG_DIR=/opt/phg_v2
       - SEQ_SIM_SKIP_PIXI_PREFIX=1
       - GRADLE_USER_HOME=/workspace/.gradle-container
       - HOME=/workspace/.home-container
+      # Keep pixi's package cache off the macOS-side bind mount. Docker
+      # Desktop on macOS occasionally returns EINVAL (os error 22) for
+      # hardlink operations inside virtiofs/gRPC-FUSE bind mounts, which
+      # breaks `pixi install` mid-link when it hardlinks from the cache
+      # into <workDir>/.pixi/envs/.... The named volume below is backed
+      # by the docker storage driver (overlayfs on Linux VMs), where
+      # hardlinks behave correctly. It also persists across the
+      # `compose run --rm` container recreations so we don't re-download
+      # packages on every `scripts/dev.sh e2e` invocation.
+      - PIXI_CACHE_DIR=/var/cache/pixi
+      - RATTLER_CACHE_DIR=/var/cache/pixi
     volumes:
       # Single bind mount covers everything: the repo itself plus the
       # .gradle-container/ and .home-container/ subdirs used as caches.
@@ -25,5 +49,10 @@ services:
       #   - first-run gradle wrapper downloads don't hit "permission denied"
       #   - tearing down (./scripts/dev.sh clean) leaves the caches visible
       - ..:/workspace
+      # Named volume for the pixi/rattler package cache (see env vars above).
+      - pixi-cache:/var/cache/pixi
     tty: true
     stdin_open: true
+
+volumes:
+  pixi-cache:
diff --git a/docker/phg_environment.yml b/docker/phg_environment.yml
@@ -19,3 +19,15 @@ dependencies:
   - agc>=3.1
   - ropebwt3>=3.8
   - minimap2>=2.28
+  # Python deps needed by the MLImpute scripts that back pick_crossovers,
+  # convert_coordinates, and generate_recombined_sequences. They live in
+  # the seq-sim pixi env, but inside the container SEQ_SIM_SKIP_PIXI_PREFIX=1
+  # routes those calls through this conda env directly, so the deps must
+  # be present here too.
+  - numpy
+  - pandas
+  - pysam
+  - crossmap
+  # Native CLI deps used by create_chain_files / format_recombined_fastas.
+  - seqkit
+  - parallel
diff --git a/docs/commands.md b/docs/commands.md
@@ -110,7 +110,11 @@ seq_sim setup-environment -w my_workdir
 
 ## align-assemblies (Step 01)
 
-Aligns multiple query assemblies to a reference genome using AnchorWave and minimap2.
+Aligns multiple query assemblies to a reference genome via the PHGv2
+[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters)
+command, which itself drives AnchorWave + minimap2 under the hood. This wrapper
+keeps seq_sim's CLI surface (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...)
+and the `maf_file_paths.txt` output contract that downstream steps depend on.
 
 **Usage:**
 ```bash
@@ -119,31 +123,47 @@ seq_sim align-assemblies [OPTIONS]
 
 **Options:**
 - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`)
-- `--ref-gff`, `-g`: Reference GFF file (required)
-- `--ref-fasta`, `-r`: Reference FASTA file (required)
-- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line
-- `--threads`, `-t`: Number of threads to use (default: 1)
+- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`)
+- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`.
+- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line. Translated to a PHGv2 `--assembly-file-list` internally.
+- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1)
+- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count.
+- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1)
+- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1)
+- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps (anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location.
+- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. Useful for SLURM array workflows; no per-query MAFs and no `maf_file_paths.txt` are produced.
+- `--output-dir`, `-o`: Custom output directory (default: `<work-dir>/output/01_anchorwave_results`)
 
 **What it does:**
-1. Extracts CDS sequences from reference GFF using `anchorwave gff2seq`
-2. Aligns reference to CDS with `minimap2` (once for all queries)
-3. For each query, runs `minimap2` and `anchorwave proali` to produce alignments
-4. Generates `maf_file_paths.txt` listing all produced MAF files
+1. Collects the query FASTA list from `--query-fasta` and writes it as
+   `<output-dir>/assemblies_list.txt` (the PHGv2 `--assembly-file-list`).
+2. Invokes `phg align-assemblies` from `<work-dir>/src/phg_v2/bin/phg`. PHGv2
+   then runs `anchorwave gff2seq`, `minimap2`, and `anchorwave proali`
+   internally.
+3. Collects the resulting `.maf` files PHGv2 wrote to the output directory and
+   produces `maf_file_paths.txt` so downstream steps (`maf-to-gvcf`,
+   `create-chain-files`) continue to work unchanged.
 
 **Output:**
-- `<work-dir>/output/01_anchorwave_results/{refBase}_cds.fa`
-- `<work-dir>/output/01_anchorwave_results/{refBase}.sam`
-- `<work-dir>/output/01_anchorwave_results/{queryName}/` containing `{queryName}.sam`, `*.anchors`, `*.maf`, `*.f.maf`
+- `<work-dir>/output/01_anchorwave_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper)
+- `<work-dir>/output/01_anchorwave_results/{queryName}.maf` (per-query alignment, one file each)
+- `<work-dir>/output/01_anchorwave_results/{queryName}.sam`
+- `<work-dir>/output/01_anchorwave_results/{queryName}_{refBase}.anchorspro`
+- `<work-dir>/output/01_anchorwave_results/{queryName}.svg` (dot plot)
+- `<work-dir>/output/01_anchorwave_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs)
 - `<work-dir>/output/01_anchorwave_results/maf_file_paths.txt`
 - `<work-dir>/logs/01_align_assemblies.log`
 
 **Examples:**
 ```bash
-# Directory of queries
+# Directory of queries, 8 threads
 seq_sim align-assemblies -g ref.gff -r ref.fa -q queries/ -t 8
 
-# Text list of query paths
-seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4
+# Text list of query paths, 4 threads, run 2 alignments in parallel
+seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 --in-parallel 2
+
+# Reference-prep only (for SLURM array workflows)
+seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt --just-ref-prep
 ```
 
 ---
@@ -409,7 +429,12 @@ seq_sim format-recombined-fastas \
 ## align-mutated-assemblies (Step 10)
 
 Realigns the formatted recombined (or otherwise mutated) FASTA files back to
-the reference genome. This is the first step of the PS4G creation workflow.
+the reference genome via the PHGv2
+[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters)
+command, which itself drives AnchorWave + minimap2 under the hood. This is the
+first step of the PS4G creation workflow. The wrapper keeps seq_sim's existing
+CLI surface and the `maf_file_paths.txt` output contract that step 11
+(`mutated-maf-to-gvcf`) depends on.
 
 **Usage:**
 ```bash
@@ -418,23 +443,36 @@ seq_sim align-mutated-assemblies [OPTIONS]
 
 **Options:**
 - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`)
-- `--ref-gff`, `-g`: Reference GFF file (required)
-- `--ref-fasta`, `-r`: Reference FASTA file (required)
-- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list
-- `--threads`, `-t`: Number of threads to use (default: 1)
-- `--output-dir`, `-o`: Custom output directory (default: `work_dir/output/10_mutated_alignment_results`)
+- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`)
+- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`.
+- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list. Translated to a PHGv2 `--assembly-file-list` internally.
+- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1)
+- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count.
+- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1)
+- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1)
+- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps. Defaults to the `phgv2-conda` env in its standard location.
+- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. No per-query MAFs and no `maf_file_paths.txt` are produced.
+- `--output-dir`, `-o`: Custom output directory (default: `<work-dir>/output/10_mutated_alignment_results`)
 
 **Output:**
-- `<work-dir>/output/10_mutated_alignment_results/{refBase}_cds.fa`
-- `<work-dir>/output/10_mutated_alignment_results/{refBase}.sam`
-- `<work-dir>/output/10_mutated_alignment_results/{fastaName}/` containing alignments
+- `<work-dir>/output/10_mutated_alignment_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper)
+- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.maf` (per-FASTA alignment, one file each)
+- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.sam`
+- `<work-dir>/output/10_mutated_alignment_results/{fastaName}_{refBase}.anchorspro`
+- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.svg` (dot plot)
+- `<work-dir>/output/10_mutated_alignment_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs)
 - `<work-dir>/output/10_mutated_alignment_results/maf_file_paths.txt`
 - `<work-dir>/logs/10_align_mutated_assemblies.log`
 
 **Example:**
 ```bash
 seq_sim align-mutated-assemblies \
     -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ -t 8
+
+# Run 2 alignments in parallel with 4 total threads
+seq_sim align-mutated-assemblies \
+    -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ \
+    -t 4 --in-parallel 2
 ```
 
 ---