Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .github/workflows/pr-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,22 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Free runner disk space
# ubuntu-latest ships with only ~14 GB free, which is not enough to
# build the seq-sim-dev image (pixi env + micromamba env + PHGv2 +
# JDK 21 + buildx layers all materialize on disk simultaneously).
# This reclaims ~25-30 GB by removing pre-installed Android SDK,
# .NET, GHC, and large apt packages we don't use.
uses: jlumbroso/free-disk-space@main
with:
tool-cache: false # keep JDKs/node we use elsewhere
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: false # keep buildx daemon images
swap-storage: true

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

Expand Down
33 changes: 32 additions & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ plugins {
}

group = "net.maizegenetics"
version = "0.2.9"
version = "0.3.0"

repositories {
mavenCentral()
Expand Down Expand Up @@ -46,17 +46,41 @@ tasks.test {
}
}

// Heavy test tiers spawn full pipelines that shell out to subprocess
// Gradle daemons (MLImpute) and native tools (AnchorWave / minimap2 /
// python). The kernel OOM killer (exit 137 / SIGKILL) fires when total
// CONTAINER memory is exceeded -- not the JVM heap. We fork per test
// class so memory is released between classes; the heap itself is left
// at Gradle's default (which is enough for smallseq).
/**
 * Shared configuration for the heavy test tasks (`integrationTest`, `e2eTest`).
 *
 * Only changes the fork policy: a new test JVM is started for every test
 * class. The JVM heap size is intentionally not touched here.
 */
fun Test.applyHeavyTestConfig() {
// Each end-to-end test runs a full pipeline; isolate them so the
// JVM frees process-wide resources (classloaders, threadpools,
// native handles) between classes.
// forkEvery = 1 -> one test class per forked JVM.
setForkEvery(1L)
}

// Registers the `integrationTest` task: runs the tests tagged "integration"
// from the regular `test` source set, one forked JVM per test class.
val integrationTest = tasks.register<Test>("integrationTest") {
description = "Runs per-step integration tests against real external binaries (requires seq-sim-dev container)."
group = "verification"

// Reuse the compiled classes and runtime classpath of the `test` source
// set; there is no dedicated integration-test source set.
testClassesDirs = sourceSets.test.get().output.classesDirs
classpath = sourceSets.test.get().runtimeClasspath

// Restrict Gradle's class-file scan so `forkEvery = 1` only spawns
// JVMs for the integration test classes. Without this, the heavy
// test config forks one JVM per test class in the entire test
// source set (~20 classes), which is both wasteful and flaky --
// any single fork's startup failure surfaces as
// "Gradle Test Executor N finished with non-zero exit value 1"
// and aborts the whole task.
include("**/*IntegrationTest.class")

// Second filter, on top of the file-name pattern above: only tests
// carrying the JUnit tag "integration" are executed.
useJUnitPlatform {
includeTags("integration")
}

// Fork a fresh JVM per test class (see Test.applyHeavyTestConfig).
applyHeavyTestConfig()

// Ordering hint only: when both tasks are scheduled, run after `test`.
// This does not make `integrationTest` depend on `test`.
shouldRunAfter(tasks.test)
// Never consider this task up to date, so it re-runs on every invocation.
outputs.upToDateWhen { false }
}
Expand All @@ -68,10 +92,17 @@ val e2eTest = tasks.register<Test>("e2eTest") {
testClassesDirs = sourceSets.test.get().output.classesDirs
classpath = sourceSets.test.get().runtimeClasspath

// See the comment on `integrationTest` above -- restrict scanning so
// `forkEvery = 1` only forks JVMs for `*E2ETest` classes instead of
// the entire test source set.
include("**/*E2ETest.class")

useJUnitPlatform {
includeTags("e2e")
}

applyHeavyTestConfig()

shouldRunAfter(integrationTest)
outputs.upToDateWhen { false }
}
Expand Down
64 changes: 60 additions & 4 deletions docker/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,56 @@ RUN mkdir -p "${MAMBA_ROOT_PREFIX}" && \
# are directly callable without `pixi run` or `conda run` inside the container.
ENV PATH="${MAMBA_ROOT_PREFIX}/envs/phgv2-conda/bin:${PATH}"

# ---------------------------------------------------------------------------
# `conda` shim that delegates to micromamba.
#
# PHGv2's AlignAssemblies hard-codes `conda run -n phgv2-conda <tool> ...`
# (anchorwave, minimap2, samtools) and spawns it via Java's ProcessBuilder
# instead of going through a shell. Since this image only ships micromamba
# (lighter weight and what we use to manage the phgv2-conda env above),
# `conda` is not normally available and PHG fails with
# "Cannot run program 'conda': No such file or directory".
#
# A tiny exec-shim is enough: micromamba accepts the same `run -n NAME --
# CMD ARGS` form so we just forward everything.
# ---------------------------------------------------------------------------
RUN printf '%s\n' \
'#!/bin/bash' \
'exec micromamba "$@"' \
> /usr/local/bin/conda && \
chmod +x /usr/local/bin/conda

# ---------------------------------------------------------------------------
# Pre-install the seq-sim pixi env. We do this against a scratch directory so
# the image layer is stable regardless of the user's host-side seq_sim_work.
# At runtime the pipeline's own pixi env (seq_sim_work/.pixi) will be created
# on first use, but the solver cache is already populated.
# ---------------------------------------------------------------------------
COPY src/main/resources/pixi.toml /opt/seq-sim-prebuilt/pixi.toml
RUN cd /opt/seq-sim-prebuilt && \
pixi install --manifest-path pixi.toml || \
echo "pixi install warmup failed; runtime will retry"
# The pixi warmup MUST succeed -- if it fails silently, every runtime test
# in a fresh workdir does a cold-cache pixi install whose solver+download
# peak can exceed the container memory limit and get SIGKILL'd (exit 137).
# Fail the image build loudly instead of hiding the regression.
#
# We point PIXI/RATTLER_CACHE_DIR at the same path the runtime container
# uses (see docker-compose.yml). That way the downloaded .conda packages
# end up in /var/cache/pixi during the image build, get baked into the
# image layer, and seed the named volume on first mount -- so the
# runtime `pixi install` is just hardlinks/copies from a warm cache.
ENV PIXI_CACHE_DIR=/var/cache/pixi
ENV RATTLER_CACHE_DIR=/var/cache/pixi
# The realized env at /opt/seq-sim-prebuilt/.pixi/envs/default is only used
# to warm /var/cache/pixi -- it is never executed at runtime because the
# container sets SEQ_SIM_SKIP_PIXI_PREFIX=1 (see ProcessRunner.kt), which
# routes all pipeline tool calls through the micromamba phgv2-conda env on
# PATH instead. Delete it after the install so we don't ship ~2-3 GB of
# duplicated python/anchorwave/minimap2/ropebwt3 binaries in the image
# layer. The /var/cache/pixi cache is what seeds the runtime workdir.
RUN mkdir -p /var/cache/pixi && \
cd /opt/seq-sim-prebuilt && \
pixi install --manifest-path pixi.toml && \
test -d /opt/seq-sim-prebuilt/.pixi/envs/default && \
rm -rf /opt/seq-sim-prebuilt/.pixi

# ---------------------------------------------------------------------------
# Pre-download PHGv2 latest release so `setup-environment` can skip the
Expand Down Expand Up @@ -130,7 +170,23 @@ ENV SEQ_SIM_IN_CONTAINER=1 \
WORKDIR /workspace

# Allow non-root operation by default (uid is overridden from docker-compose).
# `scripts/dev.sh` sets the container user to the *host* uid (e.g. 501/503
# on macOS) so files written into the bind-mounted repo are owned by the
# host user. That means we can't pin the pixi cache to any specific uid:
# the dev user (uid 1000) is just the in-image owner of the warmup
# artifacts, but at runtime the cache must be writable by whatever uid
# docker-compose decides to use.
#
# We therefore make /var/cache/pixi recursively world-readable/writable
# (a+rwX). When the named docker volume for /var/cache/pixi (see
# docker-compose.yml) is first mounted, docker seeds it from the image's
# contents at that path, so the world-writable mode propagates to the
# fresh volume and any uid can acquire pixi's lock file. New files pixi
# writes inherit the container user's umask (typically 0022), which is
# fine as long as a single host uid uses a given volume -- and the
# volume gets recreated whenever the host user changes.
RUN useradd -m -u 1000 -s /bin/bash dev && \
chown -R dev:dev /opt/seq-sim-prebuilt
chown -R dev:dev /opt/seq-sim-prebuilt /var/cache/pixi && \
chmod -R a+rwX /var/cache/pixi

CMD ["/bin/bash"]
29 changes: 29 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,37 @@ services:
# On macOS Docker Desktop the bind mount translates these ids for you;
# on Linux this matches the invoker's uid (see scripts/dev.sh).
user: "${SEQ_SIM_UID:-1000}:${SEQ_SIM_GID:-1000}"
# End-to-end tests run a full pipeline: the test JVM forks MLImpute's
# Gradle daemon, biokotlin-tools' JVM, AnchorWave/minimap2 natives,
# python (pysam/CrossMap), and pixi shims. Combined RSS comfortably
# exceeds 4 GB during the heaviest steps. On macOS Docker Desktop the
# container also inherits whatever VM memory the user set in
# Preferences -> Resources, so we explicitly raise the per-container
# limit here to surface a clear error (the kernel OOM killer otherwise
# SIGKILLs the test JVM and reports it as `exit 137`).
#
# Defaults to 16g. Override with SEQ_SIM_MEM_LIMIT, e.g. SEQ_SIM_MEM_LIMIT=8g
# when running `scripts/dev.sh e2e` on a memory-constrained host, or a
# larger value if the pipeline needs more headroom.
mem_limit: ${SEQ_SIM_MEM_LIMIT:-16g}
memswap_limit: ${SEQ_SIM_MEM_LIMIT:-16g}
environment:
- SEQ_SIM_IN_CONTAINER=1
- SEQ_SIM_SKIP_PHG_SETUP=1
- SEQ_SIM_PHG_DIR=/opt/phg_v2
- SEQ_SIM_SKIP_PIXI_PREFIX=1
- GRADLE_USER_HOME=/workspace/.gradle-container
- HOME=/workspace/.home-container
# Keep pixi's package cache off the macOS-side bind mount. Docker
# Desktop on macOS occasionally returns EINVAL (os error 22) for
# hardlink operations inside virtiofs/gRPC-FUSE bind mounts, which
# breaks `pixi install` mid-link when it hardlinks from the cache
# into <workDir>/.pixi/envs/.... The named volume below is backed
# by the docker storage driver (overlayfs on Linux VMs), where
# hardlinks behave correctly. It also persists across the
# `compose run --rm` container recreations so we don't re-download
# packages on every `scripts/dev.sh e2e` invocation.
- PIXI_CACHE_DIR=/var/cache/pixi
- RATTLER_CACHE_DIR=/var/cache/pixi
volumes:
# Single bind mount covers everything: the repo itself plus the
# .gradle-container/ and .home-container/ subdirs used as caches.
Expand All @@ -25,5 +49,10 @@ services:
# - first-run gradle wrapper downloads don't hit "permission denied"
# - tearing down (./scripts/dev.sh clean) leaves the caches visible
- ..:/workspace
# Named volume for the pixi/rattler package cache (see env vars above).
- pixi-cache:/var/cache/pixi
tty: true
stdin_open: true

volumes:
pixi-cache:
12 changes: 12 additions & 0 deletions docker/phg_environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,15 @@ dependencies:
- agc>=3.1
- ropebwt3>=3.8
- minimap2>=2.28
# Python deps needed by the MLImpute scripts that back pick_crossovers,
# convert_coordinates, and generate_recombined_sequences. They live in
# the seq-sim pixi env, but inside the container SEQ_SIM_SKIP_PIXI_PREFIX=1
# routes those calls through this conda env directly, so the deps must
# be present here too.
- numpy
- pandas
- pysam
- crossmap
# Native CLI deps used by create_chain_files / format_recombined_fastas.
- seqkit
- parallel
86 changes: 62 additions & 24 deletions docs/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,11 @@ seq_sim setup-environment -w my_workdir

## align-assemblies (Step 01)

Aligns multiple query assemblies to a reference genome using AnchorWave and minimap2.
Aligns multiple query assemblies to a reference genome via the PHGv2
[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters)
command, which itself drives AnchorWave + minimap2 under the hood. This wrapper
keeps seq_sim's CLI surface (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...)
and the `maf_file_paths.txt` output contract that downstream steps depend on.

**Usage:**
```bash
Expand All @@ -119,31 +123,47 @@ seq_sim align-assemblies [OPTIONS]

**Options:**
- `--work-dir`, `-w`: Working directory (default: `seq_sim_work`)
- `--ref-gff`, `-g`: Reference GFF file (required)
- `--ref-fasta`, `-r`: Reference FASTA file (required)
- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line
- `--threads`, `-t`: Number of threads to use (default: 1)
- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`)
- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`.
- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line. Translated to a PHGv2 `--assembly-file-list` internally.
- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1)
- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count.
- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1)
- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1)
- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps (anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location.
- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. Useful for SLURM array workflows; no per-query MAFs and no `maf_file_paths.txt` are produced.
- `--output-dir`, `-o`: Custom output directory (default: `<work-dir>/output/01_anchorwave_results`)

**What it does:**
1. Extracts CDS sequences from reference GFF using `anchorwave gff2seq`
2. Aligns reference to CDS with `minimap2` (once for all queries)
3. For each query, runs `minimap2` and `anchorwave proali` to produce alignments
4. Generates `maf_file_paths.txt` listing all produced MAF files
1. Collects the query FASTA list from `--query-fasta` and writes it as
`<output-dir>/assemblies_list.txt` (the PHGv2 `--assembly-file-list`).
2. Invokes `phg align-assemblies` from `<work-dir>/src/phg_v2/bin/phg`. PHGv2
then runs `anchorwave gff2seq`, `minimap2`, and `anchorwave proali`
internally.
3. Collects the resulting `.maf` files PHGv2 wrote to the output directory and
produces `maf_file_paths.txt` so downstream steps (`maf-to-gvcf`,
`create-chain-files`) continue to work unchanged.

**Output:**
- `<work-dir>/output/01_anchorwave_results/{refBase}_cds.fa`
- `<work-dir>/output/01_anchorwave_results/{refBase}.sam`
- `<work-dir>/output/01_anchorwave_results/{queryName}/` containing `{queryName}.sam`, `*.anchors`, `*.maf`, `*.f.maf`
- `<work-dir>/output/01_anchorwave_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper)
- `<work-dir>/output/01_anchorwave_results/{queryName}.maf` (per-query alignment, one file each)
- `<work-dir>/output/01_anchorwave_results/{queryName}.sam`
- `<work-dir>/output/01_anchorwave_results/{queryName}_{refBase}.anchorspro`
- `<work-dir>/output/01_anchorwave_results/{queryName}.svg` (dot plot)
- `<work-dir>/output/01_anchorwave_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs)
- `<work-dir>/output/01_anchorwave_results/maf_file_paths.txt`
- `<work-dir>/logs/01_align_assemblies.log`

**Examples:**
```bash
# Directory of queries
# Directory of queries, 8 threads
seq_sim align-assemblies -g ref.gff -r ref.fa -q queries/ -t 8

# Text list of query paths
seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4
# Text list of query paths, 4 threads, run 2 alignments in parallel
seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 --in-parallel 2

# Reference-prep only (for SLURM array workflows)
seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt --just-ref-prep
```

---
Expand Down Expand Up @@ -409,7 +429,12 @@ seq_sim format-recombined-fastas \
## align-mutated-assemblies (Step 10)

Realigns the formatted recombined (or otherwise mutated) FASTA files back to
the reference genome. This is the first step of the PS4G creation workflow.
the reference genome via the PHGv2
[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters)
command, which itself drives AnchorWave + minimap2 under the hood. This is the
first step of the PS4G creation workflow. The wrapper keeps seq_sim's existing
CLI surface and the `maf_file_paths.txt` output contract that step 11
(`mutated-maf-to-gvcf`) depends on.

**Usage:**
```bash
Expand All @@ -418,23 +443,36 @@ seq_sim align-mutated-assemblies [OPTIONS]

**Options:**
- `--work-dir`, `-w`: Working directory (default: `seq_sim_work`)
- `--ref-gff`, `-g`: Reference GFF file (required)
- `--ref-fasta`, `-r`: Reference FASTA file (required)
- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list
- `--threads`, `-t`: Number of threads to use (default: 1)
- `--output-dir`, `-o`: Custom output directory (default: `work_dir/output/10_mutated_alignment_results`)
- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`)
- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`.
- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list. Translated to a PHGv2 `--assembly-file-list` internally.
- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1)
- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count.
- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1)
- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1)
- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps. Defaults to the `phgv2-conda` env in its standard location.
- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. No per-query MAFs and no `maf_file_paths.txt` are produced.
- `--output-dir`, `-o`: Custom output directory (default: `<work-dir>/output/10_mutated_alignment_results`)

**Output:**
- `<work-dir>/output/10_mutated_alignment_results/{refBase}_cds.fa`
- `<work-dir>/output/10_mutated_alignment_results/{refBase}.sam`
- `<work-dir>/output/10_mutated_alignment_results/{fastaName}/` containing alignments
- `<work-dir>/output/10_mutated_alignment_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper)
- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.maf` (per-FASTA alignment, one file each)
- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.sam`
- `<work-dir>/output/10_mutated_alignment_results/{fastaName}_{refBase}.anchorspro`
- `<work-dir>/output/10_mutated_alignment_results/{fastaName}.svg` (dot plot)
- `<work-dir>/output/10_mutated_alignment_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs)
- `<work-dir>/output/10_mutated_alignment_results/maf_file_paths.txt`
- `<work-dir>/logs/10_align_mutated_assemblies.log`

**Example:**
```bash
seq_sim align-mutated-assemblies \
-g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ -t 8

# Run 2 alignments in parallel with 4 total threads
seq_sim align-mutated-assemblies \
-g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ \
-t 4 --in-parallel 2
```

---
Expand Down
Loading
Loading