From 3f0e77ffbe404ffeb755481b0450ffe480634e4f Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 12 May 2026 10:31:04 -0500 Subject: [PATCH 01/18] Swap for PHGv2 --- .../maizegenetics/commands/AlignAssemblies.kt | 284 ++++++++--------- .../commands/AlignMutatedAssemblies.kt | 289 +++++++++--------- .../net/maizegenetics/commands/Orchestrate.kt | 74 ++++- 3 files changed, 334 insertions(+), 313 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt index 8f494c8..77c8894 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt @@ -2,6 +2,7 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.options.flag import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.types.int @@ -17,22 +18,23 @@ import org.apache.logging.log4j.Logger import java.nio.file.Path import kotlin.io.path.* +/** + * Wraps the PHGv2 `align-assemblies` command, which itself drives AnchorWave + + * minimap2 to align query assemblies against a reference. The wrapper keeps + * seq_sim's existing inputs (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...) + * and existing output contract (`output/01_anchorwave_results/maf_file_paths.txt`) + * so downstream pipeline steps continue to work unchanged. New PHGv2-specific + * options (`--in-parallel`, `--ref-max-align-cov`, ...) are surfaced as + * additional optional flags. 
+ * + * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters + */ class AlignAssemblies : CliktCommand(name = "align-assemblies") { companion object { private const val LOG_FILE_NAME = "01_align_assemblies.log" - private const val OUTPUT_DIR = "output" private const val ANCHORWAVE_RESULTS_DIR = "01_anchorwave_results" private const val MAF_PATHS_FILE = "maf_file_paths.txt" - - // minimap2 parameters - private const val MINIMAP2_PRESET = "splice" - private const val MINIMAP2_KMER_SIZE = "12" - private const val MINIMAP2_P_VALUE = "0.4" - private const val MINIMAP2_N_VALUE = "20" - - // anchorwave proali parameters - private const val ANCHORWAVE_R_VALUE = "1" - private const val ANCHORWAVE_Q_VALUE = "1" + private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt" // Default values private const val DEFAULT_THREADS = 1 @@ -48,28 +50,60 @@ class AlignAssemblies : CliktCommand(name = "align-assemblies") { private val refGff by option( "--ref-gff", "-g", - help = "Reference GFF file" + help = "Reference GFF file (passed to PHGv2 as --gff)" ).path(mustExist = true, canBeFile = true, canBeDir = false) .required() private val refFasta by option( "--ref-fasta", "-r", - help = "Reference FASTA file" + help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " + + "this should be the output of `phg prepare-assemblies`." ).path(mustExist = true, canBeFile = true, canBeDir = false) .required() private val queryInput by option( "--query-fasta", "-q", - help = "Query FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line)" + help = "Query FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line). " + + "Translated to a PHGv2 --assembly-file-list internally." 
).path(mustExist = true) .required() private val threads by option( "--threads", "-t", - help = "Number of threads to use" + help = "Total number of threads available to PHGv2 (--total-threads)" ).int() .default(DEFAULT_THREADS) + private val inParallel by option( + "--in-parallel", + help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " + + "If omitted, PHGv2 picks a value from system memory + thread count." + ).int() + + private val refMaxAlignCov by option( + "--ref-max-align-cov", + help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " + + "passed through as proali's `-R`). PHGv2 defaults this to 1." + ).int() + + private val queryMaxAlignCov by option( + "--query-max-align-cov", + help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " + + "passed through as proali's `-Q`). PHGv2 defaults this to 1." + ).int() + + private val condaEnvPrefix by option( + "--conda-env-prefix", + help = "Path to a Conda environment that contains PHGv2's runtime dependencies " + + "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location." + ).path(mustExist = false, canBeFile = false, canBeDir = true) + + private val justRefPrep by option( + "--just-ref-prep", + help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " + + "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced." + ).flag() + private val outputDir by option( "--output-dir", "-o", help = "Custom output directory (default: work_dir/output/01_anchorwave_results)" @@ -84,174 +118,118 @@ class AlignAssemblies : CliktCommand(name = "align-assemblies") { ) } + /** + * Materializes a PHGv2 `--assembly-file-list` from whatever the user + * passed via `--query-fasta` (a single FASTA, a directory, or a .txt list). 
+ * The reference FASTA is filtered out if it accidentally appears in the + * collected list (PHGv2 warns against including the reference here). + */ + private fun writeAssemblyFileList(queryFiles: List, baseOutputDir: Path): Path { + val refAbsolute = refFasta.toAbsolutePath().normalize() + val filtered = queryFiles + .map { it.toAbsolutePath().normalize() } + .filter { it != refAbsolute } + .distinct() + + if (filtered.size != queryFiles.size) { + logger.warn( + "Reference FASTA was present in the query list and was removed; PHGv2 " + + "expects the reference to be passed only via --reference-file." + ) + } + + val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE) + listFile.writeLines(filtered.map { it.toString() }) + logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile") + return listFile + } + override fun run() { - // Validate working directory - ValidationUtils.validateWorkingDirectory(workDir, logger) + // Validate working directory and PHG binary + val phgBinary = ValidationUtils.validatePhgSetup(workDir, logger) // Configure file logging to working directory LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - logger.info("Starting assembly alignment") + logger.info("Starting assembly alignment via PHGv2 `align-assemblies`") logger.info("Working directory: $workDir") logger.info("Reference GFF: $refGff") logger.info("Reference FASTA: $refFasta") - logger.info("Threads: $threads") + logger.info("Total threads: $threads") + inParallel?.let { logger.info("In-parallel: $it") } + refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") } + queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") } + condaEnvPrefix?.let { logger.info("Conda env prefix: $it") } + if (justRefPrep) { + logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)") + } - // Collect query files + // Collect query files into a PHGv2-shaped assembly-file-list val queryFiles = collectQueryFiles() 
logger.info("Processing ${queryFiles.size} query file(s)") - // Create base output directory (use custom or default) + // Create base output directory (use custom or default). + // PHGv2 requires the output directory to exist before invocation. val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, ANCHORWAVE_RESULTS_DIR) FileUtils.createOutputDirectory(baseOutputDir, logger) - // Derive reference base name - val refBase = refFasta.nameWithoutExtension - logger.info("Reference base name: $refBase") - - // Step 1: Run anchorwave gff2seq (once for reference) - logger.info("Step 1: Extracting CDS sequences with anchorwave gff2seq") - val cdsFile = baseOutputDir.resolve("${refBase}_cds.fa") - val gff2seqExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "gff2seq", - "-i", refGff.toString(), - "-r", refFasta.toString(), - "-o", cdsFile.toString(), - workingDir = workDir.toFile(), - logger = logger + val assemblyListFile = writeAssemblyFileList(queryFiles, baseOutputDir) + + // Build the PHGv2 align-assemblies command + val commandArgs = mutableListOf( + phgBinary.toString(), + "align-assemblies", + "--gff", refGff.toAbsolutePath().toString(), + "--reference-file", refFasta.toAbsolutePath().toString(), + "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(), + "--total-threads", threads.toString(), + "-o", baseOutputDir.toAbsolutePath().toString() ) - if (gff2seqExitCode != 0) { - logger.error("anchorwave gff2seq failed with exit code $gff2seqExitCode") - throw SeqSimCommandException("anchorwave gff2seq failed with exit code $gff2seqExitCode", gff2seqExitCode) + inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) } + refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) } + queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) } + condaEnvPrefix?.let { commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString()) } + if 
(justRefPrep) { + commandArgs += "--just-ref-prep" } - logger.info("CDS file created: $cdsFile") - - // Step 2: Run minimap2 for reference (once for all queries) - logger.info("Step 2: Running minimap2 alignment for reference") - val refSam = baseOutputDir.resolve("${refBase}.sam") - val minimap2RefExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - refFasta.toString(), - cdsFile.toString(), + + logger.info("Running PHG align-assemblies...") + val exitCode = ProcessRunner.runCommand( + *commandArgs.toTypedArray(), workingDir = workDir.toFile(), - outputFile = refSam.toFile(), logger = logger ) - if (minimap2RefExitCode != 0) { - logger.error("minimap2 (reference) failed with exit code $minimap2RefExitCode") - throw SeqSimCommandException("minimap2 (reference) failed with exit code $minimap2RefExitCode", minimap2RefExitCode) + + if (exitCode != 0) { + logger.error("PHG align-assemblies failed with exit code $exitCode") + throw SeqSimCommandException("PHG align-assemblies failed with exit code $exitCode", exitCode) } - logger.info("Reference SAM file created: $refSam") - - // Step 3: Process each query file - var successCount = 0 - var failureCount = 0 - val mafFilePaths = mutableListOf() - - queryFiles.forEachIndexed { index, queryFasta -> - logger.info("=".repeat(80)) - logger.info("Processing query ${index + 1}/${queryFiles.size}: ${queryFasta.name}") - logger.info("=".repeat(80)) - - try { - val mafPath = alignQuery(queryFasta, refBase, refSam, cdsFile, baseOutputDir) - mafFilePaths.add(mafPath) - successCount++ - logger.info("Successfully completed alignment for: ${queryFasta.name}") - } catch (e: Exception) { - failureCount++ - logger.error("Failed to align query: ${queryFasta.name}", e) - logger.error("Continuing with next query...") - } + + if (justRefPrep) { + logger.info("--just-ref-prep was set; 
skipping MAF collection.") + logger.info("Reference-prep outputs written to: $baseOutputDir") + return } - // Write MAF file paths to text file + // Collect MAF outputs PHGv2 wrote into the output directory and + // surface them via the standard maf_file_paths.txt contract so + // downstream pipeline steps (maf-to-gvcf, create-chain-files, ...) + // keep working unchanged. + val mafFiles = baseOutputDir.listDirectoryEntries() + .filter { it.isRegularFile() && it.name.endsWith(".maf") } + .sorted() + FileUtils.writeFilePaths( - mafFilePaths, + mafFiles, baseOutputDir.resolve(MAF_PATHS_FILE), logger, "MAF file" ) logger.info("=".repeat(80)) - logger.info("All alignments completed!") - logger.info("Total queries processed: ${queryFiles.size}") - logger.info("Successful: $successCount") - logger.info("Failed: $failureCount") + logger.info("PHG align-assemblies completed successfully") + logger.info("Total assemblies aligned: ${queryFiles.size}") + logger.info("MAF files written: ${mafFiles.size}") logger.info("Output directory: $baseOutputDir") } - - private fun alignQuery(queryFasta: Path, refBase: String, refSam: Path, cdsFile: Path, baseOutputDir: Path): Path { - val queryName = queryFasta.nameWithoutExtension - - // Create query-specific output directory - val queryOutputDir = baseOutputDir.resolve(queryName) - if (!queryOutputDir.exists()) { - queryOutputDir.createDirectories() - } - - // Step 1: Run minimap2 for query - logger.info("Running minimap2 alignment for query") - val querySam = queryOutputDir.resolve("${queryName}.sam") - val minimap2QueryExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - queryFasta.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = querySam.toFile(), - logger = logger - ) - if (minimap2QueryExitCode != 0) { - throw RuntimeException("minimap2 (query) 
failed with exit code $minimap2QueryExitCode") - } - logger.info("Query SAM file created: $querySam") - - // Step 2: Run anchorwave proali - logger.info("Running anchorwave proali") - val anchorsFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.anchors") - val mafFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.maf") - val fMafFile = queryOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${queryName}_Q${ANCHORWAVE_Q_VALUE}.f.maf") - - val proaliExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "proali", - "-i", refGff.toString(), - "-as", cdsFile.toString(), - "-r", refFasta.toString(), - "-a", querySam.toString(), - "-ar", refSam.toString(), - "-s", queryFasta.toString(), - "-n", anchorsFile.toString(), - "-R", ANCHORWAVE_R_VALUE, - "-Q", ANCHORWAVE_Q_VALUE, - "-o", mafFile.toString(), - "-f", fMafFile.toString(), - "-t", threads.toString(), - workingDir = workDir.toFile(), - logger = logger - ) - if (proaliExitCode != 0) { - throw RuntimeException("anchorwave proali failed with exit code $proaliExitCode") - } - - logger.info("Output files for ${queryName}:") - logger.info(" Query SAM: $querySam") - logger.info(" Anchors: $anchorsFile") - logger.info(" MAF: $mafFile") - logger.info(" Filtered MAF: $fMafFile") - - // Return the MAF file path (not the filtered one) - return mafFile - } } diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt index c642224..5ad4e27 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt @@ -2,6 +2,7 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.default +import com.github.ajalt.clikt.parameters.options.flag import 
com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required import com.github.ajalt.clikt.parameters.types.int @@ -10,28 +11,31 @@ import net.maizegenetics.Constants import net.maizegenetics.utils.FileUtils import net.maizegenetics.utils.LoggingUtils import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.SeqSimCommandException import net.maizegenetics.utils.ValidationUtils import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger import java.nio.file.Path import kotlin.io.path.* -import kotlin.system.exitProcess +/** + * Wraps the PHGv2 `align-assemblies` command for the "circular" mutated / + * recombined FASTA realignment step (step 10). PHGv2 internally drives + * AnchorWave + minimap2; this wrapper keeps seq_sim's existing inputs + * (`--ref-gff`, `--ref-fasta`, `--fasta-input`, ...) and existing output + * contract (`output/10_mutated_alignment_results/maf_file_paths.txt`) so + * downstream pipeline steps continue to work unchanged. New PHGv2-specific + * options (`--in-parallel`, `--ref-max-align-cov`, ...) are surfaced as + * additional optional flags. 
+ * + * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters + */ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { companion object { private const val LOG_FILE_NAME = "10_align_mutated_assemblies.log" private const val MUTATED_ALIGNMENT_RESULTS_DIR = "10_mutated_alignment_results" private const val MAF_PATHS_FILE = "maf_file_paths.txt" - - // minimap2 parameters - private const val MINIMAP2_PRESET = "splice" - private const val MINIMAP2_KMER_SIZE = "12" - private const val MINIMAP2_P_VALUE = "0.4" - private const val MINIMAP2_N_VALUE = "20" - - // anchorwave proali parameters - private const val ANCHORWAVE_R_VALUE = "1" - private const val ANCHORWAVE_Q_VALUE = "1" + private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt" // Default values private const val DEFAULT_THREADS = 1 @@ -47,28 +51,60 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { private val refGff by option( "--ref-gff", "-g", - help = "Reference GFF file" + help = "Reference GFF file (passed to PHGv2 as --gff)" ).path(mustExist = true, canBeFile = true, canBeDir = false) .required() private val refFasta by option( "--ref-fasta", "-r", - help = "Reference FASTA file" + help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " + + "this should be the output of `phg prepare-assemblies`." ).path(mustExist = true, canBeFile = true, canBeDir = false) .required() private val fastaInput by option( "--fasta-input", "-f", - help = "FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line)" + help = "FASTA file, directory of FASTA files, or text file with paths to FASTA files (one per line). " + + "Translated to a PHGv2 --assembly-file-list internally." 
).path(mustExist = true) .required() private val threads by option( "--threads", "-t", - help = "Number of threads to use" + help = "Total number of threads available to PHGv2 (--total-threads)" ).int() .default(DEFAULT_THREADS) + private val inParallel by option( + "--in-parallel", + help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " + + "If omitted, PHGv2 picks a value from system memory + thread count." + ).int() + + private val refMaxAlignCov by option( + "--ref-max-align-cov", + help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " + + "passed through as proali's `-R`). PHGv2 defaults this to 1." + ).int() + + private val queryMaxAlignCov by option( + "--query-max-align-cov", + help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " + + "passed through as proali's `-Q`). PHGv2 defaults this to 1." + ).int() + + private val condaEnvPrefix by option( + "--conda-env-prefix", + help = "Path to a Conda environment that contains PHGv2's runtime dependencies " + + "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location." + ).path(mustExist = false, canBeFile = false, canBeDir = true) + + private val justRefPrep by option( + "--just-ref-prep", + help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " + + "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced." + ).flag() + private val outputDir by option( "--output-dir", "-o", help = "Custom output directory (default: work_dir/output/10_mutated_alignment_results)" @@ -83,174 +119,121 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { ) } + /** + * Materializes a PHGv2 `--assembly-file-list` from whatever the user passed + * via `--fasta-input` (a single FASTA, a directory, or a .txt list). 
The + * reference FASTA is filtered out if it accidentally appears in the list + * (PHGv2 warns against including the reference here). + */ + private fun writeAssemblyFileList(fastaFiles: List, baseOutputDir: Path): Path { + val refAbsolute = refFasta.toAbsolutePath().normalize() + val filtered = fastaFiles + .map { it.toAbsolutePath().normalize() } + .filter { it != refAbsolute } + .distinct() + + if (filtered.size != fastaFiles.size) { + logger.warn( + "Reference FASTA was present in the FASTA input list and was removed; " + + "PHGv2 expects the reference to be passed only via --reference-file." + ) + } + + val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE) + listFile.writeLines(filtered.map { it.toString() }) + logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile") + return listFile + } + override fun run() { - // Validate working directory exists - ValidationUtils.validateWorkingDirectory(workDir, logger) + // Validate working directory and PHG binary + val phgBinary = ValidationUtils.validatePhgSetup(workDir, logger) // Configure file logging to working directory LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - logger.info("Starting mutated assembly alignment") + logger.info("Starting mutated assembly alignment via PHGv2 `align-assemblies`") logger.info("Working directory: $workDir") logger.info("Reference GFF: $refGff") logger.info("Reference FASTA: $refFasta") - logger.info("Threads: $threads") + logger.info("Total threads: $threads") + inParallel?.let { logger.info("In-parallel: $it") } + refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") } + queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") } + condaEnvPrefix?.let { logger.info("Conda env prefix: $it") } + if (justRefPrep) { + logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)") + } - // Collect FASTA files + // Collect FASTA files into a PHGv2-shaped assembly-file-list val fastaFiles = 
collectFastaFiles() logger.info("Processing ${fastaFiles.size} FASTA file(s)") - // Create base output directory (use custom or default) + // Create base output directory (use custom or default). + // PHGv2 requires the output directory to exist before invocation. val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, MUTATED_ALIGNMENT_RESULTS_DIR) FileUtils.createOutputDirectory(baseOutputDir, logger) - // Derive reference base name - val refBase = refFasta.nameWithoutExtension - logger.info("Reference base name: $refBase") - - // Step 1: Run anchorwave gff2seq (once for reference) - logger.info("Step 1: Extracting CDS sequences with anchorwave gff2seq") - val cdsFile = baseOutputDir.resolve("${refBase}_cds.fa") - val gff2seqExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "gff2seq", - "-i", refGff.toString(), - "-r", refFasta.toString(), - "-o", cdsFile.toString(), - workingDir = workDir.toFile(), - logger = logger + val assemblyListFile = writeAssemblyFileList(fastaFiles, baseOutputDir) + + // Build the PHGv2 align-assemblies command + val commandArgs = mutableListOf( + phgBinary.toString(), + "align-assemblies", + "--gff", refGff.toAbsolutePath().toString(), + "--reference-file", refFasta.toAbsolutePath().toString(), + "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(), + "--total-threads", threads.toString(), + "-o", baseOutputDir.toAbsolutePath().toString() ) - if (gff2seqExitCode != 0) { - logger.error("anchorwave gff2seq failed with exit code $gff2seqExitCode") - exitProcess(gff2seqExitCode) + inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) } + refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) } + queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) } + condaEnvPrefix?.let { commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString()) } + if (justRefPrep) { + commandArgs += "--just-ref-prep" } - 
logger.info("CDS file created: $cdsFile") - - // Step 2: Run minimap2 for reference (once for all queries) - logger.info("Step 2: Running minimap2 alignment for reference") - val refSam = baseOutputDir.resolve("${refBase}.sam") - val minimap2RefExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - refFasta.toString(), - cdsFile.toString(), + + logger.info("Running PHG align-assemblies (mutated)...") + val exitCode = ProcessRunner.runCommand( + *commandArgs.toTypedArray(), workingDir = workDir.toFile(), - outputFile = refSam.toFile(), logger = logger ) - if (minimap2RefExitCode != 0) { - logger.error("minimap2 (reference) failed with exit code $minimap2RefExitCode") - exitProcess(minimap2RefExitCode) + + if (exitCode != 0) { + logger.error("PHG align-assemblies (mutated) failed with exit code $exitCode") + throw SeqSimCommandException( + "PHG align-assemblies (mutated) failed with exit code $exitCode", + exitCode + ) } - logger.info("Reference SAM file created: $refSam") - - // Step 3: Process each FASTA file - var successCount = 0 - var failureCount = 0 - val mafFilePaths = mutableListOf() - - fastaFiles.forEachIndexed { index, fastaFile -> - logger.info("=".repeat(80)) - logger.info("Processing FASTA ${index + 1}/${fastaFiles.size}: ${fastaFile.name}") - logger.info("=".repeat(80)) - - try { - val mafPath = alignFasta(fastaFile, refBase, refSam, cdsFile, baseOutputDir) - mafFilePaths.add(mafPath) - successCount++ - logger.info("Successfully completed alignment for: ${fastaFile.name}") - } catch (e: Exception) { - failureCount++ - logger.error("Failed to align FASTA: ${fastaFile.name}", e) - logger.error("Continuing with next FASTA...") - } + + if (justRefPrep) { + logger.info("--just-ref-prep was set; skipping MAF collection.") + logger.info("Reference-prep outputs written to: $baseOutputDir") + return } - // 
Write MAF file paths to text file + // Collect MAF outputs PHGv2 wrote into the output directory and + // surface them via the standard maf_file_paths.txt contract so + // downstream pipeline steps (mutated_maf_to_gvcf, ...) keep working + // unchanged. + val mafFiles = baseOutputDir.listDirectoryEntries() + .filter { it.isRegularFile() && it.name.endsWith(".maf") } + .sorted() + FileUtils.writeFilePaths( - mafFilePaths, + mafFiles, baseOutputDir.resolve(MAF_PATHS_FILE), logger, "MAF file" ) logger.info("=".repeat(80)) - logger.info("All alignments completed!") - logger.info("Total FASTA files processed: ${fastaFiles.size}") - logger.info("Successful: $successCount") - logger.info("Failed: $failureCount") + logger.info("PHG align-assemblies (mutated) completed successfully") + logger.info("Total assemblies aligned: ${fastaFiles.size}") + logger.info("MAF files written: ${mafFiles.size}") logger.info("Output directory: $baseOutputDir") } - - private fun alignFasta(fastaFile: Path, refBase: String, refSam: Path, cdsFile: Path, baseOutputDir: Path): Path { - val fastaName = fastaFile.nameWithoutExtension - - // Create FASTA-specific output directory - val fastaOutputDir = baseOutputDir.resolve(fastaName) - if (!fastaOutputDir.exists()) { - fastaOutputDir.createDirectories() - } - - // Step 1: Run minimap2 for FASTA - logger.info("Running minimap2 alignment for FASTA") - val fastaSam = fastaOutputDir.resolve("${fastaName}.sam") - val minimap2FastaExitCode = ProcessRunner.runCommand( - "pixi", "run", "minimap2", - "-x", MINIMAP2_PRESET, - "-t", threads.toString(), - "-k", MINIMAP2_KMER_SIZE, - "-a", - "-p", MINIMAP2_P_VALUE, - "-N", MINIMAP2_N_VALUE, - fastaFile.toString(), - cdsFile.toString(), - workingDir = workDir.toFile(), - outputFile = fastaSam.toFile(), - logger = logger - ) - if (minimap2FastaExitCode != 0) { - throw RuntimeException("minimap2 (FASTA) failed with exit code $minimap2FastaExitCode") - } - logger.info("FASTA SAM file created: $fastaSam") - - // 
Step 2: Run anchorwave proali - logger.info("Running anchorwave proali") - val anchorsFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.anchors") - val mafFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.maf") - val fMafFile = fastaOutputDir.resolve("${refBase}_R${ANCHORWAVE_R_VALUE}_${fastaName}_Q${ANCHORWAVE_Q_VALUE}.f.maf") - - val proaliExitCode = ProcessRunner.runCommand( - "pixi", "run", "anchorwave", "proali", - "-i", refGff.toString(), - "-as", cdsFile.toString(), - "-r", refFasta.toString(), - "-a", fastaSam.toString(), - "-ar", refSam.toString(), - "-s", fastaFile.toString(), - "-n", anchorsFile.toString(), - "-R", ANCHORWAVE_R_VALUE, - "-Q", ANCHORWAVE_Q_VALUE, - "-o", mafFile.toString(), - "-f", fMafFile.toString(), - "-t", threads.toString(), - workingDir = workDir.toFile(), - logger = logger - ) - if (proaliExitCode != 0) { - throw RuntimeException("anchorwave proali failed with exit code $proaliExitCode") - } - - logger.info("Output files for ${fastaName}:") - logger.info(" FASTA SAM: $fastaSam") - logger.info(" Anchors: $anchorsFile") - logger.info(" MAF: $mafFile") - logger.info(" Filtered MAF: $fMafFile") - - // Return the MAF file path (not the filtered one) - return mafFile - } } diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index f1306f0..cf959f5 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -40,8 +40,13 @@ data class AlignAssembliesConfig( val ref_gff: String, val ref_fasta: String, val query_fasta: String, - val threads: Int? = null, - val output: String? = null // Custom output directory + val threads: Int? = null, // PHGv2 --total-threads + val in_parallel: Int? = null, // PHGv2 --in-parallel + val ref_max_align_cov: Int? 
= null, // PHGv2 --ref-max-align-cov (proali -R) + val query_max_align_cov: Int? = null, // PHGv2 --query-max-align-cov (proali -Q) + val conda_env_prefix: String? = null, // PHGv2 --conda-env-prefix + val just_ref_prep: Boolean? = null, // PHGv2 --just-ref-prep + val output: String? = null // Custom output directory ) data class MafToGvcfConfig( @@ -71,11 +76,16 @@ data class ConvertToFastaConfig( ) data class AlignMutatedAssembliesConfig( - val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) - val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) - val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) - val threads: Int? = null, - val output: String? = null // Custom output directory + val ref_gff: String? = null, // Optional: Reference GFF (uses align_assemblies.ref_gff if not specified) + val ref_fasta: String? = null, // Optional: Reference FASTA (uses align_assemblies.ref_fasta if not specified) + val fasta_input: String? = null, // Optional: Query FASTA input (uses format_recombined_fastas output if not specified) + val threads: Int? = null, // PHGv2 --total-threads + val in_parallel: Int? = null, // PHGv2 --in-parallel + val ref_max_align_cov: Int? = null, // PHGv2 --ref-max-align-cov (proali -R) + val query_max_align_cov: Int? = null, // PHGv2 --query-max-align-cov (proali -Q) + val conda_env_prefix: String? = null, // PHGv2 --conda-env-prefix + val just_ref_prep: Boolean? = null, // PHGv2 --just-ref-prep + val output: String? 
= null // Custom output directory ) data class PickCrossoversConfig( @@ -179,6 +189,16 @@ class Orchestrate : CliktCommand(name = "orchestrate") { return false } + // Check if PHGv2 binary exists (align-assemblies + later steps shell out to it) + val phgBinary = workDir.resolve(Constants.SRC_DIR) + .resolve(Constants.PHGV2_DIR) + .resolve("bin") + .resolve("phg") + if (!phgBinary.exists()) { + logger.info("PHGv2 binary not found: $phgBinary") + return false + } + // All checks passed logger.info("Environment validation passed - all required tools are present") return true @@ -234,6 +254,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ref_fasta = it["ref_fasta"] as? String ?: throw IllegalArgumentException("align_assemblies.ref_fasta is required"), query_fasta = it["query_fasta"] as? String ?: throw IllegalArgumentException("align_assemblies.query_fasta is required"), threads = it["threads"] as? Int, + in_parallel = it["in_parallel"] as? Int, + ref_max_align_cov = it["ref_max_align_cov"] as? Int, + query_max_align_cov = it["query_max_align_cov"] as? Int, + conda_env_prefix = it["conda_env_prefix"] as? String, + just_ref_prep = it["just_ref_prep"] as? Boolean, output = it["output"] as? String ) } @@ -288,6 +313,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ref_fasta = alignMutatedAssembliesMap?.get("ref_fasta") as? String, fasta_input = alignMutatedAssembliesMap?.get("fasta_input") as? String, threads = alignMutatedAssembliesMap?.get("threads") as? Int, + in_parallel = alignMutatedAssembliesMap?.get("in_parallel") as? Int, + ref_max_align_cov = alignMutatedAssembliesMap?.get("ref_max_align_cov") as? Int, + query_max_align_cov = alignMutatedAssembliesMap?.get("query_max_align_cov") as? Int, + conda_env_prefix = alignMutatedAssembliesMap?.get("conda_env_prefix") as? String, + just_ref_prep = alignMutatedAssembliesMap?.get("just_ref_prep") as? Boolean, output = alignMutatedAssembliesMap?.get("output") as? 
String ) } else null @@ -479,6 +509,21 @@ class Orchestrate : CliktCommand(name = "orchestrate") { if (config.align_assemblies.threads != null) { add("--threads=${config.align_assemblies.threads}") } + if (config.align_assemblies.in_parallel != null) { + add("--in-parallel=${config.align_assemblies.in_parallel}") + } + if (config.align_assemblies.ref_max_align_cov != null) { + add("--ref-max-align-cov=${config.align_assemblies.ref_max_align_cov}") + } + if (config.align_assemblies.query_max_align_cov != null) { + add("--query-max-align-cov=${config.align_assemblies.query_max_align_cov}") + } + if (config.align_assemblies.conda_env_prefix != null) { + add("--conda-env-prefix=${config.align_assemblies.conda_env_prefix}") + } + if (config.align_assemblies.just_ref_prep == true) { + add("--just-ref-prep") + } if (customOutput != null) { add("--output-dir=$customOutput") } @@ -1182,6 +1227,21 @@ class Orchestrate : CliktCommand(name = "orchestrate") { if (config.align_mutated_assemblies.threads != null) { add("--threads=${config.align_mutated_assemblies.threads}") } + if (config.align_mutated_assemblies.in_parallel != null) { + add("--in-parallel=${config.align_mutated_assemblies.in_parallel}") + } + if (config.align_mutated_assemblies.ref_max_align_cov != null) { + add("--ref-max-align-cov=${config.align_mutated_assemblies.ref_max_align_cov}") + } + if (config.align_mutated_assemblies.query_max_align_cov != null) { + add("--query-max-align-cov=${config.align_mutated_assemblies.query_max_align_cov}") + } + if (config.align_mutated_assemblies.conda_env_prefix != null) { + add("--conda-env-prefix=${config.align_mutated_assemblies.conda_env_prefix}") + } + if (config.align_mutated_assemblies.just_ref_prep == true) { + add("--just-ref-prep") + } if (customOutput != null) { add("--output-dir=${customOutput}") } From 936b02ed17eb7169d89359114e1c3c507856ef68 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 12 May 2026 12:05:12 -0500 Subject: [PATCH 02/18] Update tests 
for PHG internals --- .../commands/AlignAssembliesUnitTest.kt | 139 +++++++++++++----- 1 file changed, 105 insertions(+), 34 deletions(-) diff --git a/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt index d1cc466..60b008b 100644 --- a/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt +++ b/src/test/kotlin/net/maizegenetics/commands/AlignAssembliesUnitTest.kt @@ -8,13 +8,15 @@ import org.junit.jupiter.api.Test import org.junit.jupiter.api.io.TempDir import java.io.File import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText import kotlin.test.assertEquals import kotlin.test.assertTrue /** - * Unit tests for [AlignAssemblies] that don't actually shell out to - * anchorwave/minimap2 -- we install a [RecordingProcessExecutor] and verify - * the exact command lines seq-sim would send. + * Unit tests for [AlignAssemblies] that don't actually shell out to the + * PHGv2 binary -- we install a [RecordingProcessExecutor] and verify the + * exact command line seq-sim would send to `phg align-assemblies`. */ class AlignAssembliesUnitTest { @@ -26,8 +28,22 @@ class AlignAssembliesUnitTest { ProcessRunner.resetExecutor() } + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. 
+ */ + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + @Test - fun gff2seqAndMinimap2AreInvokedOncePerReference(@TempDir workDir: Path) { + fun phgAlignAssembliesIsInvokedExactlyOnce(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) ProcessRunner.withExecutor(executor) { @@ -42,35 +58,44 @@ class AlignAssembliesUnitTest { ) } - // Exactly one gff2seq invocation for the reference. - val gff2seqCalls = executor.invocations.filter { - it.command.contains("gff2seq") - } - assertEquals(1, gff2seqCalls.size, "gff2seq should run once per reference") - assertTrue( - executor.containsSubsequence("anchorwave", "gff2seq"), - "anchorwave gff2seq should be invoked" - ) + assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary") + assertEquals("align-assemblies", inv.command[1]) - // minimap2 is invoked once for the reference plus once per query (3). - val minimap2Calls = executor.invocationsOf("pixi").filter { - it.command.contains("minimap2") - } + executor.invocationsOf("minimap2") + // Required PHGv2 args are present + assertEquals( + smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(), + inv.argAfter("--gff") + ) assertEquals( - 4, minimap2Calls.size, - "minimap2 should run once for the reference and once per query" + smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(), + inv.argAfter("--reference-file") ) + assertEquals("2", inv.argAfter("--total-threads")) - // proali is invoked once per query (3 queries in smallseq). 
- val proaliCalls = executor.invocations.filter { - it.command.contains("proali") - } - assertEquals(3, proaliCalls.size, "anchorwave proali should run once per query") + // PHGv2 expects the output dir to exist before running and we hand it + // an assembly-file-list materialized inside that output dir. + val expectedOutputDir = workDir.resolve("output/01_anchorwave_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile() + assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written") + val listed = assemblyList.readLines().filter { it.isNotBlank() } + assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs") + + // Optional flags should NOT be present when not set + assertTrue(!inv.command.contains("--in-parallel")) + assertTrue(!inv.command.contains("--ref-max-align-cov")) + assertTrue(!inv.command.contains("--query-max-align-cov")) + assertTrue(!inv.command.contains("--conda-env-prefix")) + assertTrue(!inv.command.contains("--just-ref-prep")) } @Test - fun anchorwaveProaliCommandIncludesRefGffAndQuerySam(@TempDir workDir: Path) { + fun optionalPhgParametersAreForwardedWhenProvided(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) + val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() } ProcessRunner.withExecutor(executor) { AlignAssemblies().parse( @@ -79,23 +104,68 @@ class AlignAssembliesUnitTest { "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), "--query-fasta", smallseqRoot.resolve("queries/LineA.fa").toString(), - "--threads", "4" + "--threads", "4", + "--in-parallel", "2", + "--ref-max-align-cov", "3", + "--query-max-align-cov", "5", + "--conda-env-prefix", condaPrefix.toString() ) ) } - val proali = executor.invocations.single { 
it.command.contains("proali") } - assertEquals(smallseqRoot.resolve("anchors.gff").toString(), proali.argAfter("-i")) - assertEquals(smallseqRoot.resolve("Ref.fa").toString(), proali.argAfter("-r")) - assertEquals("4", proali.argAfter("-t")) - assertEquals("1", proali.argAfter("-R")) - assertEquals("1", proali.argAfter("-Q")) + val inv = executor.invocations.single() + assertEquals("4", inv.argAfter("--total-threads")) + assertEquals("2", inv.argAfter("--in-parallel")) + assertEquals("3", inv.argAfter("--ref-max-align-cov")) + assertEquals("5", inv.argAfter("--query-max-align-cov")) + assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix")) } @Test - fun mafFilePathsTextFileIsWrittenForEachQuery(@TempDir workDir: Path) { + fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) { + stubPhgBinary(workDir) val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + AlignAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--query-fasta", smallseqRoot.resolve("queries").toString(), + "--just-ref-prep" + ) + ) + } + + val inv = executor.invocations.single() + assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded") + + val mafPathsFile = workDir.resolve("output/01_anchorwave_results/maf_file_paths.txt").toFile() + assertTrue( + !mafPathsFile.exists(), + "maf_file_paths.txt should NOT be written when --just-ref-prep is set" + ) + } + + @Test + fun mafFilePathsTextFileListsMafsWrittenByPhg(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // RecordingProcessExecutor doesn't actually run phg, so simulate its + // side-effect: drop one .maf file per query into the output directory + // before the phg invocation "returns". 
+ val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputDir = inv.command.dropWhile { it != "-o" }.getOrNull(1)?.let { File(it) } + outputDir?.mkdirs() + val listFile = inv.command.dropWhile { it != "--assembly-file-list" }.getOrNull(1)?.let { File(it) } + listFile?.readLines()?.filter { it.isNotBlank() }?.forEach { fastaPath -> + val sampleName = File(fastaPath).nameWithoutExtension + File(outputDir, "$sampleName.maf").writeText("##maf version=1\n") + } + 0 + } + ProcessRunner.withExecutor(executor) { AlignAssemblies().parse( listOf( @@ -110,6 +180,7 @@ class AlignAssembliesUnitTest { val mafPathsFile = workDir.resolve("output/01_anchorwave_results/maf_file_paths.txt").toFile() assertTrue(mafPathsFile.exists(), "maf_file_paths.txt should be written") val lines = mafPathsFile.readLines().filter { it.isNotBlank() } - assertEquals(3, lines.size, "Should list one MAF path per query") + assertEquals(3, lines.size, "Should list one MAF path per simulated phg output") + assertTrue(lines.all { it.endsWith(".maf") }, "Every listed path should be a .maf file") } } From 2bcfadc3194775feff90b4464e0b7441ced27c8b Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 12 May 2026 12:05:23 -0500 Subject: [PATCH 03/18] Initial commit --- .../AlignMutatedAssembliesUnitTest.kt | 189 ++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt diff --git a/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt new file mode 100644 index 0000000..61884e6 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/AlignMutatedAssembliesUnitTest.kt @@ -0,0 +1,189 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import 
org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [AlignMutatedAssemblies] that don't actually shell out to + * the PHGv2 binary -- we install a [RecordingProcessExecutor] and verify the + * exact command line seq-sim would send to `phg align-assemblies`. + * + * Mirrors [AlignAssembliesUnitTest] since both commands now wrap the same + * PHGv2 subcommand. + */ +class AlignMutatedAssembliesUnitTest { + + private val smallseqRoot: Path = File("src/test/resources/smallseq") + .absoluteFile.toPath() + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. + */ + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + + @Test + fun phgAlignAssembliesIsInvokedExactlyOnce(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString(), + "--threads", "2" + ) + ) + } + + assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary") + 
assertEquals("align-assemblies", inv.command[1]) + + // Required PHGv2 args are present + assertEquals( + smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(), + inv.argAfter("--gff") + ) + assertEquals( + smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(), + inv.argAfter("--reference-file") + ) + assertEquals("2", inv.argAfter("--total-threads")) + + // PHGv2 expects the output dir to exist before running and we hand it + // an assembly-file-list materialized inside that output dir. + val expectedOutputDir = workDir.resolve("output/10_mutated_alignment_results") + assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile() + assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written") + val listed = assemblyList.readLines().filter { it.isNotBlank() } + assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs") + + // Optional flags should NOT be present when not set + assertTrue(!inv.command.contains("--in-parallel")) + assertTrue(!inv.command.contains("--ref-max-align-cov")) + assertTrue(!inv.command.contains("--query-max-align-cov")) + assertTrue(!inv.command.contains("--conda-env-prefix")) + assertTrue(!inv.command.contains("--just-ref-prep")) + } + + @Test + fun optionalPhgParametersAreForwardedWhenProvided(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() } + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries/LineA.fa").toString(), + "--threads", "4", + "--in-parallel", "2", + "--ref-max-align-cov", "3", + 
"--query-max-align-cov", "5", + "--conda-env-prefix", condaPrefix.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("4", inv.argAfter("--total-threads")) + assertEquals("2", inv.argAfter("--in-parallel")) + assertEquals("3", inv.argAfter("--ref-max-align-cov")) + assertEquals("5", inv.argAfter("--query-max-align-cov")) + assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix")) + } + + @Test + fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString(), + "--just-ref-prep" + ) + ) + } + + val inv = executor.invocations.single() + assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded") + + val mafPathsFile = workDir.resolve("output/10_mutated_alignment_results/maf_file_paths.txt").toFile() + assertTrue( + !mafPathsFile.exists(), + "maf_file_paths.txt should NOT be written when --just-ref-prep is set" + ) + } + + @Test + fun mafFilePathsTextFileListsMafsWrittenByPhg(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // RecordingProcessExecutor doesn't actually run phg, so simulate its + // side-effect: drop one .maf file per query into the output directory + // before the phg invocation "returns". 
+ val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputDir = inv.command.dropWhile { it != "-o" }.getOrNull(1)?.let { File(it) } + outputDir?.mkdirs() + val listFile = inv.command.dropWhile { it != "--assembly-file-list" }.getOrNull(1)?.let { File(it) } + listFile?.readLines()?.filter { it.isNotBlank() }?.forEach { fastaPath -> + val sampleName = File(fastaPath).nameWithoutExtension + File(outputDir, "$sampleName.maf").writeText("##maf version=1\n") + } + 0 + } + + ProcessRunner.withExecutor(executor) { + AlignMutatedAssemblies().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-gff", smallseqRoot.resolve("anchors.gff").toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--fasta-input", smallseqRoot.resolve("queries").toString() + ) + ) + } + + val mafPathsFile = workDir.resolve("output/10_mutated_alignment_results/maf_file_paths.txt").toFile() + assertTrue(mafPathsFile.exists(), "maf_file_paths.txt should be written") + val lines = mafPathsFile.readLines().filter { it.isNotBlank() } + assertEquals(3, lines.size, "Should list one MAF path per simulated phg output") + assertTrue(lines.all { it.endsWith(".maf") }, "Every listed path should be a .maf file") + } +} From 5a7cbc215d7b2f329528ddc9177fd1707b9d675c Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 12 May 2026 17:27:02 -0500 Subject: [PATCH 04/18] Update integrations for PHG internals --- .../integration/AlignAssembliesIntegrationTest.kt | 9 +++++++-- .../net/maizegenetics/integration/OrchestrateE2ETest.kt | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt b/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt index cfcbf1e..468d471 100644 --- a/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/AlignAssembliesIntegrationTest.kt @@ 
-12,8 +12,8 @@ import kotlin.io.path.createDirectories import kotlin.test.assertTrue /** - * Integration test that actually invokes AnchorWave + minimap2 against the - * smallseq test resources. + * Integration test that actually invokes `phg align-assemblies` (which itself + * drives AnchorWave + minimap2) against the smallseq test resources. * * Runs only inside the seq-sim-dev container (gated by [IntegrationGuard]). * Outside the container this is a no-op assumption skip. @@ -26,10 +26,15 @@ class AlignAssembliesIntegrationTest { @Test fun alignsQueryAgainstSmallseqReference(@TempDir workDir: Path) { + IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() // seq-sim expects a pre-existing work dir (validateWorkingDirectory). + // We also need a phg binary at /src/phg_v2/bin/phg. workDir.createDirectories() + val phgSrcDir = workDir.resolve("src/phg_v2/bin").also { it.createDirectories() } + val phgFromEnv = File("${IntegrationGuard.phgDir}/bin/phg") + java.nio.file.Files.createSymbolicLink(phgSrcDir.resolve("phg"), phgFromEnv.toPath()) // Copy the single query into a dir so align-assemblies globs it. val queriesDir = workDir.resolve("queries").also { it.createDirectories() } diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index f7e214c..d01aaf1 100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -25,6 +25,10 @@ class OrchestrateE2ETest { @Test fun orchestrateSmallseqPipelineProducesMafAndGvcf(@TempDir workDir: Path) { + // align-assemblies now drives PHGv2 internally, so we need both the + // phg binary and AnchorWave on PATH. The orchestrator's setup-environment + // step takes care of populating /src/phg_v2 from SEQ_SIM_PHG_DIR. 
+ IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() workDir.createDirectories() From daed59c97af6644062786442eabb6c6f677f0603 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 12 May 2026 17:27:13 -0500 Subject: [PATCH 05/18] Update parameters --- docs/commands.md | 86 ++++++++++++++++++++++++++---------- pipeline_config.example.yaml | 14 +++++- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/docs/commands.md b/docs/commands.md index 14590aa..8a1760c 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -110,7 +110,11 @@ seq_sim setup-environment -w my_workdir ## align-assemblies (Step 01) -Aligns multiple query assemblies to a reference genome using AnchorWave and minimap2. +Aligns multiple query assemblies to a reference genome via the PHGv2 +[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters) +command, which itself drives AnchorWave + minimap2 under the hood. This wrapper +keeps seq_sim's CLI surface (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...) +and the `maf_file_paths.txt` output contract that downstream steps depend on. **Usage:** ```bash @@ -119,31 +123,47 @@ seq_sim align-assemblies [OPTIONS] **Options:** - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`) -- `--ref-gff`, `-g`: Reference GFF file (required) -- `--ref-fasta`, `-r`: Reference FASTA file (required) -- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line -- `--threads`, `-t`: Number of threads to use (default: 1) +- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`) +- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`. 
+- `--query-fasta`, `-q`: Query input (required) - can be a single FASTA (`.fa`, `.fasta`, `.fna`), a directory of FASTAs, or a text file listing one path per line. Translated to a PHGv2 `--assembly-file-list` internally. +- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1) +- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count. +- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1) +- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1) +- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps (anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location. +- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. Useful for SLURM array workflows; no per-query MAFs and no `maf_file_paths.txt` are produced. +- `--output-dir`, `-o`: Custom output directory (default: `/output/01_anchorwave_results`) **What it does:** -1. Extracts CDS sequences from reference GFF using `anchorwave gff2seq` -2. Aligns reference to CDS with `minimap2` (once for all queries) -3. For each query, runs `minimap2` and `anchorwave proali` to produce alignments -4. Generates `maf_file_paths.txt` listing all produced MAF files +1. Collects the query FASTA list from `--query-fasta` and writes it as + `/assemblies_list.txt` (the PHGv2 `--assembly-file-list`). +2. Invokes `phg align-assemblies` from `/src/phg_v2/bin/phg`. PHGv2 + then runs `anchorwave gff2seq`, `minimap2`, and `anchorwave proali` + internally. +3. Collects the resulting `.maf` files PHGv2 wrote to the output directory and + produces `maf_file_paths.txt` so downstream steps (`maf-to-gvcf`, + `create-chain-files`) continue to work unchanged. 
**Output:** -- `/output/01_anchorwave_results/{refBase}_cds.fa` -- `/output/01_anchorwave_results/{refBase}.sam` -- `/output/01_anchorwave_results/{queryName}/` containing `{queryName}.sam`, `*.anchors`, `*.maf`, `*.f.maf` +- `/output/01_anchorwave_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper) +- `/output/01_anchorwave_results/{queryName}.maf` (per-query alignment, one file each) +- `/output/01_anchorwave_results/{queryName}.sam` +- `/output/01_anchorwave_results/{queryName}_{refBase}.anchorspro` +- `/output/01_anchorwave_results/{queryName}.svg` (dot plot) +- `/output/01_anchorwave_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs) - `/output/01_anchorwave_results/maf_file_paths.txt` - `/logs/01_align_assemblies.log` **Examples:** ```bash -# Directory of queries +# Directory of queries, 8 threads seq_sim align-assemblies -g ref.gff -r ref.fa -q queries/ -t 8 -# Text list of query paths -seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 +# Text list of query paths, 4 threads, run 2 alignments in parallel +seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt -t 4 --in-parallel 2 + +# Reference-prep only (for SLURM array workflows) +seq_sim align-assemblies -g ref.gff -r ref.fa -q queries.txt --just-ref-prep ``` --- @@ -409,7 +429,12 @@ seq_sim format-recombined-fastas \ ## align-mutated-assemblies (Step 10) Realigns the formatted recombined (or otherwise mutated) FASTA files back to -the reference genome. This is the first step of the PS4G creation workflow. +the reference genome via the PHGv2 +[`align-assemblies`](https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters) +command, which itself drives AnchorWave + minimap2 under the hood. This is the +first step of the PS4G creation workflow. The wrapper keeps seq_sim's existing +CLI surface and the `maf_file_paths.txt` output contract that step 11 +(`mutated-maf-to-gvcf`) depends on. 
**Usage:** ```bash @@ -418,16 +443,24 @@ seq_sim align-mutated-assemblies [OPTIONS] **Options:** - `--work-dir`, `-w`: Working directory (default: `seq_sim_work`) -- `--ref-gff`, `-g`: Reference GFF file (required) -- `--ref-fasta`, `-r`: Reference FASTA file (required) -- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list -- `--threads`, `-t`: Number of threads to use (default: 1) -- `--output-dir`, `-o`: Custom output directory (default: `work_dir/output/10_mutated_alignment_results`) +- `--ref-gff`, `-g`: Reference GFF file (required, forwarded as PHGv2 `--gff`) +- `--ref-fasta`, `-r`: Reference FASTA file (required, forwarded as PHGv2 `--reference-file`). For best results this should be the output of `phg prepare-assemblies`. +- `--fasta-input`, `-f`: FASTA input (required) - single file, directory, or text list. Translated to a PHGv2 `--assembly-file-list` internally. +- `--threads`, `-t`: Total number of threads available to PHGv2 (`--total-threads`, default: 1) +- `--in-parallel`: How many alignments to run in parallel (PHGv2 `--in-parallel`). If omitted, PHGv2 picks a value from system memory and thread count. +- `--ref-max-align-cov`: Maximum reference genome alignment coverage for AnchorWave `proali` (PHGv2 `--ref-max-align-cov`, default: 1) +- `--query-max-align-cov`: Maximum query genome alignment coverage for AnchorWave `proali` (PHGv2 `--query-max-align-cov`, default: 1) +- `--conda-env-prefix`: Path to a Conda env containing PHGv2's runtime deps. Defaults to the `phgv2-conda` env in its standard location. +- `--just-ref-prep`: Only run PHGv2's reference-prep phase and stop. No per-query MAFs and no `maf_file_paths.txt` are produced. 
+- `--output-dir`, `-o`: Custom output directory (default: `/output/10_mutated_alignment_results`) **Output:** -- `/output/10_mutated_alignment_results/{refBase}_cds.fa` -- `/output/10_mutated_alignment_results/{refBase}.sam` -- `/output/10_mutated_alignment_results/{fastaName}/` containing alignments +- `/output/10_mutated_alignment_results/assemblies_list.txt` (PHGv2 assembly-file-list, generated by this wrapper) +- `/output/10_mutated_alignment_results/{fastaName}.maf` (per-FASTA alignment, one file each) +- `/output/10_mutated_alignment_results/{fastaName}.sam` +- `/output/10_mutated_alignment_results/{fastaName}_{refBase}.anchorspro` +- `/output/10_mutated_alignment_results/{fastaName}.svg` (dot plot) +- `/output/10_mutated_alignment_results/ref.cds.fasta`, `{refBase}.sam` (reference-prep outputs) - `/output/10_mutated_alignment_results/maf_file_paths.txt` - `/logs/10_align_mutated_assemblies.log` @@ -435,6 +468,11 @@ seq_sim align-mutated-assemblies [OPTIONS] ```bash seq_sim align-mutated-assemblies \ -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ -t 8 + +# Run 2 alignments in parallel with 4 total threads +seq_sim align-mutated-assemblies \ + -g ref.gff -r ref.fa -f seq_sim_work/output/09_formatted_fastas/ \ + -t 4 --in-parallel 2 ``` --- diff --git a/pipeline_config.example.yaml b/pipeline_config.example.yaml index 6a00c6d..3c4f9e5 100644 --- a/pipeline_config.example.yaml +++ b/pipeline_config.example.yaml @@ -74,7 +74,12 @@ align_assemblies: ref_gff: "path/to/reference.gff" # Required: Reference GFF annotation file ref_fasta: "path/to/reference.fa" # Required: Reference FASTA file query_fasta: "path/to/queries.txt" # Required: Single query file, directory, or text list - threads: 1 # Optional: Number of threads (default: 1) + threads: 1 # Optional: PHGv2 --total-threads (default: 1) + # in_parallel: 2 # Optional: PHGv2 --in-parallel (omit for auto-tuning) + # ref_max_align_cov: 1 # Optional: PHGv2 --ref-max-align-cov (proali -R, default 
1) + # query_max_align_cov: 1 # Optional: PHGv2 --query-max-align-cov (proali -Q, default 1) + # conda_env_prefix: "/path/to/env" # Optional: PHGv2 --conda-env-prefix (overrides phgv2-conda) + # just_ref_prep: false # Optional: PHGv2 --just-ref-prep (ref-prep only, no MAFs) # output: "/custom/output/path/" # Optional: Custom output directory # Step 2: MAF to GVCF Conversion @@ -191,7 +196,12 @@ align_mutated_assemblies: # Uses align_assemblies.ref_fasta if not specified # fasta_input: "/custom/fasta/input/" # Optional: Query FASTA input (file, directory, or text list) # Uses format_recombined_fastas output if not specified - threads: 1 # Optional: Number of threads (default: 1) + threads: 1 # Optional: PHGv2 --total-threads (default: 1) + # in_parallel: 2 # Optional: PHGv2 --in-parallel (omit for auto-tuning) + # ref_max_align_cov: 1 # Optional: PHGv2 --ref-max-align-cov (proali -R, default 1) + # query_max_align_cov: 1 # Optional: PHGv2 --query-max-align-cov (proali -Q, default 1) + # conda_env_prefix: "/path/to/env" # Optional: PHGv2 --conda-env-prefix (overrides phgv2-conda) + # just_ref_prep: false # Optional: PHGv2 --just-ref-prep (ref-prep only, no MAFs) # output: "/custom/alignment/output/" # Optional: Custom output directory # Step 11: Mutated MAF to GVCF Conversion From f5a45cce6127e31a4935bc25e04b58028347674e Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Wed, 13 May 2026 08:46:16 -0500 Subject: [PATCH 06/18] Version bump --- build.gradle.kts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle.kts b/build.gradle.kts index db32b42..88c52b6 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.9" +version = "0.2.10" repositories { mavenCentral() From a32f045aa964d24d1aec6fa1521cd71b04fa9c9c Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Thu, 14 May 2026 11:34:53 -0500 Subject: [PATCH 07/18] Add alignment module --- 
.../maizegenetics/commands/AlignAssemblies.kt | 230 +++-------------- .../commands/AlignMutatedAssemblies.kt | 231 +++--------------- .../net/maizegenetics/commands/Orchestrate.kt | 127 ++++++---- .../commands/align/PhgAlignParams.kt | 35 +++ .../commands/align/PhgAlignRunner.kt | 177 ++++++++++++++ .../commands/align/PhgAlignSharedOptions.kt | 83 +++++++ .../commands/align/PhgAlignRunnerUnitTest.kt | 215 ++++++++++++++++ 7 files changed, 644 insertions(+), 454 deletions(-) create mode 100644 src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt create mode 100644 src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt create mode 100644 src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt index 77c8894..5f86da9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignAssemblies.kt @@ -1,31 +1,28 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand -import com.github.ajalt.clikt.parameters.options.default -import com.github.ajalt.clikt.parameters.options.flag +import com.github.ajalt.clikt.parameters.groups.provideDelegate import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required -import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path -import net.maizegenetics.Constants -import net.maizegenetics.utils.FileUtils -import net.maizegenetics.utils.LoggingUtils -import net.maizegenetics.utils.ProcessRunner -import net.maizegenetics.utils.SeqSimCommandException -import net.maizegenetics.utils.ValidationUtils +import net.maizegenetics.commands.align.PhgAlignParams +import 
net.maizegenetics.commands.align.PhgAlignRunner +import net.maizegenetics.commands.align.PhgAlignSharedOptions import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger -import java.nio.file.Path -import kotlin.io.path.* /** * Wraps the PHGv2 `align-assemblies` command, which itself drives AnchorWave + - * minimap2 to align query assemblies against a reference. The wrapper keeps + * minimap2 to align query assemblies against a reference. This wrapper keeps * seq_sim's existing inputs (`--ref-gff`, `--ref-fasta`, `--query-fasta`, ...) * and existing output contract (`output/01_anchorwave_results/maf_file_paths.txt`) - * so downstream pipeline steps continue to work unchanged. New PHGv2-specific - * options (`--in-parallel`, `--ref-max-align-cov`, ...) are surfaced as - * additional optional flags. + * so downstream pipeline steps continue to work unchanged. + * + * All the heavy lifting (validating PHG, materializing the assembly file list, + * invoking PHGv2, writing `maf_file_paths.txt`) lives in [PhgAlignRunner] and + * is shared with every other align iteration; the only thing this wrapper + * declares is the step-specific input flag and per-step metadata (log + * filename and output subdirectory). 
* * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters */ @@ -33,33 +30,11 @@ class AlignAssemblies : CliktCommand(name = "align-assemblies") { companion object { private const val LOG_FILE_NAME = "01_align_assemblies.log" private const val ANCHORWAVE_RESULTS_DIR = "01_anchorwave_results" - private const val MAF_PATHS_FILE = "maf_file_paths.txt" - private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt" - - // Default values - private const val DEFAULT_THREADS = 1 } private val logger: Logger = LogManager.getLogger(AlignAssemblies::class.java) - private val workDir by option( - "--work-dir", "-w", - help = "Working directory for files and scripts" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - .default(Path.of(Constants.DEFAULT_WORK_DIR)) - - private val refGff by option( - "--ref-gff", "-g", - help = "Reference GFF file (passed to PHGv2 as --gff)" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() - - private val refFasta by option( - "--ref-fasta", "-r", - help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " + - "this should be the output of `phg prepare-assemblies`." - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() + private val shared by PhgAlignSharedOptions() private val queryInput by option( "--query-fasta", "-q", @@ -68,168 +43,25 @@ class AlignAssemblies : CliktCommand(name = "align-assemblies") { ).path(mustExist = true) .required() - private val threads by option( - "--threads", "-t", - help = "Total number of threads available to PHGv2 (--total-threads)" - ).int() - .default(DEFAULT_THREADS) - - private val inParallel by option( - "--in-parallel", - help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " + - "If omitted, PHGv2 picks a value from system memory + thread count." 
- ).int() - - private val refMaxAlignCov by option( - "--ref-max-align-cov", - help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " + - "passed through as proali's `-R`). PHGv2 defaults this to 1." - ).int() - - private val queryMaxAlignCov by option( - "--query-max-align-cov", - help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " + - "passed through as proali's `-Q`). PHGv2 defaults this to 1." - ).int() - - private val condaEnvPrefix by option( - "--conda-env-prefix", - help = "Path to a Conda environment that contains PHGv2's runtime dependencies " + - "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location." - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private val justRefPrep by option( - "--just-ref-prep", - help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " + - "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced." - ).flag() - - private val outputDir by option( - "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/01_anchorwave_results)" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private fun collectQueryFiles(): List { - return FileUtils.collectFiles( - queryInput, - Constants.FASTA_EXTENSIONS, - "FASTA", - logger - ) - } - - /** - * Materializes a PHGv2 `--assembly-file-list` from whatever the user - * passed via `--query-fasta` (a single FASTA, a directory, or a .txt list). - * The reference FASTA is filtered out if it accidentally appears in the - * collected list (PHGv2 warns against including the reference here). 
- */ - private fun writeAssemblyFileList(queryFiles: List, baseOutputDir: Path): Path { - val refAbsolute = refFasta.toAbsolutePath().normalize() - val filtered = queryFiles - .map { it.toAbsolutePath().normalize() } - .filter { it != refAbsolute } - .distinct() - - if (filtered.size != queryFiles.size) { - logger.warn( - "Reference FASTA was present in the query list and was removed; PHGv2 " + - "expects the reference to be passed only via --reference-file." - ) - } - - val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE) - listFile.writeLines(filtered.map { it.toString() }) - logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile") - return listFile - } - override fun run() { - // Validate working directory and PHG binary - val phgBinary = ValidationUtils.validatePhgSetup(workDir, logger) - - // Configure file logging to working directory - LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - - logger.info("Starting assembly alignment via PHGv2 `align-assemblies`") - logger.info("Working directory: $workDir") - logger.info("Reference GFF: $refGff") - logger.info("Reference FASTA: $refFasta") - logger.info("Total threads: $threads") - inParallel?.let { logger.info("In-parallel: $it") } - refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") } - queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") } - condaEnvPrefix?.let { logger.info("Conda env prefix: $it") } - if (justRefPrep) { - logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)") - } - - // Collect query files into a PHGv2-shaped assembly-file-list - val queryFiles = collectQueryFiles() - logger.info("Processing ${queryFiles.size} query file(s)") - - // Create base output directory (use custom or default). - // PHGv2 requires the output directory to exist before invocation. 
- val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, ANCHORWAVE_RESULTS_DIR) - FileUtils.createOutputDirectory(baseOutputDir, logger) - - val assemblyListFile = writeAssemblyFileList(queryFiles, baseOutputDir) - - // Build the PHGv2 align-assemblies command - val commandArgs = mutableListOf( - phgBinary.toString(), - "align-assemblies", - "--gff", refGff.toAbsolutePath().toString(), - "--reference-file", refFasta.toAbsolutePath().toString(), - "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(), - "--total-threads", threads.toString(), - "-o", baseOutputDir.toAbsolutePath().toString() - ) - inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) } - refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) } - queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) } - condaEnvPrefix?.let { commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString()) } - if (justRefPrep) { - commandArgs += "--just-ref-prep" - } - - logger.info("Running PHG align-assemblies...") - val exitCode = ProcessRunner.runCommand( - *commandArgs.toTypedArray(), - workingDir = workDir.toFile(), - logger = logger - ) - - if (exitCode != 0) { - logger.error("PHG align-assemblies failed with exit code $exitCode") - throw SeqSimCommandException("PHG align-assemblies failed with exit code $exitCode", exitCode) - } - - if (justRefPrep) { - logger.info("--just-ref-prep was set; skipping MAF collection.") - logger.info("Reference-prep outputs written to: $baseOutputDir") - return - } - - // Collect MAF outputs PHGv2 wrote into the output directory and - // surface them via the standard maf_file_paths.txt contract so - // downstream pipeline steps (maf-to-gvcf, create-chain-files, ...) - // keep working unchanged. 
- val mafFiles = baseOutputDir.listDirectoryEntries() - .filter { it.isRegularFile() && it.name.endsWith(".maf") } - .sorted() - - FileUtils.writeFilePaths( - mafFiles, - baseOutputDir.resolve(MAF_PATHS_FILE), - logger, - "MAF file" + PhgAlignRunner.run( + PhgAlignParams( + workDir = shared.workDir, + refGff = shared.refGff, + refFasta = shared.refFasta, + queryInput = queryInput, + threads = shared.threads, + inParallel = shared.inParallel, + refMaxAlignCov = shared.refMaxAlignCov, + queryMaxAlignCov = shared.queryMaxAlignCov, + condaEnvPrefix = shared.condaEnvPrefix, + justRefPrep = shared.justRefPrep, + customOutputDir = shared.outputDir, + logFileName = LOG_FILE_NAME, + outputSubdir = ANCHORWAVE_RESULTS_DIR, + inputKind = "query", + ), + logger ) - - logger.info("=".repeat(80)) - logger.info("PHG align-assemblies completed successfully") - logger.info("Total assemblies aligned: ${queryFiles.size}") - logger.info("MAF files written: ${mafFiles.size}") - logger.info("Output directory: $baseOutputDir") } } diff --git a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt index 5ad4e27..d2a48fb 100644 --- a/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt +++ b/src/main/kotlin/net/maizegenetics/commands/AlignMutatedAssemblies.kt @@ -1,22 +1,15 @@ package net.maizegenetics.commands import com.github.ajalt.clikt.core.CliktCommand -import com.github.ajalt.clikt.parameters.options.default -import com.github.ajalt.clikt.parameters.options.flag +import com.github.ajalt.clikt.parameters.groups.provideDelegate import com.github.ajalt.clikt.parameters.options.option import com.github.ajalt.clikt.parameters.options.required -import com.github.ajalt.clikt.parameters.types.int import com.github.ajalt.clikt.parameters.types.path -import net.maizegenetics.Constants -import net.maizegenetics.utils.FileUtils -import net.maizegenetics.utils.LoggingUtils -import 
net.maizegenetics.utils.ProcessRunner -import net.maizegenetics.utils.SeqSimCommandException -import net.maizegenetics.utils.ValidationUtils +import net.maizegenetics.commands.align.PhgAlignParams +import net.maizegenetics.commands.align.PhgAlignRunner +import net.maizegenetics.commands.align.PhgAlignSharedOptions import org.apache.logging.log4j.LogManager import org.apache.logging.log4j.Logger -import java.nio.file.Path -import kotlin.io.path.* /** * Wraps the PHGv2 `align-assemblies` command for the "circular" mutated / @@ -24,9 +17,13 @@ import kotlin.io.path.* * AnchorWave + minimap2; this wrapper keeps seq_sim's existing inputs * (`--ref-gff`, `--ref-fasta`, `--fasta-input`, ...) and existing output * contract (`output/10_mutated_alignment_results/maf_file_paths.txt`) so - * downstream pipeline steps continue to work unchanged. New PHGv2-specific - * options (`--in-parallel`, `--ref-max-align-cov`, ...) are surfaced as - * additional optional flags. + * downstream pipeline steps continue to work unchanged. + * + * All the heavy lifting (validating PHG, materializing the assembly file list, + * invoking PHGv2, writing `maf_file_paths.txt`) lives in [PhgAlignRunner] and + * is shared with [AlignAssemblies]; the only thing this wrapper declares is + * the step-specific input flag and per-step metadata (log filename and + * output subdirectory). 
* * See: https://phg.maizegenetics.net/build_and_load/#align-assemblies-parameters */ @@ -34,33 +31,11 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { companion object { private const val LOG_FILE_NAME = "10_align_mutated_assemblies.log" private const val MUTATED_ALIGNMENT_RESULTS_DIR = "10_mutated_alignment_results" - private const val MAF_PATHS_FILE = "maf_file_paths.txt" - private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt" - - // Default values - private const val DEFAULT_THREADS = 1 } private val logger: Logger = LogManager.getLogger(AlignMutatedAssemblies::class.java) - private val workDir by option( - "--work-dir", "-w", - help = "Working directory for files and scripts" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - .default(Path.of(Constants.DEFAULT_WORK_DIR)) - - private val refGff by option( - "--ref-gff", "-g", - help = "Reference GFF file (passed to PHGv2 as --gff)" - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() - - private val refFasta by option( - "--ref-fasta", "-r", - help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " + - "this should be the output of `phg prepare-assemblies`." - ).path(mustExist = true, canBeFile = true, canBeDir = false) - .required() + private val shared by PhgAlignSharedOptions() private val fastaInput by option( "--fasta-input", "-f", @@ -69,171 +44,25 @@ class AlignMutatedAssemblies : CliktCommand(name = "align-mutated-assemblies") { ).path(mustExist = true) .required() - private val threads by option( - "--threads", "-t", - help = "Total number of threads available to PHGv2 (--total-threads)" - ).int() - .default(DEFAULT_THREADS) - - private val inParallel by option( - "--in-parallel", - help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " + - "If omitted, PHGv2 picks a value from system memory + thread count." 
- ).int() - - private val refMaxAlignCov by option( - "--ref-max-align-cov", - help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " + - "passed through as proali's `-R`). PHGv2 defaults this to 1." - ).int() - - private val queryMaxAlignCov by option( - "--query-max-align-cov", - help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " + - "passed through as proali's `-Q`). PHGv2 defaults this to 1." - ).int() - - private val condaEnvPrefix by option( - "--conda-env-prefix", - help = "Path to a Conda environment that contains PHGv2's runtime dependencies " + - "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location." - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private val justRefPrep by option( - "--just-ref-prep", - help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " + - "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced." - ).flag() - - private val outputDir by option( - "--output-dir", "-o", - help = "Custom output directory (default: work_dir/output/10_mutated_alignment_results)" - ).path(mustExist = false, canBeFile = false, canBeDir = true) - - private fun collectFastaFiles(): List { - return FileUtils.collectFiles( - fastaInput, - Constants.FASTA_EXTENSIONS, - "FASTA", - logger - ) - } - - /** - * Materializes a PHGv2 `--assembly-file-list` from whatever the user passed - * via `--fasta-input` (a single FASTA, a directory, or a .txt list). The - * reference FASTA is filtered out if it accidentally appears in the list - * (PHGv2 warns against including the reference here). 
- */ - private fun writeAssemblyFileList(fastaFiles: List, baseOutputDir: Path): Path { - val refAbsolute = refFasta.toAbsolutePath().normalize() - val filtered = fastaFiles - .map { it.toAbsolutePath().normalize() } - .filter { it != refAbsolute } - .distinct() - - if (filtered.size != fastaFiles.size) { - logger.warn( - "Reference FASTA was present in the FASTA input list and was removed; " + - "PHGv2 expects the reference to be passed only via --reference-file." - ) - } - - val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE) - listFile.writeLines(filtered.map { it.toString() }) - logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile") - return listFile - } - override fun run() { - // Validate working directory and PHG binary - val phgBinary = ValidationUtils.validatePhgSetup(workDir, logger) - - // Configure file logging to working directory - LoggingUtils.setupFileLogging(workDir, LOG_FILE_NAME, logger) - - logger.info("Starting mutated assembly alignment via PHGv2 `align-assemblies`") - logger.info("Working directory: $workDir") - logger.info("Reference GFF: $refGff") - logger.info("Reference FASTA: $refFasta") - logger.info("Total threads: $threads") - inParallel?.let { logger.info("In-parallel: $it") } - refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") } - queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") } - condaEnvPrefix?.let { logger.info("Conda env prefix: $it") } - if (justRefPrep) { - logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)") - } - - // Collect FASTA files into a PHGv2-shaped assembly-file-list - val fastaFiles = collectFastaFiles() - logger.info("Processing ${fastaFiles.size} FASTA file(s)") - - // Create base output directory (use custom or default). - // PHGv2 requires the output directory to exist before invocation. 
- val baseOutputDir = FileUtils.resolveOutputDirectory(workDir, outputDir, MUTATED_ALIGNMENT_RESULTS_DIR) - FileUtils.createOutputDirectory(baseOutputDir, logger) - - val assemblyListFile = writeAssemblyFileList(fastaFiles, baseOutputDir) - - // Build the PHGv2 align-assemblies command - val commandArgs = mutableListOf( - phgBinary.toString(), - "align-assemblies", - "--gff", refGff.toAbsolutePath().toString(), - "--reference-file", refFasta.toAbsolutePath().toString(), - "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(), - "--total-threads", threads.toString(), - "-o", baseOutputDir.toAbsolutePath().toString() - ) - inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) } - refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) } - queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) } - condaEnvPrefix?.let { commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString()) } - if (justRefPrep) { - commandArgs += "--just-ref-prep" - } - - logger.info("Running PHG align-assemblies (mutated)...") - val exitCode = ProcessRunner.runCommand( - *commandArgs.toTypedArray(), - workingDir = workDir.toFile(), - logger = logger - ) - - if (exitCode != 0) { - logger.error("PHG align-assemblies (mutated) failed with exit code $exitCode") - throw SeqSimCommandException( - "PHG align-assemblies (mutated) failed with exit code $exitCode", - exitCode - ) - } - - if (justRefPrep) { - logger.info("--just-ref-prep was set; skipping MAF collection.") - logger.info("Reference-prep outputs written to: $baseOutputDir") - return - } - - // Collect MAF outputs PHGv2 wrote into the output directory and - // surface them via the standard maf_file_paths.txt contract so - // downstream pipeline steps (mutated_maf_to_gvcf, ...) keep working - // unchanged. 
- val mafFiles = baseOutputDir.listDirectoryEntries() - .filter { it.isRegularFile() && it.name.endsWith(".maf") } - .sorted() - - FileUtils.writeFilePaths( - mafFiles, - baseOutputDir.resolve(MAF_PATHS_FILE), - logger, - "MAF file" + PhgAlignRunner.run( + PhgAlignParams( + workDir = shared.workDir, + refGff = shared.refGff, + refFasta = shared.refFasta, + queryInput = fastaInput, + threads = shared.threads, + inParallel = shared.inParallel, + refMaxAlignCov = shared.refMaxAlignCov, + queryMaxAlignCov = shared.queryMaxAlignCov, + condaEnvPrefix = shared.condaEnvPrefix, + justRefPrep = shared.justRefPrep, + customOutputDir = shared.outputDir, + logFileName = LOG_FILE_NAME, + outputSubdir = MUTATED_ALIGNMENT_RESULTS_DIR, + inputKind = "FASTA", + ), + logger ) - - logger.info("=".repeat(80)) - logger.info("PHG align-assemblies (mutated) completed successfully") - logger.info("Total assemblies aligned: ${fastaFiles.size}") - logger.info("MAF files written: ${mafFiles.size}") - logger.info("Output directory: $baseOutputDir") } } diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index cf959f5..e1fc13b 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -161,6 +161,47 @@ class Orchestrate : CliktCommand(name = "orchestrate") { return stepName in config.run_steps } + /** + * Appends the optional PHGv2 align-assemblies knobs shared by every + * align step (threads, in-parallel, proali coverage caps, conda env + * prefix, just-ref-prep, output dir override) to [args]. Each option + * is included only when its config field is non-null/true, matching + * the existing inline behaviour for both align_assemblies and + * align_mutated_assemblies. 
+ */ + private fun appendPhgAlignSharedArgs( + args: MutableList<String>, + threads: Int?, + inParallel: Int?, + refMaxAlignCov: Int?, + queryMaxAlignCov: Int?, + condaEnvPrefix: String?, + justRefPrep: Boolean?, + customOutput: Path?, + ) { + if (threads != null) { + args.add("--threads=$threads") + } + if (inParallel != null) { + args.add("--in-parallel=$inParallel") + } + if (refMaxAlignCov != null) { + args.add("--ref-max-align-cov=$refMaxAlignCov") + } + if (queryMaxAlignCov != null) { + args.add("--query-max-align-cov=$queryMaxAlignCov") + } + if (condaEnvPrefix != null) { + args.add("--conda-env-prefix=$condaEnvPrefix") + } + if (justRefPrep == true) { + args.add("--just-ref-prep") + } + if (customOutput != null) { + args.add("--output-dir=$customOutput") + } + } + private fun validateEnvironment(workDir: Path): Boolean { // Check if working directory exists if (!workDir.exists()) { @@ -501,33 +542,22 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("Reference FASTA: $refFasta") logger.info("Query FASTA: $queryFasta") - val args = buildList { - add("--work-dir=$workDir") - add("--ref-gff=$refGff") - add("--ref-fasta=$refFasta") - add("--query-fasta=$queryFasta") - if (config.align_assemblies.threads != null) { - add("--threads=${config.align_assemblies.threads}") - } - if (config.align_assemblies.in_parallel != null) { - add("--in-parallel=${config.align_assemblies.in_parallel}") - } - if (config.align_assemblies.ref_max_align_cov != null) { - add("--ref-max-align-cov=${config.align_assemblies.ref_max_align_cov}") - } - if (config.align_assemblies.query_max_align_cov != null) { - add("--query-max-align-cov=${config.align_assemblies.query_max_align_cov}") - } - if (config.align_assemblies.conda_env_prefix != null) { - add("--conda-env-prefix=${config.align_assemblies.conda_env_prefix}") - } - if (config.align_assemblies.just_ref_prep == true) { - add("--just-ref-prep") - } - if (customOutput != null) { - add("--output-dir=$customOutput") - } - 
} + val args = mutableListOf( + "--work-dir=$workDir", + "--ref-gff=$refGff", + "--ref-fasta=$refFasta", + "--query-fasta=$queryFasta", + ) + appendPhgAlignSharedArgs( + args, + threads = config.align_assemblies.threads, + inParallel = config.align_assemblies.in_parallel, + refMaxAlignCov = config.align_assemblies.ref_max_align_cov, + queryMaxAlignCov = config.align_assemblies.query_max_align_cov, + condaEnvPrefix = config.align_assemblies.conda_env_prefix, + justRefPrep = config.align_assemblies.just_ref_prep, + customOutput = customOutput, + ) AlignAssemblies().parse(args) restoreOrchestratorLogging(workDir) @@ -1219,33 +1249,22 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine output directory (custom or default) val customOutput = config.align_mutated_assemblies.output?.let { Path.of(it) } - val args = buildList { - add("--work-dir=${workDir}") - add("--ref-gff=${step10RefGff}") - add("--ref-fasta=${step10RefFasta}") - add("--fasta-input=${step10FastaInput}") - if (config.align_mutated_assemblies.threads != null) { - add("--threads=${config.align_mutated_assemblies.threads}") - } - if (config.align_mutated_assemblies.in_parallel != null) { - add("--in-parallel=${config.align_mutated_assemblies.in_parallel}") - } - if (config.align_mutated_assemblies.ref_max_align_cov != null) { - add("--ref-max-align-cov=${config.align_mutated_assemblies.ref_max_align_cov}") - } - if (config.align_mutated_assemblies.query_max_align_cov != null) { - add("--query-max-align-cov=${config.align_mutated_assemblies.query_max_align_cov}") - } - if (config.align_mutated_assemblies.conda_env_prefix != null) { - add("--conda-env-prefix=${config.align_mutated_assemblies.conda_env_prefix}") - } - if (config.align_mutated_assemblies.just_ref_prep == true) { - add("--just-ref-prep") - } - if (customOutput != null) { - add("--output-dir=${customOutput}") - } - } + val args = mutableListOf( + "--work-dir=$workDir", + "--ref-gff=$step10RefGff", + 
"--ref-fasta=$step10RefFasta", + "--fasta-input=$step10FastaInput", + ) + appendPhgAlignSharedArgs( + args, + threads = config.align_mutated_assemblies.threads, + inParallel = config.align_mutated_assemblies.in_parallel, + refMaxAlignCov = config.align_mutated_assemblies.ref_max_align_cov, + queryMaxAlignCov = config.align_mutated_assemblies.query_max_align_cov, + condaEnvPrefix = config.align_mutated_assemblies.conda_env_prefix, + justRefPrep = config.align_mutated_assemblies.just_ref_prep, + customOutput = customOutput, + ) AlignMutatedAssemblies().parse(args) restoreOrchestratorLogging(workDir) diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt new file mode 100644 index 0000000..24b86a8 --- /dev/null +++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignParams.kt @@ -0,0 +1,35 @@ +package net.maizegenetics.commands.align + +import java.nio.file.Path + +/** + * All inputs the [PhgAlignRunner] needs to wrap PHGv2 `align-assemblies`. + * + * Combines: + * - the user-facing PHGv2 knobs that are common to every align step (see + * [PhgAlignSharedOptions]), + * - per-caller metadata (log filename, output subdirectory, label used in + * log strings) that lets each thin wrapper place its outputs and logs + * in step-specific locations, and + * - the caller's unique input (collected as a single [Path] -- single file, + * directory, or .txt list -- just like the previous standalone commands). + * + * Adding a new align iteration is a new [PhgAlignParams] instance from a + * new Clikt wrapper; the runner itself does not change. + */ +data class PhgAlignParams( + val workDir: Path, + val refGff: Path, + val refFasta: Path, + val queryInput: Path, + val threads: Int, + val inParallel: Int? = null, + val refMaxAlignCov: Int? = null, + val queryMaxAlignCov: Int? = null, + val condaEnvPrefix: Path? = null, + val justRefPrep: Boolean = false, + val customOutputDir: Path? 
= null,
+    val logFileName: String,
+    val outputSubdir: String,
+    val inputKind: String,
+)
diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt
new file mode 100644
index 0000000..5798370
--- /dev/null
+++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignRunner.kt
@@ -0,0 +1,189 @@
+package net.maizegenetics.commands.align
+
+import net.maizegenetics.Constants
+import net.maizegenetics.utils.FileUtils
+import net.maizegenetics.utils.LoggingUtils
+import net.maizegenetics.utils.ProcessRunner
+import net.maizegenetics.utils.SeqSimCommandException
+import net.maizegenetics.utils.ValidationUtils
+import org.apache.logging.log4j.Logger
+import java.nio.file.Path
+import kotlin.io.path.listDirectoryEntries
+import kotlin.io.path.isRegularFile
+import kotlin.io.path.name
+import kotlin.io.path.writeLines
+
+/**
+ * Consolidated runner for every command that wraps PHGv2 `align-assemblies`.
+ *
+ * Validates the PHG binary, materializes a PHGv2 `--assembly-file-list` from
+ * the caller's input, invokes the PHG CLI, and writes the standard
+ * `maf_file_paths.txt` output. Per-caller variation (log filename, output
+ * subdirectory, label used in log strings) is provided via [PhgAlignParams]
+ * so adding a new align iteration is just a new thin Clikt wrapper -- no
+ * runner changes required.
+ */
+object PhgAlignRunner {
+
+    private const val MAF_PATHS_FILE = "maf_file_paths.txt"
+    private const val ASSEMBLY_LIST_FILE = "assemblies_list.txt"
+
+    /**
+     * Run PHGv2 align-assemblies for [params] and return the resolved output
+     * directory the run wrote to.
+     *
+     * @throws SeqSimCommandException if the PHG process exits with a non-zero code
+     */
+    fun run(params: PhgAlignParams, logger: Logger): Path {
+        // Validate working directory and PHG binary
+        val phgBinary = ValidationUtils.validatePhgSetup(params.workDir, logger)
+
+        // Configure file logging to working directory
+        LoggingUtils.setupFileLogging(params.workDir, params.logFileName, logger)
+
+        logger.info("Starting assembly alignment via PHGv2 `align-assemblies`")
+        logger.info("Working directory: ${params.workDir}")
+        logger.info("Reference GFF: ${params.refGff}")
+        logger.info("Reference FASTA: ${params.refFasta}")
+        logger.info("Total threads: ${params.threads}")
+        params.inParallel?.let { logger.info("In-parallel: $it") }
+        params.refMaxAlignCov?.let { logger.info("Ref max align cov (proali -R): $it") }
+        params.queryMaxAlignCov?.let { logger.info("Query max align cov (proali -Q): $it") }
+        params.condaEnvPrefix?.let { logger.info("Conda env prefix: $it") }
+        if (params.justRefPrep) {
+            logger.info("Just-ref-prep mode enabled (will not produce per-query MAFs)")
+        }
+
+        // Collect input files into a PHGv2-shaped assembly-file-list
+        val inputFiles = FileUtils.collectFiles(
+            params.queryInput,
+            Constants.FASTA_EXTENSIONS,
+            "FASTA",
+            logger
+        )
+        logger.info("Processing ${inputFiles.size} ${params.inputKind} file(s)")
+
+        // Create base output directory (use custom or default).
+        // PHGv2 requires the output directory to exist before invocation.
+        val baseOutputDir = FileUtils.resolveOutputDirectory(
+            params.workDir,
+            params.customOutputDir,
+            params.outputSubdir
+        )
+        FileUtils.createOutputDirectory(baseOutputDir, logger)
+
+        val assemblyListFile = writeAssemblyFileList(
+            inputFiles,
+            params.refFasta,
+            params.inputKind,
+            baseOutputDir,
+            logger
+        )
+
+        // Build the PHGv2 align-assemblies command
+        val commandArgs = mutableListOf(
+            phgBinary.toString(),
+            "align-assemblies",
+            "--gff", params.refGff.toAbsolutePath().toString(),
+            "--reference-file", params.refFasta.toAbsolutePath().toString(),
+            "--assembly-file-list", assemblyListFile.toAbsolutePath().toString(),
+            "--total-threads", params.threads.toString(),
+            "-o", baseOutputDir.toAbsolutePath().toString()
+        )
+        params.inParallel?.let { commandArgs += listOf("--in-parallel", it.toString()) }
+        params.refMaxAlignCov?.let { commandArgs += listOf("--ref-max-align-cov", it.toString()) }
+        params.queryMaxAlignCov?.let { commandArgs += listOf("--query-max-align-cov", it.toString()) }
+        params.condaEnvPrefix?.let {
+            commandArgs += listOf("--conda-env-prefix", it.toAbsolutePath().toString())
+        }
+        if (params.justRefPrep) {
+            commandArgs += "--just-ref-prep"
+        }
+
+        logger.info("Running PHG align-assemblies...")
+        val exitCode = ProcessRunner.runCommand(
+            *commandArgs.toTypedArray(),
+            workingDir = params.workDir.toFile(),
+            logger = logger
+        )
+
+        if (exitCode != 0) {
+            logger.error("PHG align-assemblies failed with exit code $exitCode")
+            throw SeqSimCommandException(
+                "PHG align-assemblies failed with exit code $exitCode",
+                exitCode
+            )
+        }
+
+        if (params.justRefPrep) {
+            logger.info("--just-ref-prep was set; skipping MAF collection.")
+            logger.info("Reference-prep outputs written to: $baseOutputDir")
+            return baseOutputDir
+        }
+
+        // Collect MAF outputs PHGv2 wrote into the output directory and
+        // surface them via the standard maf_file_paths.txt contract so
+        // downstream pipeline steps (maf-to-gvcf, create-chain-files, ...)
+        // keep working unchanged.
+        val mafFiles = baseOutputDir.listDirectoryEntries()
+            .filter { it.isRegularFile() && it.name.endsWith(".maf") }
+            .sorted()
+
+        FileUtils.writeFilePaths(
+            mafFiles,
+            baseOutputDir.resolve(MAF_PATHS_FILE),
+            logger,
+            "MAF file"
+        )
+
+        logger.info("=".repeat(80))
+        logger.info("PHG align-assemblies completed successfully")
+        logger.info("Total assemblies aligned: ${inputFiles.size}")
+        logger.info("MAF files written: ${mafFiles.size}")
+        logger.info("Output directory: $baseOutputDir")
+
+        return baseOutputDir
+    }
+
+    /**
+     * Materializes a PHGv2 `--assembly-file-list` from whatever the caller
+     * passed (a single FASTA, a directory, or a .txt list). The reference
+     * FASTA is filtered out if it accidentally appears in the collected list
+     * (PHGv2 warns against including the reference here) and duplicate paths
+     * are collapsed. Each case gets its own warning, so duplicated inputs are
+     * not misreported as the reference having been present.
+     *
+     * @return path of the list file written inside [baseOutputDir]
+     */
+    private fun writeAssemblyFileList(
+        inputFiles: List<Path>,
+        refFasta: Path,
+        inputKind: String,
+        baseOutputDir: Path,
+        logger: Logger
+    ): Path {
+        val refAbsolute = refFasta.toAbsolutePath().normalize()
+        val normalized = inputFiles.map { it.toAbsolutePath().normalize() }
+        val withoutRef = normalized.filter { it != refAbsolute }
+        val filtered = withoutRef.distinct()
+
+        // Compare sizes before de-duplication: otherwise a repeated query path
+        // would shrink the list and trigger the reference warning by mistake.
+        if (withoutRef.size != normalized.size) {
+            logger.warn(
+                "Reference FASTA was present in the $inputKind input list and was removed; " +
+                    "PHGv2 expects the reference to be passed only via --reference-file."
+            )
+        }
+
+        val duplicateCount = withoutRef.size - filtered.size
+        if (duplicateCount > 0) {
+            logger.warn("Removed $duplicateCount duplicate $inputKind path(s) from the assembly file list.")
+        }
+
+        val listFile = baseOutputDir.resolve(ASSEMBLY_LIST_FILE)
+        listFile.writeLines(filtered.map { it.toString() })
+        logger.info("Wrote PHGv2 assembly file list (${filtered.size} entries): $listFile")
+        return listFile
+    }
+}
diff --git a/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt
new file mode 100644
index 0000000..37ab8c5
--- /dev/null
+++ b/src/main/kotlin/net/maizegenetics/commands/align/PhgAlignSharedOptions.kt
@@ -0,0 +1,83 @@
+package net.maizegenetics.commands.align
+
+import com.github.ajalt.clikt.parameters.groups.OptionGroup
+import com.github.ajalt.clikt.parameters.options.default
+import com.github.ajalt.clikt.parameters.options.flag
+import com.github.ajalt.clikt.parameters.options.option
+import com.github.ajalt.clikt.parameters.options.required
+import com.github.ajalt.clikt.parameters.types.int
+import com.github.ajalt.clikt.parameters.types.path
+import net.maizegenetics.Constants
+import java.nio.file.Path
+
+/**
+ * Shared Clikt option group for every command that wraps PHGv2
+ * `align-assemblies`. Owns the options that are identical across align
+ * iterations (working dir, reference files, threading, AnchorWave proali
+ * knobs, conda env prefix, ref-prep-only flag, and the custom output dir
+ * override) so option names, help text, and defaults live in exactly one
+ * place. The per-step unique input flag (e.g. `--query-fasta` /
+ * `--fasta-input`) is declared on each wrapper itself.
+ */
+class PhgAlignSharedOptions : OptionGroup(name = "PHGv2 align-assemblies options") {
+
+    val workDir by option(
+        "--work-dir", "-w",
+        help = "Working directory for files and scripts"
+    ).path(mustExist = false, canBeFile = false, canBeDir = true) // need not exist yet; must not be a file
+        .default(Path.of(Constants.DEFAULT_WORK_DIR))
+
+    val refGff by option(
+        "--ref-gff", "-g",
+        help = "Reference GFF file (passed to PHGv2 as --gff)"
+    ).path(mustExist = true, canBeFile = true, canBeDir = false) // must be an existing file
+        .required()
+
+    val refFasta by option(
+        "--ref-fasta", "-r",
+        help = "Reference FASTA file (passed to PHGv2 as --reference-file). For best results " +
+            "this should be the output of `phg prepare-assemblies`."
+    ).path(mustExist = true, canBeFile = true, canBeDir = false) // must be an existing file
+        .required()
+
+    val threads by option(
+        "--threads", "-t",
+        help = "Total number of threads available to PHGv2 (--total-threads)"
+    ).int()
+        .default(1)
+
+    val inParallel by option(
+        "--in-parallel",
+        help = "Number of alignments to run in parallel (PHGv2 --in-parallel). " +
+            "If omitted, PHGv2 picks a value from system memory + thread count."
+    ).int() // no default: null when the flag is omitted
+
+    val refMaxAlignCov by option(
+        "--ref-max-align-cov",
+        help = "Maximum reference genome alignment coverage for AnchorWave proali (PHGv2 --ref-max-align-cov, " +
+            "passed through as proali's `-R`). PHGv2 defaults this to 1."
+    ).int() // no default: null when the flag is omitted
+
+    val queryMaxAlignCov by option(
+        "--query-max-align-cov",
+        help = "Maximum query genome alignment coverage for AnchorWave proali (PHGv2 --query-max-align-cov, " +
+            "passed through as proali's `-Q`). PHGv2 defaults this to 1."
+    ).int() // no default: null when the flag is omitted
+
+    val condaEnvPrefix by option(
+        "--conda-env-prefix",
+        help = "Path to a Conda environment that contains PHGv2's runtime dependencies " +
+            "(anchorwave, minimap2, samtools, ...). Defaults to the `phgv2-conda` env in its standard location."
+    ).path(mustExist = false, canBeFile = false, canBeDir = true) // no default: null when the flag is omitted
+
+    val justRefPrep by option(
+        "--just-ref-prep",
+        help = "Only run PHGv2's reference-prep phase (writes ref.cds.fasta + Ref.sam) and stop. " +
+            "Useful when feeding a SLURM array; skips writing maf_file_paths.txt because no MAFs are produced."
+    ).flag() // false unless the flag is passed
+
+    val outputDir by option(
+        "--output-dir", "-o",
+        help = "Custom output directory (default: work_dir/output/)"
+    ).path(mustExist = false, canBeFile = false, canBeDir = true) // no default: null when the flag is omitted
+}
diff --git a/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt
new file mode 100644
index 0000000..babf808
--- /dev/null
+++ b/src/test/kotlin/net/maizegenetics/commands/align/PhgAlignRunnerUnitTest.kt
@@ -0,0 +1,215 @@
+package net.maizegenetics.commands.align
+
+import net.maizegenetics.utils.ProcessRunner
+import net.maizegenetics.utils.testing.RecordingProcessExecutor
+import org.apache.logging.log4j.LogManager
+import org.junit.jupiter.api.AfterEach
+import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.io.TempDir
+import java.io.File
+import java.nio.file.Path
+import kotlin.io.path.createDirectories
+import kotlin.io.path.writeText
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Unit tests for [PhgAlignRunner] -- the shared backend used by both
+ * [net.maizegenetics.commands.AlignAssemblies] and
+ * [net.maizegenetics.commands.AlignMutatedAssemblies]. The wrappers' own
+ * tests still exercise the externally observable behaviour (command line,
+ * `assemblies_list.txt`, `maf_file_paths.txt`), so these tests focus on
+ * the runner's invariants directly: it should work for any caller, with
+ * any combination of step metadata.
+ */
+class PhgAlignRunnerUnitTest {
+
+    private val smallseqRoot: Path = File("src/test/resources/smallseq")
+        .absoluteFile.toPath()
+
+    private val logger = LogManager.getLogger(PhgAlignRunnerUnitTest::class.java)
+
+    @AfterEach
+    fun cleanup() {
+        ProcessRunner.resetExecutor()
+    }
+
+    /**
+     * Create a fake PHG layout (src/phg_v2/bin/phg) inside [workDir] so the runner's
+     * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes.
+     */
+    private fun stubPhgBinary(workDir: Path) {
+        val phgDir = workDir.resolve("src/phg_v2/bin")
+        phgDir.createDirectories()
+        val phg = phgDir.resolve("phg")
+        phg.writeText("#!/bin/sh\nexit 0\n")
+        phg.toFile().setExecutable(true)
+    }
+
+    private fun baseParams(workDir: Path, queryInput: Path) = PhgAlignParams(
+        workDir = workDir,
+        refGff = smallseqRoot.resolve("anchors.gff"),
+        refFasta = smallseqRoot.resolve("Ref.fa"),
+        queryInput = queryInput,
+        threads = 2,
+        logFileName = "test_align.log",
+        outputSubdir = "test_align_results",
+        inputKind = "query",
+    )
+
+    @Test
+    fun happyPathInvokesPhgWithSharedArgs(@TempDir workDir: Path) {
+        stubPhgBinary(workDir)
+        val executor = RecordingProcessExecutor(defaultExitCode = 0)
+
+        ProcessRunner.withExecutor(executor) {
+            PhgAlignRunner.run(
+                baseParams(workDir, smallseqRoot.resolve("queries")),
+                logger,
+            )
+        }
+
+        assertEquals(1, executor.invocations.size, "phg align-assemblies should be invoked exactly once")
+        val inv = executor.invocations.single()
+        assertTrue(inv.command.first().endsWith("phg"), "First token should be the phg binary")
+        assertEquals("align-assemblies", inv.command[1])
+
+        assertEquals(
+            smallseqRoot.resolve("anchors.gff").toAbsolutePath().toString(),
+            inv.argAfter("--gff")
+        )
+        assertEquals(
+            smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(),
+            inv.argAfter("--reference-file")
+        )
+        assertEquals("2", inv.argAfter("--total-threads"))
+
+        // Output dir is <workDir>/output/ followed by params.outputSubdir
+        val expectedOutputDir =
+            workDir.resolve("output/test_align_results")
+        assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o"))
+
+        // assemblies_list.txt is materialized inside the output dir
+        val assemblyList = expectedOutputDir.resolve("assemblies_list.txt").toFile()
+        assertTrue(assemblyList.exists(), "assemblies_list.txt should have been written")
+        val listed = assemblyList.readLines().filter { it.isNotBlank() }
+        assertEquals(3, listed.size, "Smallseq queries directory contains 3 FASTAs")
+
+        // Optional knobs absent unless set
+        assertTrue(!inv.command.contains("--in-parallel"))
+        assertTrue(!inv.command.contains("--ref-max-align-cov"))
+        assertTrue(!inv.command.contains("--query-max-align-cov"))
+        assertTrue(!inv.command.contains("--conda-env-prefix"))
+        assertTrue(!inv.command.contains("--just-ref-prep"))
+    }
+
+    @Test
+    fun runnerHonoursPerCallerLogAndOutputMetadata(@TempDir workDir: Path) {
+        stubPhgBinary(workDir)
+        val executor = RecordingProcessExecutor(defaultExitCode = 0)
+
+        // Different log filename + subdir from the default test params
+        val params = baseParams(workDir, smallseqRoot.resolve("queries")).copy(
+            logFileName = "99_custom_step.log",
+            outputSubdir = "99_custom_step_results",
+            inputKind = "FASTA",
+        )
+
+        ProcessRunner.withExecutor(executor) {
+            PhgAlignRunner.run(params, logger)
+        }
+
+        val inv = executor.invocations.single()
+        val expectedOutputDir = workDir.resolve("output/99_custom_step_results")
+        assertEquals(expectedOutputDir.toAbsolutePath().toString(), inv.argAfter("-o"))
+        assertTrue(
+            workDir.resolve("logs/99_custom_step.log").toFile().exists(),
+            "Custom log file should be created under /logs/"
+        )
+    }
+
+    @Test
+    fun justRefPrepSkipsMafFilePathsTextFile(@TempDir workDir: Path) {
+        stubPhgBinary(workDir)
+        val executor = RecordingProcessExecutor(defaultExitCode = 0)
+
+        val params = baseParams(workDir, smallseqRoot.resolve("queries")).copy(
+            justRefPrep = true,
+        )
+
+        ProcessRunner.withExecutor(executor) {
+            PhgAlignRunner.run(params, logger)
+        }
+
+        val inv = executor.invocations.single()
+        assertTrue(inv.command.contains("--just-ref-prep"), "--just-ref-prep should be forwarded")
+
+        val mafPathsFile = workDir.resolve("output/test_align_results/maf_file_paths.txt").toFile()
+        assertTrue(
+            !mafPathsFile.exists(),
+            "maf_file_paths.txt should NOT be written when --just-ref-prep is set"
+        )
+    }
+
+    @Test
+    fun referenceFastaIsFilteredOutOfAssemblyList(@TempDir workDir: Path) {
+        stubPhgBinary(workDir)
+        val executor = RecordingProcessExecutor(defaultExitCode = 0)
+
+        // Build a text-list input that intentionally includes Ref.fa alongside
+        // the three real queries -- the runner should drop the reference.
+        val refFasta = smallseqRoot.resolve("Ref.fa").toAbsolutePath()
+        val queryList = workDir.resolve("queries_with_ref.txt")
+        queryList.writeText(
+            buildString {
+                appendLine(refFasta.toString())
+                appendLine(smallseqRoot.resolve("queries/LineA.fa").toAbsolutePath().toString())
+                appendLine(smallseqRoot.resolve("queries/LineB.fa").toAbsolutePath().toString())
+                appendLine(smallseqRoot.resolve("queries/LineC.fa").toAbsolutePath().toString())
+            }
+        )
+
+        ProcessRunner.withExecutor(executor) {
+            PhgAlignRunner.run(baseParams(workDir, queryList), logger)
+        }
+
+        val inv = executor.invocations.single()
+        val assemblyListPath = inv.argAfter("--assembly-file-list")
+            ?: error("--assembly-file-list not present in command")
+        val listed = File(assemblyListPath).readLines().filter { it.isNotBlank() }
+        assertEquals(
+            3,
+            listed.size,
+            "Reference FASTA should have been filtered out, leaving the 3 queries"
+        )
+        assertTrue(
+            listed.none { it == refFasta.toString() },
+            "Reference FASTA should not be present in the assembly file list"
+        )
+    }
+
+    @Test
+    fun optionalKnobsAreForwardedWhenProvided(@TempDir workDir: Path) {
+        stubPhgBinary(workDir)
+        val executor = RecordingProcessExecutor(defaultExitCode = 0)
+        val condaPrefix = workDir.resolve("conda_env").also { it.createDirectories() }
+
+        val params = baseParams(workDir, smallseqRoot.resolve("queries/LineA.fa")).copy(
+            threads = 4,
+            inParallel = 2,
+            refMaxAlignCov = 3,
+            queryMaxAlignCov = 5,
+            condaEnvPrefix = condaPrefix,
+        )
+
+        ProcessRunner.withExecutor(executor) {
+            PhgAlignRunner.run(params, logger)
+        }
+
+        val inv = executor.invocations.single()
+        assertEquals("4", inv.argAfter("--total-threads"))
+        assertEquals("2", inv.argAfter("--in-parallel"))
+        assertEquals("3", inv.argAfter("--ref-max-align-cov"))
+        assertEquals("5", inv.argAfter("--query-max-align-cov"))
+        assertEquals(condaPrefix.toAbsolutePath().toString(), inv.argAfter("--conda-env-prefix"))
+    }
+}
From b3cc991352e2baddb11143ac2b5b81e35278fb88 Mon Sep 17 00:00:00 2001
From: Brandon Monier
Date: Thu, 14 May 2026 15:01:41 -0500
Subject: [PATCH 08/18] Add workarounds for PHG align commands; Add grits/MLImpute project name fallbacks

---
 docker/Dockerfile.dev | 19 +++++++++++++++++++
 scripts/dev.sh | 14 +++++++++-----
 .../commands/SetupEnvironment.kt | 18 +++++++++++++++---
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev
index c5f2bc7..a48f723 100644
--- a/docker/Dockerfile.dev
+++ b/docker/Dockerfile.dev
@@ -82,6 +82,25 @@ RUN mkdir -p "${MAMBA_ROOT_PREFIX}" && \
 # are directly callable without `pixi run` or `conda run` inside the container.
 ENV PATH="${MAMBA_ROOT_PREFIX}/envs/phgv2-conda/bin:${PATH}"
 
+# ---------------------------------------------------------------------------
+# `conda` shim that delegates to micromamba.
+#
+# PHGv2's AlignAssemblies hard-codes `conda run -n phgv2-conda ...`
+# (anchorwave, minimap2, samtools) and spawns it via Java's ProcessBuilder
+# instead of going through a shell. Since this image only ships micromamba
+# (lighter weight and what we use to manage the phgv2-conda env above),
+# `conda` is not normally available and PHG fails with
+# "Cannot run program 'conda': No such file or directory".
+# +# A tiny exec-shim is enough: micromamba accepts the same `run -n NAME -- +# CMD ARGS` form so we just forward everything. +# --------------------------------------------------------------------------- +RUN printf '%s\n' \ + '#!/bin/bash' \ + 'exec micromamba "$@"' \ + > /usr/local/bin/conda && \ + chmod +x /usr/local/bin/conda + # --------------------------------------------------------------------------- # Pre-install the seq-sim pixi env. We do this against a scratch directory so # the image layer is stable regardless of the user's host-side seq_sim_work. diff --git a/scripts/dev.sh b/scripts/dev.sh index 82282ee..bb9ad9e 100755 --- a/scripts/dev.sh +++ b/scripts/dev.sh @@ -73,26 +73,30 @@ case "$cmd" in ;; test|unit) ensure_image - run_in_container bash -lc "./gradlew test $*" + # Non-login (`bash -c`, not `-lc`): Debian's /etc/profile would otherwise + # reset PATH and drop /opt/micromamba/envs/phgv2-conda/bin (which holds + # anchorwave, minimap2, ...). The Dockerfile sets PATH via ENV, which + # is preserved here but blown away by a login shell. 
+ run_in_container bash -c "./gradlew test $*" ;; integration|int) ensure_image - run_in_container bash -lc "./gradlew integrationTest $*" + run_in_container bash -c "./gradlew integrationTest $*" ;; e2e|smoke) ensure_image - run_in_container bash -lc "./gradlew e2eTest $*" + run_in_container bash -c "./gradlew e2eTest $*" ;; all) ensure_image - run_in_container bash -lc "./gradlew test integrationTest e2eTest $*" + run_in_container bash -c "./gradlew test integrationTest e2eTest $*" ;; run) ensure_image # Everything after `--` is passed to `gradlew run --args="..."` if [ "${1:-}" = "--" ]; then shift; fi args="$*" - run_in_container bash -lc "./gradlew run --args=\"$args\"" + run_in_container bash -c "./gradlew run --args=\"$args\"" ;; exec) ensure_image diff --git a/src/main/kotlin/net/maizegenetics/commands/SetupEnvironment.kt b/src/main/kotlin/net/maizegenetics/commands/SetupEnvironment.kt index 5820e61..e9ca7eb 100644 --- a/src/main/kotlin/net/maizegenetics/commands/SetupEnvironment.kt +++ b/src/main/kotlin/net/maizegenetics/commands/SetupEnvironment.kt @@ -85,9 +85,18 @@ class SetupEnvironment : CliktCommand(name = "setup-environment") { exitProcess(1) } - // Find extracted directory and rename to standard name (removes "-main" suffix) + // Find extracted directory and rename to standard name. + // + // GitHub names the top-level folder inside the archive after the + // *current* repo name (e.g. "MLImpute-main"). The upstream repo + // was renamed from `MLImpute` to `grits`, so today the archive + // extracts as `grits-main/`. We match on a structural marker + // (the gradle wrapper script we care about) instead of a name + // prefix so future renames don't break this step. 
val extractedDir = srcDir.toFile().listFiles { file -> - file.isDirectory && file.name.startsWith("MLImpute") && file.name != Constants.MLIMPUTE_DIR + file.isDirectory && + file.name != Constants.MLIMPUTE_DIR && + file.resolve("src/kotlin/gradlew").isFile }?.firstOrNull() if (extractedDir != null) { @@ -96,7 +105,10 @@ class SetupEnvironment : CliktCommand(name = "setup-environment") { logger.warn("Failed to rename MLImpute directory, will use extracted name: ${extractedDir.name}") } } else { - logger.warn("Could not find extracted MLImpute directory") + logger.warn( + "Could not find extracted MLImpute directory under $srcDir " + + "(no subdirectory containing src/kotlin/gradlew)" + ) } } From dd3da193e0c377478bb2f57685a7004b3dea6d53 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 11:40:15 -0500 Subject: [PATCH 09/18] Initial unit tests --- .../commands/ConvertToFastaUnitTest.kt | 311 ++++++++++++++++++ .../commands/DownsampleGvcfUnitTest.kt | 228 +++++++++++++ .../commands/MafToGvcfUnitTest.kt | 300 +++++++++++++++++ 3 files changed, 839 insertions(+) create mode 100644 src/test/kotlin/net/maizegenetics/commands/ConvertToFastaUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/DownsampleGvcfUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/MafToGvcfUnitTest.kt diff --git a/src/test/kotlin/net/maizegenetics/commands/ConvertToFastaUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/ConvertToFastaUnitTest.kt new file mode 100644 index 0000000..420a104 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/ConvertToFastaUnitTest.kt @@ -0,0 +1,311 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.ByteArrayOutputStream 
+import java.io.File
+import java.nio.file.Path
+import java.util.zip.GZIPOutputStream
+import kotlin.io.path.createDirectories
+import kotlin.io.path.exists
+import kotlin.io.path.writeBytes
+import kotlin.io.path.writeText
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Unit tests for [ConvertToFasta] (step 04 of the variant pipeline) that
+ * don't actually shell out to MLImpute. We install a
+ * [RecordingProcessExecutor] and verify the exact `./gradlew run --args=...`
+ * invocation, the working directory passed to gradlew, the
+ * decompression staging for compressed GVCFs, and the per-step
+ * `fasta_file_paths.txt` output contract that downstream steps depend on.
+ */
+class ConvertToFastaUnitTest {
+
+    private val smallseqRoot: Path = File("src/test/resources/smallseq")
+        .absoluteFile.toPath()
+
+    @AfterEach
+    fun cleanup() {
+        ProcessRunner.resetExecutor()
+    }
+
+    private fun stubMlimpute(workDir: Path): Path {
+        val mlimputeKotlinDir = workDir.resolve("src/MLImpute/src/kotlin")
+        mlimputeKotlinDir.createDirectories()
+        val gradlew = mlimputeKotlinDir.resolve("gradlew")
+        gradlew.writeText("#!/bin/sh\nexit 0\n")
+        gradlew.toFile().setExecutable(true)
+        return mlimputeKotlinDir
+    }
+
+    private fun writeGzipped(path: Path, contents: String) {
+        val baos = ByteArrayOutputStream()
+        GZIPOutputStream(baos).use { it.write(contents.toByteArray()) }
+        path.writeBytes(baos.toByteArray())
+    }
+
+    /** Parse the space-separated `--key=value` pairs inside the `--args=` token into a key -> value map. */
+    private fun parseArgsValue(args: String): Map<String, String> =
+        args.split(" ")
+            .filter { it.startsWith("--") && it.contains("=") }
+            .associate {
+                val (k, v) = it.removePrefix("--").split("=", limit = 2)
+                k to v
+            }
+
+    /**
+     * Hook helper that simulates MLImpute writing a `.fasta` file for the
+     * configured `--out-file` path so the command's `writeFilePaths()`
+     * step can record real paths in `fasta_file_paths.txt`.
+     */
+    private fun mlimputeSimulator(): (RecordingProcessExecutor.Invocation) -> Int = { inv ->
+        val args = inv.command.firstOrNull { it.startsWith("--args=") }?.removePrefix("--args=")
+        args?.split(" ")
+            ?.firstOrNull { it.startsWith("--out-file=") }
+            ?.removePrefix("--out-file=")
+            ?.let { out ->
+                File(out).also { f ->
+                    f.parentFile?.mkdirs()
+                    f.writeText(">simulated\nACGT\n")
+                }
+            }
+        0
+    }
+
+    @Test
+    fun gradlewIsInvokedOncePerGvcfWithExpectedArgs(@TempDir workDir: Path) {
+        val mlimputeKotlinDir = stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n")
+        gvcfDir.resolve("LineB.gvcf").writeText("##fileformat=VCFv4.2\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(),
+                    "--missing-records-as", "asN",
+                    "--missing-genotype-as", "asRef"
+                )
+            )
+        }
+
+        assertEquals(2, executor.invocations.size, "gradlew should be invoked once per GVCF file")
+
+        val inv = executor.invocations.first()
+        assertEquals("./gradlew", inv.command[0])
+        assertEquals("run", inv.command[1])
+        assertEquals(
+            mlimputeKotlinDir.toFile().absoluteFile,
+            inv.workingDir?.absoluteFile,
+            "gradlew should run inside the MLImpute kotlin project"
+        )
+
+        val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=")
+        assertTrue(
+            argsToken.startsWith("convert-to-fasta "),
+            "First token in --args should be the MLImpute subcommand `convert-to-fasta`"
+        )
+        val argMap = parseArgsValue(argsToken)
+        assertEquals(
+            smallseqRoot.resolve("Ref.fa").toAbsolutePath().toString(),
+            argMap["fasta-file"]
+        )
+        assertEquals("asN", argMap["missing-records-as"])
+        assertEquals("asRef", argMap["missing-genotype-as"])
+
+
+        // out-file must be a .fasta path under the default output directory.
+        val expectedOutDir = workDir.resolve("output/04_fasta_results")
+            .toAbsolutePath().toString()
+        val outFiles = executor.invocations.map { inv2 ->
+            inv2.command.first { it.startsWith("--args=") }
+                .let { parseArgsValue(it.removePrefix("--args=")) }["out-file"]!!
+        }
+        assertEquals(setOf(true), outFiles.map { it.endsWith(".fasta") }.toSet())
+        assertTrue(
+            outFiles.all { it.startsWith(expectedOutDir) },
+            "All FASTA outputs should land in $expectedOutDir; got: $outFiles"
+        )
+    }
+
+    @Test
+    fun ignoreContigIsForwardedOnlyWhenProvided(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(),
+                    "--ignore-contig", "chrUn,chloro"
+                )
+            )
+        }
+
+        val inv = executor.invocations.single()
+        val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=")
+        val argMap = parseArgsValue(argsToken)
+        assertEquals("chrUn,chloro", argMap["ignore-contig"])
+    }
+
+    @Test
+    fun ignoreContigIsOmittedWhenEmpty(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString()
+                )
+            )
+        }
+
+        val argsToken = executor.invocations.single().command
+            .first { it.startsWith("--args=") }
+            .removePrefix("--args=")
+        assertTrue(
+            !argsToken.contains("--ignore-contig="),
+            "--ignore-contig should not be forwarded when not provided"
+        )
+    }
+
+    @Test
+    fun compressedGvcfsAreDecompressedBeforeMlimputeIsCalled(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        writeGzipped(gvcfDir.resolve("LineA.g.vcf.gz"), "##fileformat=VCFv4.2\nA\n")
+        writeGzipped(gvcfDir.resolve("LineB.gvcf.gz"), "##fileformat=VCFv4.2\nB\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString()
+                )
+            )
+        }
+
+        // Each invocation's `--gvcf-file` argument must point at the
+        // decompressed temp dir, NOT the original .g.vcf.gz inputs.
+        val gvcfArgs = executor.invocations.map { inv ->
+            inv.command.first { it.startsWith("--args=") }
+                .let { parseArgsValue(it.removePrefix("--args=")) }["gvcf-file"]!!
+        }
+        assertEquals(2, gvcfArgs.size)
+        assertTrue(
+            gvcfArgs.all { it.endsWith(".gvcf") },
+            "MLImpute must always be handed a .gvcf path; got: $gvcfArgs"
+        )
+        assertTrue(
+            gvcfArgs.none { it.endsWith(".gz") },
+            "Compressed inputs must be decompressed before forwarding; got: $gvcfArgs"
+        )
+    }
+
+    @Test
+    fun fastaFilePathsTextFileListsGeneratedFastas(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n")
+        gvcfDir.resolve("LineB.gvcf").writeText("##fileformat=VCFv4.2\n")
+        gvcfDir.resolve("LineC.gvcf").writeText("##fileformat=VCFv4.2\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString()
+                )
+            )
+        }
+
+        val fastaPaths = workDir.resolve("output/04_fasta_results/fasta_file_paths.txt").toFile()
+        assertTrue(fastaPaths.exists(), "fasta_file_paths.txt should be written")
+        val lines = fastaPaths.readLines().filter { it.isNotBlank() }
+        assertEquals(3, lines.size, "One FASTA path per GVCF should be listed")
+        assertTrue(
+            lines.all { it.endsWith(".fasta") },
+            "Every listed path should end with .fasta"
+        )
+        assertTrue(
+            lines.all { File(it).exists() && File(it).length() > 0 },
+            "Every listed FASTA must exist on disk and be non-empty (from simulator)"
+        )
+    }
+
+    @Test
+    fun tempUncompressedDirIsAlwaysCleanedUp(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        writeGzipped(gvcfDir.resolve("LineA.g.vcf.gz"), "##fileformat=VCFv4.2\n")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString()
+                )
+            )
+        }
+
+        val tempDir = workDir.resolve("temp_uncompressed_gvcf_fasta")
+        assertTrue(!tempDir.exists(), "Temp uncompressed dir should be removed after the run")
+    }
+
+    @Test
+    fun customOutputDirIsHonored(@TempDir workDir: Path) {
+        stubMlimpute(workDir)
+        val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() }
+        gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n")
+        val customOutput = workDir.resolve("custom_fasta_dir")
+
+        val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = mlimputeSimulator())
+        ProcessRunner.withExecutor(executor) {
+            ConvertToFasta().parse(
+                listOf(
+                    "--work-dir", workDir.toString(),
+                    "--gvcf-file", gvcfDir.toString(),
+                    "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(),
+                    "--output-dir", customOutput.toString()
+                )
+            )
+        }
+
+        val argsToken = executor.invocations.single().command
+            .first { it.startsWith("--args=") }
+            .removePrefix("--args=")
+        val argMap = parseArgsValue(argsToken)
+        val outFile = argMap["out-file"]!!
+        assertTrue(
+            outFile.startsWith(customOutput.toAbsolutePath().toString()),
+            "Custom --output-dir should be respected (got out-file=$outFile)"
+        )
+        assertTrue(
+            customOutput.resolve("fasta_file_paths.txt").toFile().exists(),
+            "fasta_file_paths.txt should be written under the custom output dir"
+        )
+    }
+}
diff --git a/src/test/kotlin/net/maizegenetics/commands/DownsampleGvcfUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/DownsampleGvcfUnitTest.kt
new file mode 100644
index 0000000..e32b837
--- /dev/null
+++ b/src/test/kotlin/net/maizegenetics/commands/DownsampleGvcfUnitTest.kt
@@ -0,0 +1,228 @@
+package net.maizegenetics.commands
+
+import com.github.ajalt.clikt.core.parse
+import net.maizegenetics.utils.ProcessRunner
+import net.maizegenetics.utils.testing.RecordingProcessExecutor
+import org.junit.jupiter.api.AfterEach
+import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.io.TempDir
+import java.io.ByteArrayOutputStream
+import java.nio.file.Path
+import java.util.zip.GZIPOutputStream
+import kotlin.io.path.createDirectories
+import kotlin.io.path.exists
+import kotlin.io.path.writeBytes
+import kotlin.io.path.writeText
+import kotlin.test.assertEquals
+import kotlin.test.assertTrue
+
+/**
+ * Unit tests for [DownsampleGvcf] (step 03 of the variant pipeline) that
+ * don't actually shell out to MLImpute. We install a
+ * [RecordingProcessExecutor] and verify the exact `./gradlew run --args=...`
+ * invocation, the working directory passed to gradlew, and the staging
+ * behavior for compressed / mis-named GVCFs.
+ */
+class DownsampleGvcfUnitTest {
+
+    @AfterEach
+    fun cleanup() {
+        ProcessRunner.resetExecutor()
+    }
+
+    /**
+     * Create the fake MLImpute kotlin subproject inside [workDir] so the
+     * command's existence checks pass.
+ */ + private fun stubMlimpute(workDir: Path): Path { + val mlimputeKotlinDir = workDir.resolve("src/MLImpute/src/kotlin") + mlimputeKotlinDir.createDirectories() + val gradlew = mlimputeKotlinDir.resolve("gradlew") + gradlew.writeText("#!/bin/sh\nexit 0\n") + gradlew.toFile().setExecutable(true) + return mlimputeKotlinDir + } + + private fun writeGzipped(path: Path, contents: String) { + val baos = ByteArrayOutputStream() + GZIPOutputStream(baos).use { it.write(contents.toByteArray()) } + path.writeBytes(baos.toByteArray()) + } + + private fun parseArgsValue(args: String): Map { + // The single --args= string contains space-separated key=value pairs. + // It also has the "downsample-gvcf" subcommand as the first token. + return args.split(" ") + .filter { it.startsWith("--") && it.contains("=") } + .associate { + val (k, v) = it.removePrefix("--").split("=", limit = 2) + k to v + } + } + + @Test + fun gradlewIsInvokedWithExpectedDownsampleArgs(@TempDir workDir: Path) { + val mlimputeKotlinDir = stubMlimpute(workDir) + val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() } + gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + DownsampleGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--gvcf-dir", gvcfDir.toString(), + "--rates", "0.1,0.2", + "--seed", "42", + "--keep-ref", "false", + "--min-ref-block-size", "30" + ) + ) + } + + assertEquals(1, executor.invocations.size, "gradlew should be invoked exactly once") + val inv = executor.invocations.single() + + assertEquals("./gradlew", inv.command[0]) + assertEquals("run", inv.command[1]) + assertEquals( + mlimputeKotlinDir.toFile().absoluteFile, + inv.workingDir?.absoluteFile, + "gradlew should run inside the MLImpute kotlin project" + ) + + val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=") + assertTrue( + 
argsToken.startsWith("downsample-gvcf "), + "First token in --args should be the MLImpute subcommand `downsample-gvcf`" + ) + val argMap = parseArgsValue(argsToken) + assertEquals("0.1,0.2", argMap["rates"]) + assertEquals("42", argMap["seed"]) + assertEquals("false", argMap["keep-ref"]) + assertEquals("30", argMap["min-ref-block-size"]) + + // Output directory defaults to workDir/output/03_downsample_results + val expectedOutDir = workDir.resolve("output/03_downsample_results") + .toAbsolutePath().toString() + assertEquals(expectedOutDir, argMap["out-dir"]) + + // Already-uncompressed .gvcf -- MLImpute reads directly from the input dir + assertEquals(gvcfDir.toAbsolutePath().toString(), argMap["gvcf-dir"]) + + // --ignore-contig is empty by default and should NOT be forwarded + assertTrue(!argsToken.contains("--ignore-contig=")) + } + + @Test + fun ignoreContigIsForwardedOnlyWhenProvided(@TempDir workDir: Path) { + stubMlimpute(workDir) + val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() } + gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + DownsampleGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--gvcf-dir", gvcfDir.toString(), + "--ignore-contig", "chrM,chrPt" + ) + ) + } + + val inv = executor.invocations.single() + val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=") + val argMap = parseArgsValue(argsToken) + assertEquals("chrM,chrPt", argMap["ignore-contig"]) + } + + @Test + fun compressedGvcfsAreDecompressedToTempDirBeforeForwarding(@TempDir workDir: Path) { + stubMlimpute(workDir) + val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() } + // Two compressed inputs forces the command to use a temp directory + // for MLImpute since none of the staged files live under gvcfDir. 
+ writeGzipped(gvcfDir.resolve("LineA.g.vcf.gz"), "##fileformat=VCFv4.2\nA\n") + writeGzipped(gvcfDir.resolve("LineB.gvcf.gz"), "##fileformat=VCFv4.2\nB\n") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + DownsampleGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--gvcf-dir", gvcfDir.toString(), + "--keep-uncompressed" + ) + ) + } + + val inv = executor.invocations.single() + val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=") + val argMap = parseArgsValue(argsToken) + + val mlimputeInputDir = Path.of(argMap["gvcf-dir"]!!) + assertTrue( + mlimputeInputDir.toAbsolutePath().startsWith(workDir.toAbsolutePath()), + "MLImpute input dir should be under the working directory" + ) + assertTrue( + mlimputeInputDir.fileName.toString() == "temp_uncompressed_gvcf", + "Compressed inputs should be staged in the temp_uncompressed_gvcf dir, " + + "got: $mlimputeInputDir" + ) + + // With --keep-uncompressed the temp directory survives so we can + // assert decompression actually occurred. 
+ val tempDir = workDir.resolve("temp_uncompressed_gvcf") + assertTrue(tempDir.exists(), "Temp uncompressed dir should exist") + val decompressed = tempDir.toFile().listFiles()!!.map { it.name }.toSet() + assertEquals(setOf("LineA.gvcf", "LineB.gvcf"), decompressed) + } + + @Test + fun tempUncompressedDirIsCleanedUpByDefault(@TempDir workDir: Path) { + stubMlimpute(workDir) + val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() } + writeGzipped(gvcfDir.resolve("LineA.g.vcf.gz"), "##fileformat=VCFv4.2\n") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + DownsampleGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--gvcf-dir", gvcfDir.toString() + // --keep-uncompressed NOT set; temp dir should be cleaned up + ) + ) + } + + val tempDir = workDir.resolve("temp_uncompressed_gvcf") + assertTrue(!tempDir.exists(), "Temp uncompressed dir should be removed after run") + } + + @Test + fun customOutputDirIsForwarded(@TempDir workDir: Path) { + stubMlimpute(workDir) + val gvcfDir = workDir.resolve("gvcfs").also { it.createDirectories() } + gvcfDir.resolve("LineA.gvcf").writeText("##fileformat=VCFv4.2\n") + val customOutput = workDir.resolve("custom_downsample_out") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + DownsampleGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--gvcf-dir", gvcfDir.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + val argsToken = inv.command.first { it.startsWith("--args=") }.removePrefix("--args=") + val argMap = parseArgsValue(argsToken) + assertEquals(customOutput.toAbsolutePath().toString(), argMap["out-dir"]) + assertTrue(customOutput.exists(), "Custom output dir should have been created") + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/MafToGvcfUnitTest.kt 
b/src/test/kotlin/net/maizegenetics/commands/MafToGvcfUnitTest.kt new file mode 100644 index 0000000..bba24b9 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/MafToGvcfUnitTest.kt @@ -0,0 +1,300 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [MafToGvcf] (step 02 of the variant pipeline) that + * don't actually shell out to `biokotlin-tools`. We install a + * [RecordingProcessExecutor] and verify the exact command line that + * seq-sim would send to `biokotlin-tools maf-to-gvcf-converter`, plus + * that `gvcf_file_paths.txt` is written for downstream steps. + */ +class MafToGvcfUnitTest { + + private val smallseqRoot: Path = File("src/test/resources/smallseq") + .absoluteFile.toPath() + + // The dev container sets SEQ_SIM_SKIP_PIXI_PREFIX=1, which makes + // ProcessRunner silently drop a leading `pixi run` before it reaches the + // executor. That's the right behavior at runtime (the container's PATH + // already has the pixi tools on it), but it would break the positional + // assertions below that verify *what the command builds*, not what + // actually runs. We snapshot the flag, force it off for the duration of + // each test, and restore it in @AfterEach so we don't leak state into + // other test classes. 
+ private val originalSkipPixi = ProcessRunner.skipPixiPrefix + + @BeforeEach + fun disablePixiStripping() { + ProcessRunner.skipPixiPrefix = false + } + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + ProcessRunner.skipPixiPrefix = originalSkipPixi + } + + /** + * Create a fake biokotlin-tools layout (bin/biokotlin-tools) inside + * [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validateBiokotlinSetup] passes. + */ + private fun stubBiokotlinBinary(workDir: Path): Path { + val biokotlinDir = workDir.resolve("src/biokotlin-tools/bin") + biokotlinDir.createDirectories() + val binary = biokotlinDir.resolve("biokotlin-tools") + binary.writeText("#!/bin/sh\nexit 0\n") + binary.toFile().setExecutable(true) + return binary + } + + /** + * Drop a few stub `.maf` files into [dir] so the command's + * `collectMafFiles()` has something to iterate over. Content doesn't + * matter because biokotlin-tools is mocked. + */ + private fun stubMafFiles(dir: Path, names: List): List { + dir.createDirectories() + return names.map { name -> + val f = dir.resolve(name) + f.writeText("##maf version=1\n") + f + } + } + + @Test + fun biokotlinIsInvokedOncePerMafWithExpectedArgs(@TempDir workDir: Path) { + val biokotlinBinary = stubBiokotlinBinary(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf", "LineB.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + // Simulate biokotlin-tools producing the .g.vcf.gz output file. 
+ val outputArg = inv.command.firstOrNull { it.startsWith("--output-file=") } + outputArg?.removePrefix("--output-file=")?.let { outputPath -> + File("$outputPath.gz").also { f -> + f.parentFile?.mkdirs() + f.writeText("##fileformat=VCFv4.2\n") + } + } + 0 + } + + ProcessRunner.withExecutor(executor) { + MafToGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--reference-file", smallseqRoot.resolve("Ref.fa").toString(), + "--maf-file", mafDir.toString(), + "--sample-name", "smallseq" + ) + ) + } + + // One invocation per MAF file + assertEquals(2, executor.invocations.size, "biokotlin should be invoked once per MAF file") + + // Verify command shape on the first invocation + val inv = executor.invocations.first() + assertTrue( + inv.command.first() == "pixi" && inv.command[1] == "run", + "biokotlin-tools should be launched through `pixi run`" + ) + assertEquals(biokotlinBinary.toString(), inv.command[2]) + assertEquals("maf-to-gvcf-converter", inv.command[3]) + + val refArg = inv.command.firstOrNull { it.startsWith("--reference-file=") } + assertEquals( + "--reference-file=${smallseqRoot.resolve("Ref.fa")}", + refArg + ) + val sampleNameArg = inv.command.firstOrNull { it.startsWith("--sample-name=") } + assertEquals("--sample-name=smallseq", sampleNameArg) + + // Auto-generated output files end with .g.vcf and live in default 02_gvcf_results + val expectedOutDir = workDir.resolve("output/02_gvcf_results") + val outputArgs = executor.invocations.map { inv2 -> + inv2.command.first { it.startsWith("--output-file=") } + } + assertTrue( + outputArgs.all { it.endsWith(".g.vcf") }, + "Auto-generated output filenames should end with .g.vcf (biokotlin adds .gz)" + ) + assertTrue( + outputArgs.all { it.contains(expectedOutDir.toString()) }, + "Outputs should live in /output/02_gvcf_results by default" + ) + } + + @Test + fun sampleNameDefaultsToMafBaseNameWhenUnset(@TempDir workDir: Path) { + stubBiokotlinBinary(workDir) + val mafDir = workDir.resolve("mafs") + 
stubMafFiles(mafDir, listOf("LineA.maf", "LineB.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + // Produce the expected .g.vcf.gz so writeFilePaths succeeds. + val outputArg = inv.command.firstOrNull { it.startsWith("--output-file=") } + outputArg?.removePrefix("--output-file=")?.let { outputPath -> + File("$outputPath.gz").also { f -> + f.parentFile?.mkdirs() + f.writeText("##fileformat=VCFv4.2\n") + } + } + 0 + } + + ProcessRunner.withExecutor(executor) { + MafToGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--reference-file", smallseqRoot.resolve("Ref.fa").toString(), + "--maf-file", mafDir.toString() + ) + ) + } + + val sampleNames = executor.invocations.map { inv -> + inv.command.first { it.startsWith("--sample-name=") } + .removePrefix("--sample-name=") + } + assertEquals(setOf("LineA", "LineB"), sampleNames.toSet()) + } + + @Test + fun gvcfFilePathsTextFileListsGeneratedGvcfs(@TempDir workDir: Path) { + stubBiokotlinBinary(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf", "LineB.maf", "LineC.maf")) + + // RecordingProcessExecutor doesn't run biokotlin, so simulate its + // side-effect: write a .g.vcf.gz at the requested --output-file path + // (with biokotlin's implicit .gz suffix added by the command itself). 
+ val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputArg = inv.command.firstOrNull { it.startsWith("--output-file=") } + outputArg?.removePrefix("--output-file=")?.let { outputPath -> + File("$outputPath.gz").also { f -> + f.parentFile?.mkdirs() + f.writeText("##fileformat=VCFv4.2\n") + } + } + 0 + } + + ProcessRunner.withExecutor(executor) { + MafToGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--reference-file", smallseqRoot.resolve("Ref.fa").toString(), + "--maf-file", mafDir.toString() + ) + ) + } + + val gvcfPaths = workDir.resolve("output/02_gvcf_results/gvcf_file_paths.txt").toFile() + assertTrue(gvcfPaths.exists(), "gvcf_file_paths.txt should be written") + val lines = gvcfPaths.readLines().filter { it.isNotBlank() } + assertEquals(3, lines.size, "One GVCF path per MAF file should be listed") + assertTrue( + lines.all { it.endsWith(".g.vcf.gz") }, + "Every listed path should be a .g.vcf.gz file" + ) + assertTrue( + lines.all { File(it).exists() }, + "Every listed GVCF must exist on disk" + ) + } + + @Test + fun customOutputDirIsHonored(@TempDir workDir: Path) { + stubBiokotlinBinary(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf")) + val customOutput = workDir.resolve("custom_gvcf_dir") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputArg = inv.command.firstOrNull { it.startsWith("--output-file=") } + outputArg?.removePrefix("--output-file=")?.let { outputPath -> + File("$outputPath.gz").also { f -> + f.parentFile?.mkdirs() + f.writeText("##fileformat=VCFv4.2\n") + } + } + 0 + } + + ProcessRunner.withExecutor(executor) { + MafToGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--reference-file", smallseqRoot.resolve("Ref.fa").toString(), + "--maf-file", mafDir.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + val outputArg = inv.command.first { 
it.startsWith("--output-file=") } + assertTrue( + outputArg.startsWith("--output-file=${customOutput.toAbsolutePath()}"), + "Outputs should be written to the custom --output-dir: $outputArg" + ) + assertTrue( + customOutput.resolve("gvcf_file_paths.txt").toFile().exists(), + "gvcf_file_paths.txt should be written under the custom output dir" + ) + } + + @Test + fun singleMafFileWithExplicitOutputFileNameIsHonored(@TempDir workDir: Path) { + stubBiokotlinBinary(workDir) + val mafFile = workDir.resolve("only.maf").also { + it.writeText("##maf version=1\n") + } + val customOutputName = workDir.resolve("custom_name.g.vcf") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputArg = inv.command.firstOrNull { it.startsWith("--output-file=") } + outputArg?.removePrefix("--output-file=")?.let { outputPath -> + File("$outputPath.gz").also { f -> + f.parentFile?.mkdirs() + f.writeText("##fileformat=VCFv4.2\n") + } + } + 0 + } + + ProcessRunner.withExecutor(executor) { + MafToGvcf().parse( + listOf( + "--work-dir", workDir.toString(), + "--reference-file", smallseqRoot.resolve("Ref.fa").toString(), + "--maf-file", mafFile.toString(), + "--output-file", customOutputName.toString() + ) + ) + } + + val inv = executor.invocations.single() + val outputArg = inv.command.first { it.startsWith("--output-file=") } + // biokotlin adds .gz, so the .g.vcf form should be passed in + assertTrue( + outputArg.endsWith("/custom_name.g.vcf"), + "Explicit --output-file should be respected (got: $outputArg)" + ) + } +} From c7dc8cede03f3f6c4b908bc03cb606598fd0f2dd Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 11:40:49 -0500 Subject: [PATCH 10/18] Add steps 1-4 e2e test --- .../integration/OrchestrateE2ETest.kt | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index d01aaf1..6dc12e8 
100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -7,7 +7,10 @@ import org.junit.jupiter.api.Test import org.junit.jupiter.api.io.TempDir import java.io.File import java.nio.file.Path +import kotlin.io.path.ExperimentalPathApi import kotlin.io.path.createDirectories +import kotlin.io.path.deleteRecursively +import kotlin.io.path.exists import kotlin.io.path.writeText import kotlin.test.assertTrue @@ -23,6 +26,23 @@ class OrchestrateE2ETest { private val smallseqRoot: Path = File("src/test/resources/smallseq") .absoluteFile.toPath() + /** + * Create (or reset) a stable, inspectable working directory under + * `build/test-output/` for a long-form E2E test. We deliberately avoid + * [TempDir] here so that intermediate pipeline outputs survive the + * test for post-mortem inspection. The directory is wiped on each run + * to keep the test hermetic, and `./gradlew clean` removes it. + */ + @OptIn(ExperimentalPathApi::class) + private fun persistentWorkDir(testName: String): Path { + val workDir = File("build/test-output/$testName").absoluteFile.toPath() + if (workDir.exists()) { + workDir.deleteRecursively() + } + workDir.createDirectories() + return workDir + } + @Test fun orchestrateSmallseqPipelineProducesMafAndGvcf(@TempDir workDir: Path) { // align-assemblies now drives PHGv2 internally, so we need both the @@ -70,4 +90,169 @@ class OrchestrateE2ETest { "Every gVCF referenced in gvcf_file_paths.txt must exist on disk" ) } + + /** + * Full variant-pipeline (steps 1-4) E2E: align_assemblies -> + * maf_to_gvcf -> downsample_gvcf -> convert_to_fasta. Validates that + * every step's expected outputs are produced and chained together + * correctly by the orchestrator. 
+ * + * Unlike the other E2E test, this one does NOT use [TempDir] -- it + * pins the working directory to a fixed location under `build/` so + * the intermediate outputs persist for post-mortem inspection. The + * location is logged at the start of the test and is wiped on each + * fresh run to keep the test hermetic. + */ + @Test + fun orchestrateRunsVariantPipelineStepsOneThroughFour() { + // Steps 1-4 require: PHG + AnchorWave (step 1), biokotlin-tools (step 2), + // and MLImpute (steps 3-4). The orchestrator's auto-run of + // setup-environment populates biokotlin-tools and MLImpute on first + // run; the PHGv2 binary is picked up from SEQ_SIM_PHG_DIR. + IntegrationGuard.requirePhg() + IntegrationGuard.requireAnchorwave() + + val workDir = persistentWorkDir("orchestrate-steps-1-4") + println(">>> Persisting variant-pipeline E2E outputs at: $workDir") + + val configPath = workDir.resolve("pipeline.yaml") + configPath.writeText( + """ + work_dir: "${workDir.toString()}" + + run_steps: + - align_assemblies + - maf_to_gvcf + - downsample_gvcf + - convert_to_fasta + + align_assemblies: + ref_gff: "${smallseqRoot.resolve("anchors.gff")}" + ref_fasta: "${smallseqRoot.resolve("Ref.fa")}" + query_fasta: "${smallseqRoot.resolve("queries")}" + threads: 2 + + maf_to_gvcf: + sample_name: "smallseq" + + downsample_gvcf: + rates: "0.2,0.4" + seed: 42 + keep_ref: true + min_ref_block_size: 20 + + convert_to_fasta: + missing_records_as: "asRef" + missing_genotype_as: "asN" + """.trimIndent() + ) + + Orchestrate().parse(listOf("--config", configPath.toString())) + + // --------------------------------------------------------------- + // Step 1: align_assemblies -> 01_anchorwave_results/ + // --------------------------------------------------------------- + val step1Dir = workDir.resolve("output/01_anchorwave_results").toFile() + assertTrue(step1Dir.exists() && step1Dir.isDirectory, "Step 1 output directory must exist") + val mafPaths = File(step1Dir, "maf_file_paths.txt") + 
assertTrue(mafPaths.exists() && mafPaths.length() > 0, "maf_file_paths.txt must be non-empty") + val mafFiles = mafPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(mafFiles.isNotEmpty(), "At least one MAF should be produced") + assertTrue( + mafFiles.all { it.exists() && it.length() > 0 }, + "Every MAF listed must exist on disk and be non-empty" + ) + + // --------------------------------------------------------------- + // Step 2: maf_to_gvcf -> 02_gvcf_results/ + // --------------------------------------------------------------- + val step2Dir = workDir.resolve("output/02_gvcf_results").toFile() + assertTrue(step2Dir.exists() && step2Dir.isDirectory, "Step 2 output directory must exist") + val gvcfPaths = File(step2Dir, "gvcf_file_paths.txt") + assertTrue(gvcfPaths.exists(), "gvcf_file_paths.txt must exist") + val gvcfFiles = gvcfPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(gvcfFiles.isNotEmpty(), "At least one gVCF should be listed") + assertTrue( + gvcfFiles.all { it.exists() && it.length() > 0 }, + "Every gVCF referenced in gvcf_file_paths.txt must exist on disk and be non-empty" + ) + assertTrue( + gvcfFiles.all { it.name.endsWith(".g.vcf.gz") }, + "Every gVCF must be a compressed .g.vcf.gz (biokotlin compresses by default)" + ) + + // --------------------------------------------------------------- + // Step 3: downsample_gvcf -> 03_downsample_results/ + // --------------------------------------------------------------- + val step3Dir = workDir.resolve("output/03_downsample_results").toFile() + assertTrue(step3Dir.exists() && step3Dir.isDirectory, "Step 3 output directory must exist") + val downsampledFiles = step3Dir.listFiles()?.toList().orEmpty() + val downsampledGvcfs = downsampledFiles.filter { + it.name.endsWith(".gvcf") || it.name.endsWith(".g.vcf") || + it.name.endsWith(".gvcf.gz") || it.name.endsWith(".g.vcf.gz") + } + assertTrue( + downsampledGvcfs.isNotEmpty(), + "At least one downsampled GVCF 
should be produced (got: ${downsampledFiles.map { it.name }})" + ) + assertTrue( + downsampledGvcfs.all { it.length() > 0 }, + "Every downsampled GVCF must be non-empty" + ) + + // Step 3's temp_uncompressed_gvcf staging directory should be cleaned up + // by default (keep_uncompressed is false). + val tempStaging = workDir.resolve("temp_uncompressed_gvcf").toFile() + assertTrue( + !tempStaging.exists(), + "Step 3's temp_uncompressed_gvcf dir should be cleaned up after the run" + ) + + // --------------------------------------------------------------- + // Step 4: convert_to_fasta -> 04_fasta_results/ + // --------------------------------------------------------------- + val step4Dir = workDir.resolve("output/04_fasta_results").toFile() + assertTrue(step4Dir.exists() && step4Dir.isDirectory, "Step 4 output directory must exist") + val fastaPaths = File(step4Dir, "fasta_file_paths.txt") + assertTrue(fastaPaths.exists(), "fasta_file_paths.txt must exist") + val fastaFiles = fastaPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(fastaFiles.isNotEmpty(), "At least one FASTA should be listed") + assertTrue( + fastaFiles.all { it.exists() && it.length() > 0 }, + "Every FASTA referenced in fasta_file_paths.txt must exist on disk and be non-empty" + ) + assertTrue( + fastaFiles.all { it.name.endsWith(".fasta") }, + "Every FASTA must end with .fasta" + ) + + // Step 4's temp_uncompressed_gvcf_fasta staging directory should be + // cleaned up after the run. + val step4Staging = workDir.resolve("temp_uncompressed_gvcf_fasta").toFile() + assertTrue( + !step4Staging.exists(), + "Step 4's temp_uncompressed_gvcf_fasta dir should be cleaned up after the run" + ) + + // --------------------------------------------------------------- + // Log file contract: each pipeline step writes its own log file. 
+ // --------------------------------------------------------------- + val logsDir = workDir.resolve("logs").toFile() + assertTrue(logsDir.exists() && logsDir.isDirectory, "logs/ should exist") + val logNames = logsDir.listFiles()?.map { it.name }?.toSet().orEmpty() + listOf( + "00_orchestrate.log", + "01_align_assemblies.log", + "02_maf_to_gvcf.log", + "03_downsample_gvcf.log", + "04_convert_to_fasta.log" + ).forEach { expected -> + assertTrue( + expected in logNames, + "Expected log $expected to be present in logs/; saw $logNames" + ) + } + + println(">>> Variant-pipeline E2E outputs preserved at: $workDir") + } } From b031d8b5d3511249a0920b61756e945e53fd93e2 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 16:36:30 -0500 Subject: [PATCH 11/18] Add memory fixes --- build.gradle.kts | 31 +++++++++++++++++++++++++++++++ docker/Dockerfile.dev | 37 +++++++++++++++++++++++++++++++++---- docker/docker-compose.yml | 29 +++++++++++++++++++++++++++++ docker/phg_environment.yml | 12 ++++++++++++ 4 files changed, 105 insertions(+), 4 deletions(-) diff --git a/build.gradle.kts b/build.gradle.kts index 88c52b6..9fe16ef 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -46,6 +46,19 @@ tasks.test { } } +// Heavy test tiers spawn full pipelines that shell out to subprocess +// Gradle daemons (MLImpute) and native tools (AnchorWave / minimap2 / +// python). The kernel OOM killer (exit 137 / SIGKILL) fires when total +// CONTAINER memory is exceeded -- not the JVM heap. We fork per test +// class so memory is released between classes; the heap itself is left +// at Gradle's default (which is enough for smallseq). +fun Test.applyHeavyTestConfig() { + // Each end-to-end test runs a full pipeline; isolate them so the + // JVM frees process-wide resources (classloaders, threadpools, + // native handles) between classes. 
+ setForkEvery(1L) +} + val integrationTest = tasks.register("integrationTest") { description = "Runs per-step integration tests against real external binaries (requires seq-sim-dev container)." group = "verification" @@ -53,10 +66,21 @@ val integrationTest = tasks.register("integrationTest") { testClassesDirs = sourceSets.test.get().output.classesDirs classpath = sourceSets.test.get().runtimeClasspath + // Restrict Gradle's class-file scan so `forkEvery = 1` only spawns + // JVMs for the integration test classes. Without this, the heavy + // test config forks one JVM per test class in the entire test + // source set (~20 classes), which is both wasteful and flaky -- + // any single fork's startup failure surfaces as + // "Gradle Test Executor N finished with non-zero exit value 1" + // and aborts the whole task. + include("**/*IntegrationTest.class") + useJUnitPlatform { includeTags("integration") } + applyHeavyTestConfig() + shouldRunAfter(tasks.test) outputs.upToDateWhen { false } } @@ -68,10 +92,17 @@ val e2eTest = tasks.register("e2eTest") { testClassesDirs = sourceSets.test.get().output.classesDirs classpath = sourceSets.test.get().runtimeClasspath + // See the comment on `integrationTest` above -- restrict scanning so + // `forkEvery = 1` only forks JVMs for `*E2ETest` classes instead of + // the entire test source set. + include("**/*E2ETest.class") + useJUnitPlatform { includeTags("e2e") } + applyHeavyTestConfig() + shouldRunAfter(integrationTest) outputs.upToDateWhen { false } } diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index a48f723..3786c3c 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -108,9 +108,22 @@ RUN printf '%s\n' \ # on first use, but the solver cache is already populated. 
# --------------------------------------------------------------------------- COPY src/main/resources/pixi.toml /opt/seq-sim-prebuilt/pixi.toml -RUN cd /opt/seq-sim-prebuilt && \ - pixi install --manifest-path pixi.toml || \ - echo "pixi install warmup failed; runtime will retry" +# The pixi warmup MUST succeed -- if it fails silently, every runtime test +# in a fresh workdir does a cold-cache pixi install whose solver+download +# peak can exceed the container memory limit and get SIGKILL'd (exit 137). +# Fail the image build loudly instead of hiding the regression. +# +# We point PIXI/RATTLER_CACHE_DIR at the same path the runtime container +# uses (see docker-compose.yml). That way the downloaded .conda packages +# end up in /var/cache/pixi during the image build, get baked into the +# image layer, and seed the named volume on first mount -- so the +# runtime `pixi install` is just hardlinks/copies from a warm cache. +ENV PIXI_CACHE_DIR=/var/cache/pixi +ENV RATTLER_CACHE_DIR=/var/cache/pixi +RUN mkdir -p /var/cache/pixi && \ + cd /opt/seq-sim-prebuilt && \ + pixi install --manifest-path pixi.toml && \ + test -d /opt/seq-sim-prebuilt/.pixi/envs/default # --------------------------------------------------------------------------- # Pre-download PHGv2 latest release so `setup-environment` can skip the @@ -149,7 +162,23 @@ ENV SEQ_SIM_IN_CONTAINER=1 \ WORKDIR /workspace # Allow non-root operation by default (uid is overridden from docker-compose). +# `scripts/dev.sh` sets the container user to the *host* uid (e.g. 501/503 +# on macOS) so files written into the bind-mounted repo are owned by the +# host user. That means we can't pin the pixi cache to any specific uid: +# the dev user (uid 1000) is just the in-image owner of the warmup +# artifacts, but at runtime the cache must be writable by whatever uid +# docker-compose decides to use. +# +# We therefore make /var/cache/pixi recursively world-readable/writable +# (a+rwX). 
When the named docker volume for /var/cache/pixi (see +# docker-compose.yml) is first mounted, docker seeds it from the image's +# contents at that path, so the world-writable mode propagates to the +# fresh volume and any uid can acquire pixi's lock file. New files pixi +# writes inherit the container user's umask (typically 0022), which is +# fine as long as a single host uid uses a given volume -- and the +# volume gets recreated whenever the host user changes. RUN useradd -m -u 1000 -s /bin/bash dev && \ - chown -R dev:dev /opt/seq-sim-prebuilt + chown -R dev:dev /opt/seq-sim-prebuilt /var/cache/pixi && \ + chmod -R a+rwX /var/cache/pixi CMD ["/bin/bash"] diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 7179a17..262dffb 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -10,6 +10,19 @@ services: # On macOS Docker Desktop the bind mount translates these ids for you; # on Linux this matches the invoker's uid (see scripts/dev.sh). user: "${SEQ_SIM_UID:-1000}:${SEQ_SIM_GID:-1000}" + # End-to-end tests run a full pipeline: the test JVM forks MLImpute's + # Gradle daemon, biokotlin-tools' JVM, AnchorWave/minimap2 natives, + # python (pysam/CrossMap), and pixi shims. Combined RSS comfortably + # exceeds 4 GB during the heaviest steps. On macOS Docker Desktop the + # container also inherits whatever VM memory the user set in + # Preferences -> Resources, so we explicitly raise the per-container + # limit here to surface a clear error (the kernel OOM killer otherwise + # SIGKILLs the test JVM and reports it as `exit 137`). + # + # Override with SEQ_SIM_MEM_LIMIT=8g (or larger) when running + # `scripts/dev.sh e2e` on a memory-constrained host. 
+ mem_limit: ${SEQ_SIM_MEM_LIMIT:-16g} + memswap_limit: ${SEQ_SIM_MEM_LIMIT:-16g} environment: - SEQ_SIM_IN_CONTAINER=1 - SEQ_SIM_SKIP_PHG_SETUP=1 @@ -17,6 +30,17 @@ services: - SEQ_SIM_SKIP_PIXI_PREFIX=1 - GRADLE_USER_HOME=/workspace/.gradle-container - HOME=/workspace/.home-container + # Keep pixi's package cache off the macOS-side bind mount. Docker + # Desktop on macOS occasionally returns EINVAL (os error 22) for + # hardlink operations inside virtiofs/gRPC-FUSE bind mounts, which + # breaks `pixi install` mid-link when it hardlinks from the cache + # into /.pixi/envs/.... The named volume below is backed + # by the docker storage driver (overlayfs on Linux VMs), where + # hardlinks behave correctly. It also persists across the + # `compose run --rm` container recreations so we don't re-download + # packages on every `scripts/dev.sh e2e` invocation. + - PIXI_CACHE_DIR=/var/cache/pixi + - RATTLER_CACHE_DIR=/var/cache/pixi volumes: # Single bind mount covers everything: the repo itself plus the # .gradle-container/ and .home-container/ subdirs used as caches. @@ -25,5 +49,10 @@ services: # - first-run gradle wrapper downloads don't hit "permission denied" # - tearing down (./scripts/dev.sh clean) leaves the caches visible - ..:/workspace + # Named volume for the pixi/rattler package cache (see env vars above). + - pixi-cache:/var/cache/pixi tty: true stdin_open: true + +volumes: + pixi-cache: diff --git a/docker/phg_environment.yml b/docker/phg_environment.yml index 3187f60..cc22f33 100644 --- a/docker/phg_environment.yml +++ b/docker/phg_environment.yml @@ -19,3 +19,15 @@ dependencies: - agc>=3.1 - ropebwt3>=3.8 - minimap2>=2.28 + # Python deps needed by the MLImpute scripts that back pick_crossovers, + # convert_coordinates, and generate_recombined_sequences. They live in + # the seq-sim pixi env, but inside the container SEQ_SIM_SKIP_PIXI_PREFIX=1 + # routes those calls through this conda env directly, so the deps must + # be present here too. 
+ - numpy + - pandas + - pysam + - crossmap + # Native CLI deps used by create_chain_files / format_recombined_fastas. + - seqkit + - parallel From 535d450b84beeab0f358ae677e982e59d09c1e3c Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 16:36:48 -0500 Subject: [PATCH 12/18] Fix pixi permission issues --- scripts/dev.sh | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/scripts/dev.sh b/scripts/dev.sh index bb9ad9e..c812af2 100755 --- a/scripts/dev.sh +++ b/scripts/dev.sh @@ -53,6 +53,49 @@ ensure_image() { fi } +# The pixi-cache named volume is seeded from the image's /var/cache/pixi +# the first time it is mounted, and from then on its perms persist for +# the lifetime of the volume. We always run the container as the host +# uid (`id -u`), so a volume seeded by an older image (which baked the +# cache as uid 1000) is unwritable from the host uid and breaks +# `pixi install` with "Permission denied" on the repodata lock. Detect +# that mismatch and refuse to proceed until the user nukes the volume. +ensure_pixi_cache_writable() { + local volume_name + volume_name="$($COMPOSE -f "$COMPOSE_FILE" config --format json 2>/dev/null \ + | jq -r '.volumes["pixi-cache"].name // empty' 2>/dev/null)" + if [ -z "$volume_name" ]; then + # Fall back to the default `<project>_<volume>` naming. + volume_name="docker_pixi-cache" + fi + if ! docker volume inspect "$volume_name" >/dev/null 2>&1; then + return 0 + fi + # Probe the volume by trying to create a sentinel file as the host uid. + if docker run --rm \ + --user "${SEQ_SIM_UID}:${SEQ_SIM_GID}" \ + -v "${volume_name}:/var/cache/pixi" \ + alpine:3.20 sh -c 'touch /var/cache/pixi/.seq-sim-write-probe && rm /var/cache/pixi/.seq-sim-write-probe' \ + >/dev/null 2>&1 + then + return 0 + fi + cat >&2 <<EOF + ERROR: the docker volume "${volume_name}" is not writable by uid ${SEQ_SIM_UID}. + This usually means it was seeded by an older image that pinned the pixi + cache to uid 1000. 
The current image marks /var/cache/pixi world-writable, + but named volumes preserve their content (and perms) across rebuilds. + + Recreate the volume and rebuild the image: + + docker compose -f ${COMPOSE_FILE} down -v + scripts/dev.sh build + + Then re-run your command. +EOF + exit 1 +} + usage() { sed -n '2,24p' "$0" } @@ -81,14 +124,17 @@ case "$cmd" in ;; integration|int) ensure_image + ensure_pixi_cache_writable run_in_container bash -c "./gradlew integrationTest $*" ;; e2e|smoke) ensure_image + ensure_pixi_cache_writable run_in_container bash -c "./gradlew e2eTest $*" ;; all) ensure_image + ensure_pixi_cache_writable run_in_container bash -c "./gradlew test integrationTest e2eTest $*" ;; run) From 78609b6bc935fa586dd20ed6a46292cf6cfbcbc9 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 16:37:17 -0500 Subject: [PATCH 13/18] Add tests --- .../commands/ConvertCoordinatesUnitTest.kt | 261 ++++++++++++++++ .../commands/CreateChainFilesUnitTest.kt | 217 +++++++++++++ .../FormatRecombinedFastasUnitTest.kt | 212 +++++++++++++ .../GenerateRecombinedSequencesUnitTest.kt | 285 ++++++++++++++++++ .../commands/PickCrossoversUnitTest.kt | 218 ++++++++++++++ .../integration/IntegrationGuard.kt | 52 ++++ .../integration/OrchestrateE2ETest.kt | 256 +++++++++++----- 7 files changed, 1429 insertions(+), 72 deletions(-) create mode 100644 src/test/kotlin/net/maizegenetics/commands/ConvertCoordinatesUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/CreateChainFilesUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/FormatRecombinedFastasUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/GenerateRecombinedSequencesUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/PickCrossoversUnitTest.kt diff --git a/src/test/kotlin/net/maizegenetics/commands/ConvertCoordinatesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/ConvertCoordinatesUnitTest.kt new file mode 100644 
index 0000000..24ea3d0 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/ConvertCoordinatesUnitTest.kt @@ -0,0 +1,261 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [ConvertCoordinates] (step 07 of the recombination pipeline). + * The command shells out to MLImpute's `convert_coords.py` via + * `pixi run sh -c "..."`. The tests verify the constructed command line, + * the refkey-staging side effect into the output dir, and the + * key/founder-key path files that downstream steps depend on. + */ +class ConvertCoordinatesUnitTest { + + // ConvertCoordinates builds a `pixi run sh -c ` command. + // Snapshot, force off for the test body, restore in @AfterEach. + private val originalSkipPixi = ProcessRunner.skipPixiPrefix + + @BeforeEach + fun disablePixiStripping() { + ProcessRunner.skipPixiPrefix = false + } + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + ProcessRunner.skipPixiPrefix = originalSkipPixi + } + + /** + * Create the fake MLImpute layout including the python package and + * `convert_coords.py` so the command's existence check passes. 
+ */ + private fun stubMlimputePythonScript(workDir: Path): Path { + val scriptDir = workDir.resolve("src/MLImpute/src/python/cross") + scriptDir.createDirectories() + val script = scriptDir.resolve("convert_coords.py") + script.writeText("#!/usr/bin/env python\n") + return script + } + + private fun writeAssemblyList(path: Path, assemblies: List<String>) { + path.writeText( + assemblies.joinToString("\n") { name -> "/some/path/$name.fa\t$name" } + ) + } + + /** + * Drop `<name>_refkey.bed` files into [dir] so the command's refkey + * staging step (which copies them into the output dir before running + * the python script) has something to act on. + */ + private fun stubRefkeyDir(dir: Path, assemblies: List<String>) { + dir.createDirectories() + assemblies.forEach { name -> + dir.resolve("${name}_refkey.bed").writeText("# refkey for $name\n") + } + } + + /** + * Simulate convert_coords.py: for each `<name>_refkey.bed` in the + * working dir, write a `<name>_key.bed` and additionally produce a + * single founder key `<N>_key.bed` for every pair of assemblies. + */ + private fun convertCoordsSimulator(): (RecordingProcessExecutor.Invocation) -> Int = { inv -> + val outDir = inv.workingDir + outDir?.mkdirs() + val refkeyBeds = outDir?.listFiles { f -> f.name.endsWith("_refkey.bed") } + ?.toList().orEmpty() + refkeyBeds.forEachIndexed { idx, refkey -> + val name = refkey.name.removeSuffix("_refkey.bed") + File(outDir, "${name}_key.bed").writeText("# key for $name\n") + // Write a founder key for every other assembly so we exercise both + // path-file branches. 
+ if (idx % 2 == 0) { + val founder = idx / 2 + File(outDir, "${founder}_key.bed").writeText("# founder key $founder\n") + } + } + 0 + } + + @Test + fun convertCoordsInvokesPixiShellOnceWithExpectedArgs(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB")) + + val chainDir = workDir.resolve("chains").also { it.createDirectories() } + chainDir.resolve("LineA_subsampled.chain").writeText("") + + val refkeyDir = workDir.resolve("refkeys") + stubRefkeyDir(refkeyDir, listOf("LineA", "LineB")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = convertCoordsSimulator()) + + ProcessRunner.withExecutor(executor) { + ConvertCoordinates().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chain-dir", chainDir.toString(), + "--refkey-dir", refkeyDir.toString() + ) + ) + } + + assertEquals(1, executor.invocations.size, "convert_coords.py should be invoked exactly once") + val inv = executor.invocations.single() + + assertEquals("pixi", inv.command[0]) + assertEquals("run", inv.command[1]) + assertEquals("sh", inv.command[2]) + assertEquals("-c", inv.command[3]) + val shellCommand = inv.command[4] + + val expectedPythonPath = workDir.resolve("src/MLImpute/src").toAbsolutePath().toString() + assertTrue( + shellCommand.contains("PYTHONPATH='$expectedPythonPath'"), + "Shell command should export PYTHONPATH=$expectedPythonPath; got: $shellCommand" + ) + + val expectedScript = workDir.resolve("src/MLImpute/src/python/cross/convert_coords.py") + .toAbsolutePath().toString() + assertTrue( + shellCommand.contains("python '$expectedScript'"), + "Shell command should invoke convert_coords.py; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--assembly-list '${assemblyList.toAbsolutePath()}'"), + "Shell command should pass --assembly-list; got: $shellCommand" + ) + 
assertTrue( + shellCommand.contains("--chain-dir '${chainDir.toAbsolutePath()}'"), + "Shell command should pass --chain-dir; got: $shellCommand" + ) + + val expectedOutputDir = workDir.resolve("output/07_coordinates_results").toFile().absoluteFile + assertEquals(expectedOutputDir, inv.workingDir?.absoluteFile) + } + + @Test + fun refkeyBedFilesAreStagedIntoOutputDirBeforeScriptRuns(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB")) + val chainDir = workDir.resolve("chains").also { it.createDirectories() } + val refkeyDir = workDir.resolve("refkeys") + stubRefkeyDir(refkeyDir, listOf("LineA", "LineB")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = convertCoordsSimulator()) + + ProcessRunner.withExecutor(executor) { + ConvertCoordinates().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chain-dir", chainDir.toString(), + "--refkey-dir", refkeyDir.toString() + ) + ) + } + + val outputDir = workDir.resolve("output/07_coordinates_results").toFile() + val stagedRefkeys = outputDir.listFiles { f -> f.name.endsWith("_refkey.bed") }!!.map { it.name }.toSet() + assertEquals( + setOf("LineA_refkey.bed", "LineB_refkey.bed"), + stagedRefkeys, + "Refkey BEDs should be copied into the output dir before convert_coords.py runs" + ) + } + + @Test + fun keyAndFounderKeyPathFilesAreWrittenForGeneratedBeds(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB", "LineC", "LineD")) + val chainDir = workDir.resolve("chains").also { it.createDirectories() } + val refkeyDir = workDir.resolve("refkeys") + stubRefkeyDir(refkeyDir, listOf("LineA", "LineB", "LineC", "LineD")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = 
convertCoordsSimulator()) + + ProcessRunner.withExecutor(executor) { + ConvertCoordinates().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chain-dir", chainDir.toString(), + "--refkey-dir", refkeyDir.toString() + ) + ) + } + + val outputDir = workDir.resolve("output/07_coordinates_results").toFile() + + val keyPathsFile = File(outputDir, "key_file_paths.txt") + assertTrue(keyPathsFile.exists(), "key_file_paths.txt should be written for assembly key BEDs") + val keyLines = keyPathsFile.readLines().filter { it.isNotBlank() } + assertEquals(4, keyLines.size, "One assembly key path per assembly") + assertTrue( + keyLines.all { it.endsWith("_key.bed") && !File(it).name.matches(Regex("^\\d+_key\\.bed$")) }, + "Assembly key paths should look like _key.bed; got: $keyLines" + ) + + val founderPathsFile = File(outputDir, "founder_key_file_paths.txt") + assertTrue( + founderPathsFile.exists(), + "founder_key_file_paths.txt should be written for N_key.bed founders" + ) + val founderLines = founderPathsFile.readLines().filter { it.isNotBlank() } + assertTrue(founderLines.isNotEmpty(), "At least one founder key should be listed") + assertTrue( + founderLines.all { File(it).name.matches(Regex("^\\d+_key\\.bed$")) }, + "Founder keys must match N_key.bed; got: $founderLines" + ) + } + + @Test + fun customOutputDirIsHonored(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB")) + val chainDir = workDir.resolve("chains").also { it.createDirectories() } + val refkeyDir = workDir.resolve("refkeys") + stubRefkeyDir(refkeyDir, listOf("LineA", "LineB")) + val customOutput = workDir.resolve("custom_coords_out") + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = convertCoordsSimulator()) + + ProcessRunner.withExecutor(executor) { + ConvertCoordinates().parse( + listOf( + "--work-dir", 
workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chain-dir", chainDir.toString(), + "--refkey-dir", refkeyDir.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals(customOutput.toFile().absoluteFile, inv.workingDir?.absoluteFile) + assertTrue( + customOutput.resolve("key_file_paths.txt").toFile().exists(), + "key_file_paths.txt should be written under the custom output dir" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/CreateChainFilesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/CreateChainFilesUnitTest.kt new file mode 100644 index 0000000..cf96b96 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/CreateChainFilesUnitTest.kt @@ -0,0 +1,217 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [CreateChainFiles] (step 06 of the recombination pipeline). + * The command shells out to MLImpute's `create_chains.sh` via `bash` -- + * no `pixi run` prefix here, so we can verify the exact argv directly. + * + * The test also asserts the rename-to-`_subsampled.chain` invariant and + * the temp-MAF-dir cleanup behaviour that downstream steps rely on. + */ +class CreateChainFilesUnitTest { + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Create the fake MLImpute layout with `create_chains.sh` present so + * the command's existence check passes. 
+ */ + private fun stubMlimputeChainScript(workDir: Path): Path { + val scriptDir = workDir.resolve("src/MLImpute/src/python/cross") + scriptDir.createDirectories() + val script = scriptDir.resolve("create_chains.sh") + script.writeText("#!/bin/sh\nexit 0\n") + script.toFile().setExecutable(true) + return script + } + + /** + * Drop a few stub `.maf` files into [dir] so the command's + * `collectMafFiles()` has something to iterate over. + */ + private fun stubMafFiles(dir: Path, names: List<String>): List<Path> { + dir.createDirectories() + return names.map { name -> + val f = dir.resolve(name) + f.writeText("##maf version=1\n") + f + } + } + + /** + * Hook helper that simulates create_chains.sh by writing a `.chain` + * file in the output directory for every MAF file staged into the + * temporary input directory. + */ + private fun createChainsSimulator(): (RecordingProcessExecutor.Invocation) -> Int = { inv -> + val tempInputDir = inv.command.dropWhile { it != "-i" }.getOrNull(1)?.let { File(it) } + val outDir = inv.command.dropWhile { it != "-o" }.getOrNull(1)?.let { File(it) } + outDir?.mkdirs() + tempInputDir?.listFiles { f -> f.name.endsWith(".maf") }?.forEach { mafFile -> + val chainName = mafFile.nameWithoutExtension + ".chain" + File(outDir, chainName).writeText("# chain for ${mafFile.name}\n") + } + 0 + } + + @Test + fun bashScriptIsInvokedWithExpectedArgs(@TempDir workDir: Path) { + val bashScript = stubMlimputeChainScript(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf", "LineB.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = createChainsSimulator()) + + ProcessRunner.withExecutor(executor) { + CreateChainFiles().parse( + listOf( + "--work-dir", workDir.toString(), + "--maf-input", mafDir.toString(), + "--jobs", "4" + ) + ) + } + + assertEquals(1, executor.invocations.size, "create_chains.sh should be invoked exactly once") + val inv = executor.invocations.single() + + assertEquals("bash", 
inv.command[0]) + assertEquals(bashScript.toString(), inv.command[1]) + assertEquals("4", inv.argAfter("-j")) + + // -i must point at a temp dir under the output dir + val outputDir = workDir.resolve("output/06_chain_results") + assertEquals(outputDir.toAbsolutePath().toString(), inv.argAfter("-o")) + + val tempInputArg = inv.argAfter("-i")!! + assertTrue( + tempInputArg.startsWith(outputDir.toAbsolutePath().toString()), + "-i should point under the output directory (got: $tempInputArg)" + ) + // The command sets workingDir to the (parent) work directory. + assertEquals(workDir.toFile().absoluteFile, inv.workingDir?.absoluteFile) + } + + @Test + fun jobsDefaultsToEightWhenNotProvided(@TempDir workDir: Path) { + stubMlimputeChainScript(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = createChainsSimulator()) + + ProcessRunner.withExecutor(executor) { + CreateChainFiles().parse( + listOf( + "--work-dir", workDir.toString(), + "--maf-input", mafDir.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("8", inv.argAfter("-j")) + } + + @Test + fun tempMafDirectoryIsCleanedUpAfterRun(@TempDir workDir: Path) { + stubMlimputeChainScript(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = createChainsSimulator()) + + ProcessRunner.withExecutor(executor) { + CreateChainFiles().parse( + listOf( + "--work-dir", workDir.toString(), + "--maf-input", mafDir.toString() + ) + ) + } + + val tempMafDir = workDir.resolve("output/06_chain_results/temp_maf_files").toFile() + assertTrue( + !tempMafDir.exists(), + "temp_maf_files should be removed after the run (got existing dir at $tempMafDir)" + ) + } + + @Test + fun chainFilesAreRenamedToSubsampledAndListedInPathsFile(@TempDir workDir: Path) { + stubMlimputeChainScript(workDir) 
+ val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf", "LineB.maf", "LineC.maf")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = createChainsSimulator()) + + ProcessRunner.withExecutor(executor) { + CreateChainFiles().parse( + listOf( + "--work-dir", workDir.toString(), + "--maf-input", mafDir.toString() + ) + ) + } + + val outputDir = workDir.resolve("output/06_chain_results").toFile() + val chainFiles = outputDir.listFiles { f -> f.name.endsWith(".chain") }!! + assertEquals(3, chainFiles.size, "One chain per MAF should be produced") + assertTrue( + chainFiles.all { it.name.endsWith("_subsampled.chain") }, + "Chain files should be renamed with _subsampled suffix; got: ${chainFiles.map { it.name }}" + ) + + val pathsFile = outputDir.resolve("chain_file_paths.txt") + assertTrue(pathsFile.exists(), "chain_file_paths.txt should be written") + val lines = pathsFile.readLines().filter { it.isNotBlank() } + assertEquals(3, lines.size, "One line per chain file") + assertTrue( + lines.all { it.endsWith("_subsampled.chain") }, + "Every listed path should end with _subsampled.chain" + ) + } + + @Test + fun customOutputDirIsHonored(@TempDir workDir: Path) { + stubMlimputeChainScript(workDir) + val mafDir = workDir.resolve("mafs") + stubMafFiles(mafDir, listOf("LineA.maf")) + val customOutput = workDir.resolve("custom_chain_out") + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = createChainsSimulator()) + + ProcessRunner.withExecutor(executor) { + CreateChainFiles().parse( + listOf( + "--work-dir", workDir.toString(), + "--maf-input", mafDir.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals(customOutput.toAbsolutePath().toString(), inv.argAfter("-o")) + assertTrue( + customOutput.resolve("chain_file_paths.txt").toFile().exists(), + "chain_file_paths.txt should be written under the custom output dir" + ) + } +} diff --git 
a/src/test/kotlin/net/maizegenetics/commands/FormatRecombinedFastasUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/FormatRecombinedFastasUnitTest.kt new file mode 100644 index 0000000..2acf78a --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/FormatRecombinedFastasUnitTest.kt @@ -0,0 +1,212 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [FormatRecombinedFastas] (step 09 of the recombination + * pipeline). The command shells out to `seqkit seq` via `pixi run` once + * per input FASTA; we install a [RecordingProcessExecutor] and verify + * the constructed command line, the auto-detection of the previous + * step's `recombinate_fastas/` directory, and the per-step + * `formatted_fasta_paths.txt` output contract. + */ +class FormatRecombinedFastasUnitTest { + + // FormatRecombinedFastas builds `pixi run seqkit seq ...`. Disable the + // dev-container's `pixi run` stripping so we can assert on it. 
+ private val originalSkipPixi = ProcessRunner.skipPixiPrefix + + @BeforeEach + fun disablePixiStripping() { + ProcessRunner.skipPixiPrefix = false + } + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + ProcessRunner.skipPixiPrefix = originalSkipPixi + } + + private fun stubFastas(dir: Path, names: List<String>): List<Path> { + dir.createDirectories() + return names.map { name -> + val f = dir.resolve("$name.fa") + f.writeText(">1\nACGTACGTACGTACGTACGTACGTACGTACGT\n") + f + } + } + + @Test + fun seqkitIsInvokedOncePerFastaWithExpectedArgs(@TempDir workDir: Path) { + val fastaDir = workDir.resolve("recombined").also { it.createDirectories() } + stubFastas(fastaDir, listOf("0", "1")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + FormatRecombinedFastas().parse( + listOf( + "--work-dir", workDir.toString(), + "--fasta-input", fastaDir.toString(), + "--line-width", "80", + "--threads", "4" + ) + ) + } + + assertEquals(2, executor.invocations.size, "seqkit should be invoked once per FASTA file") + + val inv = executor.invocations.first() + assertEquals("pixi", inv.command[0]) + assertEquals("run", inv.command[1]) + assertEquals("seqkit", inv.command[2]) + assertEquals("seq", inv.command[3]) + assertEquals("80", inv.argAfter("-w")) + assertEquals("4", inv.argAfter("-j")) + + // The last positional arg is the FASTA path; verify both invocations + // covered the staged inputs. + val inputArgs = executor.invocations.map { it.command.last() }.toSet() + assertEquals( + setOf( + fastaDir.resolve("0.fa").toString(), + fastaDir.resolve("1.fa").toString() + ), + inputArgs + ) + + // Outputs are redirected via outputFile to the per-step output dir. 
+ val expectedOutDir = workDir.resolve("output/09_formatted_fastas").toFile().absoluteFile + executor.invocations.forEach { invocation -> + assertEquals( + expectedOutDir, + invocation.outputFile?.parentFile?.absoluteFile, + "seqkit output should be redirected under $expectedOutDir" + ) + assertTrue( + invocation.outputFile?.name?.endsWith(".fa") == true, + "Output filename should preserve the .fa extension; got: ${invocation.outputFile?.name}" + ) + } + } + + @Test + fun defaultLineWidthAndThreadsAreApplied(@TempDir workDir: Path) { + val fastaDir = workDir.resolve("recombined").also { it.createDirectories() } + stubFastas(fastaDir, listOf("0")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + FormatRecombinedFastas().parse( + listOf( + "--work-dir", workDir.toString(), + "--fasta-input", fastaDir.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("60", inv.argAfter("-w"), "Default line width should be 60") + assertEquals("8", inv.argAfter("-j"), "Default thread count should be 8") + } + + @Test + fun inputIsAutoDetectedFromStep08OutputWhenOmitted(@TempDir workDir: Path) { + // Simulate step 8's output directory at the expected location. 
+ val recombinedFastasDir = workDir.resolve("output/08_recombined_sequences/recombinate_fastas") + stubFastas(recombinedFastasDir, listOf("0", "1", "2")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + FormatRecombinedFastas().parse( + listOf( + "--work-dir", workDir.toString() + // --fasta-input deliberately omitted + ) + ) + } + + assertEquals( + 3, + executor.invocations.size, + "All three FASTAs from auto-detected step-8 output should be processed" + ) + val inputArgs = executor.invocations.map { it.command.last() }.toSet() + val expectedInputs = (0..2).map { recombinedFastasDir.resolve("$it.fa").toString() }.toSet() + assertEquals(expectedInputs, inputArgs, "Auto-detected inputs should match step-8 outputs") + } + + @Test + fun formattedFastaPathsTextFileListsSuccessfullyFormattedFastas(@TempDir workDir: Path) { + val fastaDir = workDir.resolve("recombined").also { it.createDirectories() } + stubFastas(fastaDir, listOf("0", "1", "2")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + FormatRecombinedFastas().parse( + listOf( + "--work-dir", workDir.toString(), + "--fasta-input", fastaDir.toString() + ) + ) + } + + val pathsFile = workDir.resolve("output/09_formatted_fastas/formatted_fasta_paths.txt").toFile() + assertTrue(pathsFile.exists(), "formatted_fasta_paths.txt should be written") + val lines = pathsFile.readLines().filter { it.isNotBlank() } + assertEquals(3, lines.size, "One formatted FASTA path per input") + assertTrue( + lines.all { it.endsWith(".fa") }, + "Every listed path should end with .fa; got: $lines" + ) + assertTrue( + lines.all { File(it).exists() }, + "Every listed FASTA should exist on disk (RecordingProcessExecutor pre-creates outputs)" + ) + } + + @Test + fun customOutputDirIsHonored(@TempDir workDir: Path) { + val fastaDir = workDir.resolve("recombined").also { it.createDirectories() } + stubFastas(fastaDir, 
listOf("0")) + val customOutput = workDir.resolve("custom_formatted_out") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + + ProcessRunner.withExecutor(executor) { + FormatRecombinedFastas().parse( + listOf( + "--work-dir", workDir.toString(), + "--fasta-input", fastaDir.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals( + customOutput.toFile().absoluteFile, + inv.outputFile?.parentFile?.absoluteFile, + "seqkit output should land in the custom output dir" + ) + assertTrue( + customOutput.resolve("formatted_fasta_paths.txt").toFile().exists(), + "formatted_fasta_paths.txt should be written under the custom output dir" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/GenerateRecombinedSequencesUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/GenerateRecombinedSequencesUnitTest.kt new file mode 100644 index 0000000..b72732b --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/GenerateRecombinedSequencesUnitTest.kt @@ -0,0 +1,285 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Files +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.exists +import kotlin.io.path.isSymbolicLink +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [GenerateRecombinedSequences] (step 08 of the + * recombination pipeline). The command shells out to MLImpute's + * `write_fastas.py` via `pixi run sh -c "..."`. 
The tests verify the + * constructed command line, that founder key BEDs are staged into the + * output dir, that `.fasta`/`.fna` parents get `.fa` symlinks created + * for them automatically, and the `recombined_fasta_paths.txt` output + * contract. + */ +class GenerateRecombinedSequencesUnitTest { + + // GenerateRecombinedSequences uses `pixi run sh -c "..."`. Snapshot + // skipPixiPrefix and force it off so we can assert the full command. + private val originalSkipPixi = ProcessRunner.skipPixiPrefix + + @BeforeEach + fun disablePixiStripping() { + ProcessRunner.skipPixiPrefix = false + } + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + ProcessRunner.skipPixiPrefix = originalSkipPixi + } + + private fun stubMlimputePythonScript(workDir: Path): Path { + val scriptDir = workDir.resolve("src/MLImpute/src/python/cross") + scriptDir.createDirectories() + val script = scriptDir.resolve("write_fastas.py") + script.writeText("#!/usr/bin/env python\n") + return script + } + + /** + * Write a minimal assembly list at [path] referencing the given + * assembly names with `<path>\t<name>` formatting. + */ + private fun writeAssemblyList(path: Path, assemblyDir: Path, names: List<String>) { + path.writeText( + names.joinToString("\n") { name -> + "${assemblyDir.resolve("$name.fa").toAbsolutePath()}\t$name" + } + ) + } + + /** + * Drop `<id>_key.bed` files (founder keys) into [dir] so the command's + * existence check passes. The command requires AT LEAST one file + * matching the regex `^\d+_key\.bed$`. + */ + private fun stubFounderKeyDir(dir: Path, founderIds: List<Int>) { + dir.createDirectories() + founderIds.forEach { id -> + dir.resolve("${id}_key.bed").writeText("# founder key $id\n") + } + } + + /** + * Simulate write_fastas.py by producing one recombined FASTA per + * founder key file inside the `recombinate_fastas/` subdirectory of + * the script's working directory (which the command sets to the + * step's output dir). 
+ */ + private fun writeFastasSimulator(): (RecordingProcessExecutor.Invocation) -> Int = { inv -> + val outDir = inv.workingDir + outDir?.mkdirs() + val recombinedDir = File(outDir, "recombinate_fastas").also { it.mkdirs() } + val founderKeys = outDir?.listFiles { f -> f.name.matches(Regex("^\\d+_key\\.bed$")) } + ?.toList().orEmpty() + founderKeys.forEach { fk -> + val founderId = fk.name.removeSuffix("_key.bed") + File(recombinedDir, "$founderId.fa").writeText(">$founderId\nACGT\n") + } + 0 + } + + @Test + fun writeFastasInvokesPixiShellOnceWithExpectedArgs(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyDir = workDir.resolve("assemblies").also { it.createDirectories() } + listOf("LineA", "LineB").forEach { name -> + assemblyDir.resolve("$name.fa").writeText(">1\nACGT\n") + } + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, assemblyDir, listOf("LineA", "LineB")) + val chromosomeList = workDir.resolve("chromosomes.txt").also { it.writeText("1\n") } + val founderKeyDir = workDir.resolve("founder_keys") + stubFounderKeyDir(founderKeyDir, listOf(0, 1)) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = writeFastasSimulator()) + + ProcessRunner.withExecutor(executor) { + GenerateRecombinedSequences().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chromosome-list", chromosomeList.toString(), + "--assembly-dir", assemblyDir.toString(), + "--founder-key-dir", founderKeyDir.toString() + ) + ) + } + + assertEquals(1, executor.invocations.size, "write_fastas.py should be invoked exactly once") + val inv = executor.invocations.single() + + assertEquals("pixi", inv.command[0]) + assertEquals("run", inv.command[1]) + assertEquals("sh", inv.command[2]) + assertEquals("-c", inv.command[3]) + val shellCommand = inv.command[4] + + val expectedPythonPath = workDir.resolve("src/MLImpute/src").toAbsolutePath().toString() + 
assertTrue( + shellCommand.contains("PYTHONPATH='$expectedPythonPath'"), + "Shell command should export PYTHONPATH=$expectedPythonPath; got: $shellCommand" + ) + + val expectedScript = workDir.resolve("src/MLImpute/src/python/cross/write_fastas.py") + .toAbsolutePath().toString() + assertTrue( + shellCommand.contains("python '$expectedScript'"), + "Shell command should invoke write_fastas.py; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--assembly-list '${assemblyList.toAbsolutePath()}'"), + "Shell command should pass --assembly-list; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--chromosome-list '${chromosomeList.toAbsolutePath()}'"), + "Shell command should pass --chromosome-list; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--assembly-dir '${assemblyDir.toAbsolutePath()}'"), + "Shell command should pass --assembly-dir; got: $shellCommand" + ) + + val expectedOutputDir = workDir.resolve("output/08_recombined_sequences").toFile().absoluteFile + assertEquals(expectedOutputDir, inv.workingDir?.absoluteFile) + } + + @Test + fun founderKeyFilesAreStagedIntoOutputDirBeforeScriptRuns(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyDir = workDir.resolve("assemblies").also { it.createDirectories() } + assemblyDir.resolve("LineA.fa").writeText(">1\nACGT\n") + assemblyDir.resolve("LineB.fa").writeText(">1\nACGT\n") + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, assemblyDir, listOf("LineA", "LineB")) + val chromosomeList = workDir.resolve("chromosomes.txt").also { it.writeText("1\n") } + val founderKeyDir = workDir.resolve("founder_keys") + stubFounderKeyDir(founderKeyDir, listOf(0, 1, 2)) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = writeFastasSimulator()) + + ProcessRunner.withExecutor(executor) { + GenerateRecombinedSequences().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", 
assemblyList.toString(), + "--chromosome-list", chromosomeList.toString(), + "--assembly-dir", assemblyDir.toString(), + "--founder-key-dir", founderKeyDir.toString() + ) + ) + } + + val outputDir = workDir.resolve("output/08_recombined_sequences").toFile() + val stagedKeys = outputDir.listFiles { f -> f.name.matches(Regex("^\\d+_key\\.bed$")) }!! + .map { it.name }.toSet() + assertEquals( + setOf("0_key.bed", "1_key.bed", "2_key.bed"), + stagedKeys, + "Founder key BEDs should be copied into the output dir before write_fastas.py runs" + ) + } + + @Test + fun fastaSymlinksAreCreatedForFastaAndFnaParents(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyDir = workDir.resolve("assemblies").also { it.createDirectories() } + // The Python script wants .fa, but the parent assemblies live with + // .fasta / .fna extensions. The command must create symlinks so the + // script can find them. + assemblyDir.resolve("LineA.fasta").writeText(">1\nACGT\n") + assemblyDir.resolve("LineB.fna").writeText(">1\nACGT\n") + // LineA also has a .fai sidecar; the symlink for it should also be created. 
+ assemblyDir.resolve("LineA.fasta.fai").writeText("1\t4\t3\t60\t61\n") + + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, assemblyDir, listOf("LineA", "LineB")) + val chromosomeList = workDir.resolve("chromosomes.txt").also { it.writeText("1\n") } + val founderKeyDir = workDir.resolve("founder_keys") + stubFounderKeyDir(founderKeyDir, listOf(0)) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = writeFastasSimulator()) + + ProcessRunner.withExecutor(executor) { + GenerateRecombinedSequences().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chromosome-list", chromosomeList.toString(), + "--assembly-dir", assemblyDir.toString(), + "--founder-key-dir", founderKeyDir.toString() + ) + ) + } + + val lineAFa = assemblyDir.resolve("LineA.fa") + val lineBFa = assemblyDir.resolve("LineB.fa") + assertTrue(lineAFa.exists(), "LineA.fa symlink should be created from LineA.fasta") + assertTrue(lineAFa.isSymbolicLink(), "LineA.fa should be a symlink") + assertTrue(lineBFa.exists(), "LineB.fa symlink should be created from LineB.fna") + assertTrue(lineBFa.isSymbolicLink(), "LineB.fa should be a symlink") + + val lineAFai = assemblyDir.resolve("LineA.fa.fai") + assertTrue(lineAFai.exists(), "LineA.fa.fai sidecar symlink should also be created") + assertTrue( + Files.isSymbolicLink(lineAFai), + "LineA.fa.fai should be a symlink to LineA.fasta.fai" + ) + } + + @Test + fun recombinedFastaPathsTextFileListsGeneratedFastas(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyDir = workDir.resolve("assemblies").also { it.createDirectories() } + listOf("LineA", "LineB").forEach { name -> + assemblyDir.resolve("$name.fa").writeText(">1\nACGT\n") + } + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, assemblyDir, listOf("LineA", "LineB")) + val chromosomeList = workDir.resolve("chromosomes.txt").also { 
it.writeText("1\n") } + val founderKeyDir = workDir.resolve("founder_keys") + stubFounderKeyDir(founderKeyDir, listOf(0, 1, 2, 3)) + + val executor = RecordingProcessExecutor(defaultExitCode = 0, onInvoke = writeFastasSimulator()) + + ProcessRunner.withExecutor(executor) { + GenerateRecombinedSequences().parse( + listOf( + "--work-dir", workDir.toString(), + "--assembly-list", assemblyList.toString(), + "--chromosome-list", chromosomeList.toString(), + "--assembly-dir", assemblyDir.toString(), + "--founder-key-dir", founderKeyDir.toString() + ) + ) + } + + val pathsFile = workDir.resolve("output/08_recombined_sequences/recombined_fasta_paths.txt").toFile() + assertTrue(pathsFile.exists(), "recombined_fasta_paths.txt should be written") + val lines = pathsFile.readLines().filter { it.isNotBlank() } + assertEquals(4, lines.size, "One recombined FASTA per founder key should be listed") + assertTrue( + lines.all { it.endsWith(".fa") && File(it).exists() && File(it).length() > 0 }, + "Every listed recombined FASTA must exist on disk and be non-empty; got: $lines" + ) + assertTrue( + lines.all { File(it).parentFile.name == "recombinate_fastas" }, + "Recombined FASTAs should live under recombinate_fastas/; got: $lines" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/PickCrossoversUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/PickCrossoversUnitTest.kt new file mode 100644 index 0000000..4c73150 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/PickCrossoversUnitTest.kt @@ -0,0 +1,218 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import 
kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [PickCrossovers] (step 05 of the recombination pipeline). + * The command shells out to MLImpute's `pick_crossovers.py` via + * `pixi run sh -c "..."` -- we install a [RecordingProcessExecutor] and + * verify the exact command line, the working directory, and the + * `refkey_file_paths.txt` output contract that downstream steps rely on. + */ +class PickCrossoversUnitTest { + + private val smallseqRoot: Path = File("src/test/resources/smallseq") + .absoluteFile.toPath() + + // PickCrossovers builds a `pixi run sh -c <command>` command. In the + // dev container ProcessRunner.skipPixiPrefix=1 silently drops the leading + // `pixi run` token, which would make positional assertions brittle. + // Snapshot, force off for the test body, restore in @AfterEach. + private val originalSkipPixi = ProcessRunner.skipPixiPrefix + + @BeforeEach + fun disablePixiStripping() { + ProcessRunner.skipPixiPrefix = false + } + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + ProcessRunner.skipPixiPrefix = originalSkipPixi + } + + /** + * Create the fake MLImpute layout including the python package and the + * `pick_crossovers.py` script that the command's existence check looks + * for at `<workDir>/src/MLImpute/src/python/cross/pick_crossovers.py`. + */ + private fun stubMlimputePythonScript(workDir: Path): Path { + val scriptDir = workDir.resolve("src/MLImpute/src/python/cross") + scriptDir.createDirectories() + val script = scriptDir.resolve("pick_crossovers.py") + script.writeText("#!/usr/bin/env python\n") + return script + } + + /** + * Write a minimal tab-separated assembly list at [path] referencing the + * given assembly names. The path column doesn't need to exist on disk -- + * pick_crossovers.py is mocked. 
+ */ + private fun writeAssemblyList(path: Path, assemblies: List<String>) { + path.writeText( + assemblies.joinToString("\n") { name -> + "/some/path/$name.fa\t$name" + } + ) + } + + /** + * RecordingProcessExecutor hook that simulates pick_crossovers.py by + * writing one `<name>_refkey.bed` per assembly name listed in + * the assembly-list referenced by the shell command. The output + * directory is derived from the executor invocation's `workingDir`, + * which the command sets to the per-step output dir. + */ + private fun pickCrossoversSimulator( + assemblyListPath: Path, + ): (RecordingProcessExecutor.Invocation) -> Int = { inv -> + val outDir = inv.workingDir + outDir?.mkdirs() + assemblyListPath.toFile().readLines() + .filter { it.isNotBlank() } + .mapNotNull { it.split("\t").getOrNull(1)?.trim() } + .forEach { name -> + File(outDir, "${name}_refkey.bed").writeText("# refkey for $name\n") + } + 0 + } + + @Test + fun pickCrossoversInvokesPixiShellOnceWithExpectedArgs(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB")) + + val executor = RecordingProcessExecutor( + defaultExitCode = 0, + onInvoke = pickCrossoversSimulator(assemblyList) + ) + + ProcessRunner.withExecutor(executor) { + PickCrossovers().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--assembly-list", assemblyList.toString() + ) + ) + } + + assertEquals(1, executor.invocations.size, "pick_crossovers.py should be invoked exactly once") + val inv = executor.invocations.single() + + // Command shape: pixi run sh -c "<command>" + assertEquals("pixi", inv.command[0]) + assertEquals("run", inv.command[1]) + assertEquals("sh", inv.command[2]) + assertEquals("-c", inv.command[3]) + val shellCommand = inv.command[4] + + // PYTHONPATH points to <workDir>/src/MLImpute/src so 'python.*' imports resolve. 
+ val expectedPythonPath = workDir.resolve("src/MLImpute/src").toAbsolutePath().toString() + assertTrue( + shellCommand.contains("PYTHONPATH='$expectedPythonPath'"), + "Shell command should export PYTHONPATH=$expectedPythonPath; got: $shellCommand" + ) + + // The script is invoked with absolute paths so cwd-relativity doesn't bite. + val expectedScript = workDir.resolve("src/MLImpute/src/python/cross/pick_crossovers.py") + .toAbsolutePath().toString() + assertTrue( + shellCommand.contains("python '$expectedScript'"), + "Shell command should invoke '$expectedScript'; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--ref-fasta '${smallseqRoot.resolve("Ref.fa").toAbsolutePath()}'"), + "Shell command should pass --ref-fasta with absolute path; got: $shellCommand" + ) + assertTrue( + shellCommand.contains("--assembly-list '${assemblyList.toAbsolutePath()}'"), + "Shell command should pass --assembly-list with absolute path; got: $shellCommand" + ) + + // The command sets workingDir to the output dir so relative outputs land + // in the right place. 
+ val expectedOutputDir = workDir.resolve("output/05_crossovers_results").toFile().absoluteFile + assertEquals(expectedOutputDir, inv.workingDir?.absoluteFile) + } + + @Test + fun refkeyFilePathsTextFileListsGeneratedBedFiles(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB", "LineC", "LineD")) + + val executor = RecordingProcessExecutor( + defaultExitCode = 0, + onInvoke = pickCrossoversSimulator(assemblyList) + ) + + ProcessRunner.withExecutor(executor) { + PickCrossovers().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--assembly-list", assemblyList.toString() + ) + ) + } + + val refkeyPaths = workDir.resolve("output/05_crossovers_results/refkey_file_paths.txt").toFile() + assertTrue(refkeyPaths.exists(), "refkey_file_paths.txt should be written") + val lines = refkeyPaths.readLines().filter { it.isNotBlank() } + assertEquals(4, lines.size, "One refkey BED path per assembly should be listed") + assertTrue( + lines.all { it.endsWith("_refkey.bed") }, + "Every listed path should end with _refkey.bed; got: $lines" + ) + assertTrue( + lines.all { File(it).exists() && File(it).length() > 0 }, + "Every listed refkey BED must exist on disk and be non-empty (from simulator)" + ) + } + + @Test + fun customOutputDirIsHonored(@TempDir workDir: Path) { + stubMlimputePythonScript(workDir) + val assemblyList = workDir.resolve("assembly_list.txt") + writeAssemblyList(assemblyList, listOf("LineA", "LineB")) + val customOutput = workDir.resolve("custom_crossovers_out") + + val executor = RecordingProcessExecutor( + defaultExitCode = 0, + onInvoke = pickCrossoversSimulator(assemblyList) + ) + + ProcessRunner.withExecutor(executor) { + PickCrossovers().parse( + listOf( + "--work-dir", workDir.toString(), + "--ref-fasta", smallseqRoot.resolve("Ref.fa").toString(), + "--assembly-list", 
assemblyList.toString(), + "--output-dir", customOutput.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals(customOutput.toFile().absoluteFile, inv.workingDir?.absoluteFile) + assertTrue( + customOutput.resolve("refkey_file_paths.txt").toFile().exists(), + "refkey_file_paths.txt should be written under the custom output dir" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/integration/IntegrationGuard.kt b/src/test/kotlin/net/maizegenetics/integration/IntegrationGuard.kt index ac6f39c..372e9ab 100644 --- a/src/test/kotlin/net/maizegenetics/integration/IntegrationGuard.kt +++ b/src/test/kotlin/net/maizegenetics/integration/IntegrationGuard.kt @@ -1,6 +1,7 @@ package net.maizegenetics.integration import org.junit.jupiter.api.Assumptions +import java.io.File /** * Shared precondition helpers for integration-tier tests. @@ -45,4 +46,55 @@ object IntegrationGuard { ?: false Assumptions.assumeTrue(found, "anchorwave binary not found on PATH") } + + /** + * Print the container's actual memory budget (cgroup v2 / v1) and the + * test JVM's heap settings so we can diagnose `exit 137` (kernel OOM + * kills) without guessing. Should be called once per E2E test. + * + * E2E tests run the full pipeline which forks Gradle daemons, + * MLImpute application JVMs, and native tools (AnchorWave/minimap2/ + * pysam). Their combined RSS can comfortably exceed 4 GB. If the + * container memory limit is below that, the OOM killer fires. 
+ */ + fun logContainerMemoryBudget() { + val runtime = Runtime.getRuntime() + val mb = 1024L * 1024L + println(">>> [MEMORY] JVM Xmx (maxMemory): ${runtime.maxMemory() / mb} MB") + println(">>> [MEMORY] JVM available processors: ${runtime.availableProcessors()}") + + // cgroup v2 (modern Docker / Linux >= 4.5) + val cgroupV2 = File("/sys/fs/cgroup/memory.max") + if (cgroupV2.exists()) { + val raw = runCatching { cgroupV2.readText().trim() }.getOrNull() ?: "" + val pretty = raw.toLongOrNull()?.let { "$raw bytes (${it / mb} MB)" } ?: raw + println(">>> [MEMORY] cgroup v2 memory.max: $pretty") + } + + // cgroup v1 fallback + val cgroupV1 = File("/sys/fs/cgroup/memory/memory.limit_in_bytes") + if (cgroupV1.exists()) { + val raw = runCatching { cgroupV1.readText().trim() }.getOrNull() ?: "" + val pretty = raw.toLongOrNull()?.let { "$raw bytes (${it / mb} MB)" } ?: raw + println(">>> [MEMORY] cgroup v1 memory.limit_in_bytes: $pretty") + } + + // /proc/meminfo for the host kernel's view + val meminfo = File("/proc/meminfo") + if (meminfo.exists()) { + val lines = runCatching { + meminfo.readLines().filter { line -> + line.startsWith("MemTotal:") || + line.startsWith("MemAvailable:") || + line.startsWith("MemFree:") + } + }.getOrNull().orEmpty() + lines.forEach { println(">>> [MEMORY] /proc/meminfo $it") } + } + + println( + ">>> [MEMORY] If subsequent steps fail with exit 137, the container memory " + + "budget shown above is too small. End-to-end tests typically need at least 6 GB." 
+ ) + } } diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index 6dc12e8..3103e17 100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -4,7 +4,6 @@ import com.github.ajalt.clikt.core.parse import net.maizegenetics.commands.Orchestrate import org.junit.jupiter.api.Tag import org.junit.jupiter.api.Test -import org.junit.jupiter.api.io.TempDir import java.io.File import java.nio.file.Path import kotlin.io.path.ExperimentalPathApi @@ -15,8 +14,9 @@ import kotlin.io.path.writeText import kotlin.test.assertTrue /** - * End-to-end smoke test: run `orchestrate` against the smallseq test - * pipeline and assert that the expected outputs are produced. + * End-to-end test: run `orchestrate` against the smallseq test fixtures + * through every pipeline step (1-9) and assert that each step's expected + * outputs are produced. * * Only runs inside the seq-sim-dev container (SEQ_SIM_IN_CONTAINER=1). */ @@ -28,10 +28,11 @@ class OrchestrateE2ETest { /** * Create (or reset) a stable, inspectable working directory under - * `build/test-output/` for a long-form E2E test. We deliberately avoid - * [TempDir] here so that intermediate pipeline outputs survive the - * test for post-mortem inspection. The directory is wiped on each run - * to keep the test hermetic, and `./gradlew clean` removes it. + * `build/test-output/` for the E2E test. We deliberately avoid + * [org.junit.jupiter.api.io.TempDir] so that intermediate pipeline + * outputs survive the test for post-mortem inspection. The directory + * is wiped on each run to keep the test hermetic, and `./gradlew clean` + * removes it. 
*/ @OptIn(ExperimentalPathApi::class) private fun persistentWorkDir(testName: String): Path { @@ -43,77 +44,48 @@ class OrchestrateE2ETest { return workDir } - @Test - fun orchestrateSmallseqPipelineProducesMafAndGvcf(@TempDir workDir: Path) { - // align-assemblies now drives PHGv2 internally, so we need both the - // phg binary and AnchorWave on PATH. The orchestrator's setup-environment - // step takes care of populating /src/phg_v2 from SEQ_SIM_PHG_DIR. - IntegrationGuard.requirePhg() - IntegrationGuard.requireAnchorwave() - - workDir.createDirectories() - - // Write a work-dir-local pipeline config pointing at the smallseq resources. - val configPath = workDir.resolve("pipeline.yaml") - configPath.writeText( - """ - work_dir: "${workDir.toString()}" - - run_steps: - - align_assemblies - - maf_to_gvcf - - align_assemblies: - ref_gff: "${smallseqRoot.resolve("anchors.gff")}" - ref_fasta: "${smallseqRoot.resolve("Ref.fa")}" - query_fasta: "${smallseqRoot.resolve("queries")}" - threads: 2 - - maf_to_gvcf: - sample_name: "smallseq" - """.trimIndent() - ) - - Orchestrate().parse(listOf("--config", configPath.toString())) - - val anchorwaveOut = workDir.resolve("output/01_anchorwave_results").toFile() - assertTrue(anchorwaveOut.exists(), "AnchorWave output directory should exist") - val mafPaths = File(anchorwaveOut, "maf_file_paths.txt") - assertTrue(mafPaths.exists() && mafPaths.length() > 0, "maf_file_paths.txt should be non-empty") - - val gvcfPaths = workDir.resolve("output/02_gvcf_results/gvcf_file_paths.txt").toFile() - assertTrue(gvcfPaths.exists(), "gvcf_file_paths.txt should exist") - val gvcfLines = gvcfPaths.readLines().filter { it.isNotBlank() } - assertTrue(gvcfLines.isNotEmpty(), "At least one gVCF should be listed") - assertTrue( - gvcfLines.all { File(it).exists() }, - "Every gVCF referenced in gvcf_file_paths.txt must exist on disk" - ) - } - /** - * Full variant-pipeline (steps 1-4) E2E: align_assemblies -> - * maf_to_gvcf -> downsample_gvcf -> 
convert_to_fasta. Validates that - * every step's expected outputs are produced and chained together - * correctly by the orchestrator. + * Full pipeline (steps 1-9) E2E: align_assemblies -> maf_to_gvcf -> + * downsample_gvcf -> convert_to_fasta -> pick_crossovers -> + * create_chain_files -> convert_coordinates -> + * generate_recombined_sequences -> format_recombined_fastas. * - * Unlike the other E2E test, this one does NOT use [TempDir] -- it - * pins the working directory to a fixed location under `build/` so - * the intermediate outputs persist for post-mortem inspection. The - * location is logged at the start of the test and is wiped on each - * fresh run to keep the test hermetic. + * Validates that every step's expected outputs are produced and that + * the orchestrator chains them together correctly end-to-end. + * + * Uses a persistent working directory under `build/test-output/` (not + * [org.junit.jupiter.api.io.TempDir]) so intermediate pipeline outputs + * survive the test for post-mortem inspection. The location is logged + * at the start of the test and is wiped on each fresh run to keep the + * test hermetic. */ @Test - fun orchestrateRunsVariantPipelineStepsOneThroughFour() { - // Steps 1-4 require: PHG + AnchorWave (step 1), biokotlin-tools (step 2), - // and MLImpute (steps 3-4). The orchestrator's auto-run of + fun orchestrateRunsFullPipelineStepsOneThroughNine() { + // Steps 1-9 require: PHG + AnchorWave (step 1), biokotlin-tools + // (step 2), MLImpute (steps 3-4 and the python scripts that back + // pick_crossovers / convert_coordinates / generate_recombined_sequences), + // and seqkit (step 9). The orchestrator's auto-run of // setup-environment populates biokotlin-tools and MLImpute on first // run; the PHGv2 binary is picked up from SEQ_SIM_PHG_DIR. 
IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() + IntegrationGuard.logContainerMemoryBudget() + + val workDir = persistentWorkDir("orchestrate-steps-1-9") + println(">>> Persisting full-pipeline E2E outputs at: $workDir") - val workDir = persistentWorkDir("orchestrate-steps-1-4") - println(">>> Persisting variant-pipeline E2E outputs at: $workDir") + // pick_crossovers requires an EVEN number of assemblies (they're + // paired for crossover simulation). smallseq ships 3 query FASTAs + // (LineA/LineB/LineC) and each input flows through to exactly one + // downsampled GVCF + one FASTA, so we feed only 2 queries through + // the pipeline to keep the assembly count even end-to-end. + val queryListFile = workDir.resolve("queries.txt") + queryListFile.writeText( + listOf( + smallseqRoot.resolve("queries/LineA.fa"), + smallseqRoot.resolve("queries/LineB.fa"), + ).joinToString("\n") { it.toString() } + "\n" + ) val configPath = workDir.resolve("pipeline.yaml") configPath.writeText( @@ -125,11 +97,16 @@ class OrchestrateE2ETest { - maf_to_gvcf - downsample_gvcf - convert_to_fasta + - pick_crossovers + - create_chain_files + - convert_coordinates + - generate_recombined_sequences + - format_recombined_fastas align_assemblies: ref_gff: "${smallseqRoot.resolve("anchors.gff")}" ref_fasta: "${smallseqRoot.resolve("Ref.fa")}" - query_fasta: "${smallseqRoot.resolve("queries")}" + query_fasta: "${queryListFile.toString()}" threads: 2 maf_to_gvcf: @@ -144,6 +121,19 @@ class OrchestrateE2ETest { convert_to_fasta: missing_records_as: "asRef" missing_genotype_as: "asN" + + pick_crossovers: {} + + create_chain_files: + jobs: 2 + + convert_coordinates: {} + + generate_recombined_sequences: {} + + format_recombined_fastas: + line_width: 60 + threads: 2 """.trimIndent() ) @@ -234,6 +224,123 @@ class OrchestrateE2ETest { "Step 4's temp_uncompressed_gvcf_fasta dir should be cleaned up after the run" ) + // --------------------------------------------------------------- + 
// Step 5: pick_crossovers -> 05_crossovers_results/ + // --------------------------------------------------------------- + val step5Dir = workDir.resolve("output/05_crossovers_results").toFile() + assertTrue(step5Dir.exists() && step5Dir.isDirectory, "Step 5 output directory must exist") + val refkeyPaths = File(step5Dir, "refkey_file_paths.txt") + assertTrue(refkeyPaths.exists(), "refkey_file_paths.txt must exist") + val refkeyFiles = refkeyPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(refkeyFiles.isNotEmpty(), "At least one refkey BED should be produced") + assertTrue( + refkeyFiles.all { it.exists() && it.length() > 0 }, + "Every refkey BED referenced in refkey_file_paths.txt must exist and be non-empty" + ) + assertTrue( + refkeyFiles.all { it.name.endsWith("_refkey.bed") }, + "Every refkey path must end with _refkey.bed" + ) + + // --------------------------------------------------------------- + // Step 6: create_chain_files -> 06_chain_results/ + // --------------------------------------------------------------- + val step6Dir = workDir.resolve("output/06_chain_results").toFile() + assertTrue(step6Dir.exists() && step6Dir.isDirectory, "Step 6 output directory must exist") + val chainPaths = File(step6Dir, "chain_file_paths.txt") + assertTrue(chainPaths.exists(), "chain_file_paths.txt must exist") + val chainFiles = chainPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(chainFiles.isNotEmpty(), "At least one chain file should be produced") + assertTrue( + chainFiles.all { it.exists() && it.length() > 0 }, + "Every chain file referenced in chain_file_paths.txt must exist and be non-empty" + ) + assertTrue( + chainFiles.all { it.name.endsWith("_subsampled.chain") }, + "Every chain file should be renamed with _subsampled suffix" + ) + + // Step 6 should clean up its temporary MAF staging directory. 
+ val tempMafDir = workDir.resolve("output/06_chain_results/temp_maf_files").toFile() + assertTrue( + !tempMafDir.exists(), + "Step 6's temp_maf_files dir should be cleaned up after the run" + ) + + // --------------------------------------------------------------- + // Step 7: convert_coordinates -> 07_coordinates_results/ + // --------------------------------------------------------------- + val step7Dir = workDir.resolve("output/07_coordinates_results").toFile() + assertTrue(step7Dir.exists() && step7Dir.isDirectory, "Step 7 output directory must exist") + val keyPaths = File(step7Dir, "key_file_paths.txt") + assertTrue(keyPaths.exists(), "key_file_paths.txt must exist") + val keyFiles = keyPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(keyFiles.isNotEmpty(), "At least one assembly key BED should be listed") + assertTrue( + keyFiles.all { it.exists() && it.length() > 0 }, + "Every assembly key referenced in key_file_paths.txt must exist and be non-empty" + ) + + val founderKeyPaths = File(step7Dir, "founder_key_file_paths.txt") + assertTrue(founderKeyPaths.exists(), "founder_key_file_paths.txt must exist") + val founderKeyFiles = founderKeyPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(founderKeyFiles.isNotEmpty(), "At least one founder key BED should be listed") + assertTrue( + founderKeyFiles.all { it.name.matches(Regex("^\\d+_key\\.bed$")) }, + "Founder keys must match the N_key.bed pattern" + ) + + // --------------------------------------------------------------- + // Step 8: generate_recombined_sequences -> 08_recombined_sequences/ + // --------------------------------------------------------------- + val step8Dir = workDir.resolve("output/08_recombined_sequences").toFile() + assertTrue(step8Dir.exists() && step8Dir.isDirectory, "Step 8 output directory must exist") + val recombinedFastasDir = File(step8Dir, "recombinate_fastas") + assertTrue( + recombinedFastasDir.exists() && 
recombinedFastasDir.isDirectory, + "Step 8's recombinate_fastas/ subdir must exist" + ) + val recombinedPaths = File(step8Dir, "recombined_fasta_paths.txt") + assertTrue(recombinedPaths.exists(), "recombined_fasta_paths.txt must exist") + val recombinedFiles = recombinedPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(recombinedFiles.isNotEmpty(), "At least one recombined FASTA should be listed") + assertTrue( + recombinedFiles.all { it.exists() && it.length() > 0 }, + "Every recombined FASTA must exist on disk and be non-empty" + ) + assertTrue( + recombinedFiles.all { it.parentFile.name == "recombinate_fastas" }, + "Recombined FASTAs must live under recombinate_fastas/" + ) + + // --------------------------------------------------------------- + // Step 9: format_recombined_fastas -> 09_formatted_fastas/ + // --------------------------------------------------------------- + val step9Dir = workDir.resolve("output/09_formatted_fastas").toFile() + assertTrue(step9Dir.exists() && step9Dir.isDirectory, "Step 9 output directory must exist") + val formattedPaths = File(step9Dir, "formatted_fasta_paths.txt") + assertTrue(formattedPaths.exists(), "formatted_fasta_paths.txt must exist") + val formattedFiles = formattedPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(formattedFiles.isNotEmpty(), "At least one formatted FASTA should be listed") + assertTrue( + formattedFiles.all { it.exists() && it.length() > 0 }, + "Every formatted FASTA must exist on disk and be non-empty" + ) + // seqkit should have honored --line-width 60; sample one file and check. + val sample = formattedFiles.first() + val sequenceLineLengths = sample.readLines() + .filterNot { it.startsWith(">") || it.isBlank() } + .map { it.length } + if (sequenceLineLengths.isNotEmpty()) { + // All but the final line of each contig must be exactly 60 chars. 
+ val nonFinalLines = sequenceLineLengths.dropLast(1) + assertTrue( + nonFinalLines.all { it == 60 } || nonFinalLines.isEmpty(), + "All non-trailing sequence lines should be 60 chars wide in $sample; " + + "saw widths=${sequenceLineLengths.distinct().sorted()}" + ) + } + // --------------------------------------------------------------- // Log file contract: each pipeline step writes its own log file. // --------------------------------------------------------------- @@ -245,7 +352,12 @@ class OrchestrateE2ETest { "01_align_assemblies.log", "02_maf_to_gvcf.log", "03_downsample_gvcf.log", - "04_convert_to_fasta.log" + "04_convert_to_fasta.log", + "05_pick_crossovers.log", + "06_create_chain_files.log", + "07_convert_coordinates.log", + "08_generate_recombined_sequences.log", + "09_format_recombined_fastas.log" ).forEach { expected -> assertTrue( expected in logNames, @@ -253,6 +365,6 @@ class OrchestrateE2ETest { ) } - println(">>> Variant-pipeline E2E outputs preserved at: $workDir") + println(">>> Full-pipeline E2E outputs preserved at: $workDir") } } From 8569ff8ed4f2d7c02e4f3eb98714db2cb54d507e Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 15 May 2026 16:37:34 -0500 Subject: [PATCH 14/18] Fix chrom ID issues with smallseq data --- src/test/resources/smallseq/Ref.fa | 4 +- src/test/resources/smallseq/Ref.fa.fai | 2 + src/test/resources/smallseq/anchors.gff | 120 +++++++++---------- src/test/resources/smallseq/queries/LineA.fa | 4 +- src/test/resources/smallseq/queries/LineB.fa | 4 +- src/test/resources/smallseq/queries/LineC.fa | 4 +- 6 files changed, 70 insertions(+), 68 deletions(-) create mode 100644 src/test/resources/smallseq/Ref.fa.fai diff --git a/src/test/resources/smallseq/Ref.fa b/src/test/resources/smallseq/Ref.fa index 0979bf7..e8ef7e8 100644 --- a/src/test/resources/smallseq/Ref.fa +++ b/src/test/resources/smallseq/Ref.fa @@ -1,4 +1,4 @@ ->1 sampleName=Ref +>chr1 sampleName=Ref 
GCGCGGGGACCGAGAATCCCGGCGGGGCAGGACGAACCGGGCGAGAACGGACAACCCCCCGCTACGACGGCACCAACCACCCCACGAACGCCGGGGACCCACACGCGAAGGGAGCCAAGA GATCGAGCGACGACGACACCGCCAAAACGGCACCCCCTCTACGAACCAGGCCGACACGGACCGCACCCAAACGCCGACGACCAGCGGACGAAGAACCGCAACACCGCAGACGGAAAACCA CGACGCCAAGAAAAACAGGGCATAGAGCCGAGGCAGGAACCGAAGGAAACACCAAAAAGGACGGGAGGGCCCCCGCAGGGCGGGCGCACCCGAGTCACACCCAACAGAAACCCACCGGAC @@ -458,7 +458,7 @@ GGAGAACCCACACTCAAAGGACAAAAGCGAAAAGAGCGGGCCAAGACCGAAGACACCACAGGAGCCCGCAACGCAGCAAG CAGGCACAGAAGGAGGGCACACAGCCACAAAGGGGAAAAGGCCGGCAAAAAGCGGACCAGGCGGGAAAAGGATAGGACGACCCAGTGGACCGGCAGGCCAAAGGGCCGAAACGCAGGCAG CCGCGCGCCAGCCAAGCCAACCGGACGCACGGGCCAAGGCACACAGAGGAAACCGGAGCTGAACCGCAGACCCAGAGCACCAGGCGACGCAGAGCAACGCCCACCAGCGCACGCGAACGG ACGAAGGGACGCGCGAACGGCGGACAAATGAGAAGGCCGG ->2 sampleName=Ref +>chr2 sampleName=Ref CCGGAACGGAGGGACCACGCGAGGGGGAGCGCCGCACCAACGACGGCGCGGACCGGCCCCCAGACACGGGAGGCAAGAGGAGGCAAGGCCGCGGAAGTAAAGAGGAACAAACGGCGCAGC CGCCGGCGAGAAGGAGAGAACCGAGCAGCCCACGCGCCGGCGCGACACTCCGACAGGACGGACAAGCCGACGCCGTGCGACGCCCGACCAGGGACACGAGGAAACACCGGAGCGCGAGGA GCAAGAAACCGACGGACAGGACCGGAATAACGACCGCAAGACAGGGAGAACCAAAGCCCCAGACCCACCGAGACACCAGCAAGCGAGGGGGACAAAGGCGACACAGCGCCGCCGCAGGGC diff --git a/src/test/resources/smallseq/Ref.fa.fai b/src/test/resources/smallseq/Ref.fa.fai new file mode 100644 index 0000000..8bd27d1 --- /dev/null +++ b/src/test/resources/smallseq/Ref.fa.fai @@ -0,0 +1,2 @@ +chr1 55000 21 120 121 +chr2 55000 55501 120 121 diff --git a/src/test/resources/smallseq/anchors.gff b/src/test/resources/smallseq/anchors.gff index e04aac3..f47b7df 100644 --- a/src/test/resources/smallseq/anchors.gff +++ b/src/test/resources/smallseq/anchors.gff @@ -1,60 +1,60 @@ -1 . gene 1 1000 . + . ID=gene:gene0 -1 . mRNA 1 1000 . + . ID=transcript:gene0_T001;Parent=gene:gene0 -1 . CDS 1 800 . + 0 ID=CDS:gene0_P001;Parent=transcript:gene0_T001 -1 . gene 5501 6500 . + . ID=gene:gene1 -1 . mRNA 5501 6500 . + . ID=transcript:gene1_T001;Parent=gene:gene1 -1 . CDS 5501 6300 . 
+ 0 ID=CDS:gene1_P001;Parent=transcript:gene1_T001 -1 . gene 11001 12000 . + . ID=gene:gene2 -1 . mRNA 11001 12000 . + . ID=transcript:gene2_T001;Parent=gene:gene2 -1 . CDS 11001 11800 . + 0 ID=CDS:gene2_P001;Parent=transcript:gene2_T001 -1 . gene 16501 17500 . + . ID=gene:gene3 -1 . mRNA 16501 17500 . + . ID=transcript:gene3_T001;Parent=gene:gene3 -1 . CDS 16501 17300 . + 0 ID=CDS:gene3_P001;Parent=transcript:gene3_T001 -1 . gene 22001 23000 . + . ID=gene:gene4 -1 . mRNA 22001 23000 . + . ID=transcript:gene4_T001;Parent=gene:gene4 -1 . CDS 22001 22800 . + 0 ID=CDS:gene4_P001;Parent=transcript:gene4_T001 -1 . gene 27501 28500 . + . ID=gene:gene5 -1 . mRNA 27501 28500 . + . ID=transcript:gene5_T001;Parent=gene:gene5 -1 . CDS 27501 28300 . + 0 ID=CDS:gene5_P001;Parent=transcript:gene5_T001 -1 . gene 33001 34000 . + . ID=gene:gene6 -1 . mRNA 33001 34000 . + . ID=transcript:gene6_T001;Parent=gene:gene6 -1 . CDS 33001 33800 . + 0 ID=CDS:gene6_P001;Parent=transcript:gene6_T001 -1 . gene 38501 39500 . + . ID=gene:gene7 -1 . mRNA 38501 39500 . + . ID=transcript:gene7_T001;Parent=gene:gene7 -1 . CDS 38501 39300 . + 0 ID=CDS:gene7_P001;Parent=transcript:gene7_T001 -1 . gene 44001 45000 . + . ID=gene:gene8 -1 . mRNA 44001 45000 . + . ID=transcript:gene8_T001;Parent=gene:gene8 -1 . CDS 44001 44800 . + 0 ID=CDS:gene8_P001;Parent=transcript:gene8_T001 -1 . gene 49501 50500 . + . ID=gene:gene9 -1 . mRNA 49501 50500 . + . ID=transcript:gene9_T001;Parent=gene:gene9 -1 . CDS 49501 50300 . + 0 ID=CDS:gene9_P001;Parent=transcript:gene9_T001 -2 . gene 1 1000 . + . ID=gene:gene10 -2 . mRNA 1 1000 . + . ID=transcript:gene10_T001;Parent=gene:gene10 -2 . CDS 1 800 . + 0 ID=CDS:gene10_P001;Parent=transcript:gene10_T001 -2 . gene 5501 6500 . + . ID=gene:gene11 -2 . mRNA 5501 6500 . + . ID=transcript:gene11_T001;Parent=gene:gene11 -2 . CDS 5501 6300 . + 0 ID=CDS:gene11_P001;Parent=transcript:gene11_T001 -2 . gene 11001 12000 . + . ID=gene:gene12 -2 . mRNA 11001 12000 . + . 
ID=transcript:gene12_T001;Parent=gene:gene12 -2 . CDS 11001 11800 . + 0 ID=CDS:gene12_P001;Parent=transcript:gene12_T001 -2 . gene 16501 17500 . + . ID=gene:gene13 -2 . mRNA 16501 17500 . + . ID=transcript:gene13_T001;Parent=gene:gene13 -2 . CDS 16501 17300 . + 0 ID=CDS:gene13_P001;Parent=transcript:gene13_T001 -2 . gene 22001 23000 . + . ID=gene:gene14 -2 . mRNA 22001 23000 . + . ID=transcript:gene14_T001;Parent=gene:gene14 -2 . CDS 22001 22800 . + 0 ID=CDS:gene14_P001;Parent=transcript:gene14_T001 -2 . gene 27501 28500 . + . ID=gene:gene15 -2 . mRNA 27501 28500 . + . ID=transcript:gene15_T001;Parent=gene:gene15 -2 . CDS 27501 28300 . + 0 ID=CDS:gene15_P001;Parent=transcript:gene15_T001 -2 . gene 33001 34000 . + . ID=gene:gene16 -2 . mRNA 33001 34000 . + . ID=transcript:gene16_T001;Parent=gene:gene16 -2 . CDS 33001 33800 . + 0 ID=CDS:gene16_P001;Parent=transcript:gene16_T001 -2 . gene 38501 39500 . + . ID=gene:gene17 -2 . mRNA 38501 39500 . + . ID=transcript:gene17_T001;Parent=gene:gene17 -2 . CDS 38501 39300 . + 0 ID=CDS:gene17_P001;Parent=transcript:gene17_T001 -2 . gene 44001 45000 . + . ID=gene:gene18 -2 . mRNA 44001 45000 . + . ID=transcript:gene18_T001;Parent=gene:gene18 -2 . CDS 44001 44800 . + 0 ID=CDS:gene18_P001;Parent=transcript:gene18_T001 -2 . gene 49501 50500 . + . ID=gene:gene19 -2 . mRNA 49501 50500 . + . ID=transcript:gene19_T001;Parent=gene:gene19 -2 . CDS 49501 50300 . + 0 ID=CDS:gene19_P001;Parent=transcript:gene19_T001 +chr1 . gene 1 1000 . + . ID=gene:gene0 +chr1 . mRNA 1 1000 . + . ID=transcript:gene0_T001;Parent=gene:gene0 +chr1 . CDS 1 800 . + 0 ID=CDS:gene0_P001;Parent=transcript:gene0_T001 +chr1 . gene 5501 6500 . + . ID=gene:gene1 +chr1 . mRNA 5501 6500 . + . ID=transcript:gene1_T001;Parent=gene:gene1 +chr1 . CDS 5501 6300 . + 0 ID=CDS:gene1_P001;Parent=transcript:gene1_T001 +chr1 . gene 11001 12000 . + . ID=gene:gene2 +chr1 . mRNA 11001 12000 . + . ID=transcript:gene2_T001;Parent=gene:gene2 +chr1 . CDS 11001 11800 . 
+ 0 ID=CDS:gene2_P001;Parent=transcript:gene2_T001 +chr1 . gene 16501 17500 . + . ID=gene:gene3 +chr1 . mRNA 16501 17500 . + . ID=transcript:gene3_T001;Parent=gene:gene3 +chr1 . CDS 16501 17300 . + 0 ID=CDS:gene3_P001;Parent=transcript:gene3_T001 +chr1 . gene 22001 23000 . + . ID=gene:gene4 +chr1 . mRNA 22001 23000 . + . ID=transcript:gene4_T001;Parent=gene:gene4 +chr1 . CDS 22001 22800 . + 0 ID=CDS:gene4_P001;Parent=transcript:gene4_T001 +chr1 . gene 27501 28500 . + . ID=gene:gene5 +chr1 . mRNA 27501 28500 . + . ID=transcript:gene5_T001;Parent=gene:gene5 +chr1 . CDS 27501 28300 . + 0 ID=CDS:gene5_P001;Parent=transcript:gene5_T001 +chr1 . gene 33001 34000 . + . ID=gene:gene6 +chr1 . mRNA 33001 34000 . + . ID=transcript:gene6_T001;Parent=gene:gene6 +chr1 . CDS 33001 33800 . + 0 ID=CDS:gene6_P001;Parent=transcript:gene6_T001 +chr1 . gene 38501 39500 . + . ID=gene:gene7 +chr1 . mRNA 38501 39500 . + . ID=transcript:gene7_T001;Parent=gene:gene7 +chr1 . CDS 38501 39300 . + 0 ID=CDS:gene7_P001;Parent=transcript:gene7_T001 +chr1 . gene 44001 45000 . + . ID=gene:gene8 +chr1 . mRNA 44001 45000 . + . ID=transcript:gene8_T001;Parent=gene:gene8 +chr1 . CDS 44001 44800 . + 0 ID=CDS:gene8_P001;Parent=transcript:gene8_T001 +chr1 . gene 49501 50500 . + . ID=gene:gene9 +chr1 . mRNA 49501 50500 . + . ID=transcript:gene9_T001;Parent=gene:gene9 +chr1 . CDS 49501 50300 . + 0 ID=CDS:gene9_P001;Parent=transcript:gene9_T001 +chr2 . gene 1 1000 . + . ID=gene:gene10 +chr2 . mRNA 1 1000 . + . ID=transcript:gene10_T001;Parent=gene:gene10 +chr2 . CDS 1 800 . + 0 ID=CDS:gene10_P001;Parent=transcript:gene10_T001 +chr2 . gene 5501 6500 . + . ID=gene:gene11 +chr2 . mRNA 5501 6500 . + . ID=transcript:gene11_T001;Parent=gene:gene11 +chr2 . CDS 5501 6300 . + 0 ID=CDS:gene11_P001;Parent=transcript:gene11_T001 +chr2 . gene 11001 12000 . + . ID=gene:gene12 +chr2 . mRNA 11001 12000 . + . ID=transcript:gene12_T001;Parent=gene:gene12 +chr2 . CDS 11001 11800 . 
+ 0 ID=CDS:gene12_P001;Parent=transcript:gene12_T001 +chr2 . gene 16501 17500 . + . ID=gene:gene13 +chr2 . mRNA 16501 17500 . + . ID=transcript:gene13_T001;Parent=gene:gene13 +chr2 . CDS 16501 17300 . + 0 ID=CDS:gene13_P001;Parent=transcript:gene13_T001 +chr2 . gene 22001 23000 . + . ID=gene:gene14 +chr2 . mRNA 22001 23000 . + . ID=transcript:gene14_T001;Parent=gene:gene14 +chr2 . CDS 22001 22800 . + 0 ID=CDS:gene14_P001;Parent=transcript:gene14_T001 +chr2 . gene 27501 28500 . + . ID=gene:gene15 +chr2 . mRNA 27501 28500 . + . ID=transcript:gene15_T001;Parent=gene:gene15 +chr2 . CDS 27501 28300 . + 0 ID=CDS:gene15_P001;Parent=transcript:gene15_T001 +chr2 . gene 33001 34000 . + . ID=gene:gene16 +chr2 . mRNA 33001 34000 . + . ID=transcript:gene16_T001;Parent=gene:gene16 +chr2 . CDS 33001 33800 . + 0 ID=CDS:gene16_P001;Parent=transcript:gene16_T001 +chr2 . gene 38501 39500 . + . ID=gene:gene17 +chr2 . mRNA 38501 39500 . + . ID=transcript:gene17_T001;Parent=gene:gene17 +chr2 . CDS 38501 39300 . + 0 ID=CDS:gene17_P001;Parent=transcript:gene17_T001 +chr2 . gene 44001 45000 . + . ID=gene:gene18 +chr2 . mRNA 44001 45000 . + . ID=transcript:gene18_T001;Parent=gene:gene18 +chr2 . CDS 44001 44800 . + 0 ID=CDS:gene18_P001;Parent=transcript:gene18_T001 +chr2 . gene 49501 50500 . + . ID=gene:gene19 +chr2 . mRNA 49501 50500 . + . ID=transcript:gene19_T001;Parent=gene:gene19 +chr2 . CDS 49501 50300 . 
+ 0 ID=CDS:gene19_P001;Parent=transcript:gene19_T001 diff --git a/src/test/resources/smallseq/queries/LineA.fa b/src/test/resources/smallseq/queries/LineA.fa index 2d2601a..bd872ff 100644 --- a/src/test/resources/smallseq/queries/LineA.fa +++ b/src/test/resources/smallseq/queries/LineA.fa @@ -1,4 +1,4 @@ ->1 sampleName=LineA +>chr1 sampleName=LineA GCGCGGGGACCGAGAAACCCGGCGGGGCAGGACGAACCGGGCGAGAACGGACAACCCCCCGCAACGACGGCACCAACCACCCCACGAACGCCGGGGACCCACTCGCGAAGGGAGCCAAGA GAACGAGCGACGACGACACCGCCTAAACGGCACCCCCGCGACGAACCAGGCCGACACGGACCGCACCCAAACGCCGACGACCAGCGGACGAAGAACCGCAACTCCGCAGATGGAAAACCA CGACGCCAAGAAAAACAGGGCAGAGAGCCGAGGCAGGAACCGAAGGAAACACCAAAAAGGACGGGAGGGCCCCCGCAGGGCGGGCGCACCCGAGACACACCCAACAGAAACCCACCGGAT @@ -458,7 +458,7 @@ GGAGAACCCACACGCAAAGGACATAAGCGAAAAGAGCGGGCCAAGACCGAAGACACCACAGGAGCCCGCAACGCAGTAAG CAGGCACAGAAGGAGGGCACACAGCCACAAAGGGGAAAAGGCCGGCAAAAAGCGGACCAGGTGGGAAAAGGACAGGACGACCCAGCGGACCGGCTGGCCAAAGGGCCGAAACGCAGGCAG CCGCGCGCCAGCCAAGCCAACCTGACGCACGGGCCAAGGCACACAGAGGAAACCGGAGCGGAACCGCAGACCCAGAGCACCATGCGACGCAGAGCAACGCCCACCAGCGCACGCGAACGG ACGAAGGGACGCGCGAACGGCGGACAAAGGAGAAGGCCGG ->2 sampleName=LineA +>chr2 sampleName=LineA CCGGAACGGAGGGACCACGCGAGGGGGAGCGCCGCACCAACGACGGCGCGGACCGGCCCCCAGACACGGGAGGCAAGAGGAGGCAATGCCGCGGAAGAAAAGAGGAACAAACGGCGCAGC CGCCGGCGAGAAGGAGAGAACCGAGCAGCCCACGCGCCGGCGCGACACGCCGACAGGACGGACAAGCCGACGCCGAGCGACGCCCGACCAGGGACACGAGGAAACACCGGAGCGCGAGGA GCAAGAAACCGACGGACAGGACCGGAAGAACGACCGCAAGACAGGGAGAACCAAATCCCCAGACCCACCGAGACACCAGCAAGCGAGGGGGACAAAGGCGTCACAGCGCCGCCGCAGGGC diff --git a/src/test/resources/smallseq/queries/LineB.fa b/src/test/resources/smallseq/queries/LineB.fa index c1bcbe6..44c87b1 100644 --- a/src/test/resources/smallseq/queries/LineB.fa +++ b/src/test/resources/smallseq/queries/LineB.fa @@ -1,4 +1,4 @@ ->1 sampleName=LineB +>chr1 sampleName=LineB GCGCGGGGACCGTGAAACCCGGCGGGGCAGGACGAACCGGGCGAGAACGGACAACCCCCCGCAACTACGGCACCAACCACCCCACGAACGCCGGGGACCCACACGCGAAGGGAGCCAAGA 
GAACGAGCGACGACGACACTGCCAAAACGGCACCCCCGCGACGAACTAGGCCGACACGGACCGCACCCAAACGCCGACGACCAGCGGACGAAGAACCGCAACACCGCATACGGAAAACCA CGACGCCAAGAAAAACAGGGCAGAGTGCCGAGGCAGGTACCGAAGGAAACACCAAAAAGGACGGGAGGGCCCCCGCAGGGCGGGCGCACCCGAGACACACCCAACAGAAACCCACCGGAC @@ -458,7 +458,7 @@ GGAGAACCCACACGCAAAGGACAAAAGCGAAAAGAGCGGGCCAAGACCGAAGACACCACAGGAGCCCGCAATGCAGCTAG CAGGCACAGAAGGAGGGCATACAGCCACAAAGGGGAAAAGGCCGGCAAAAAGCGGACCAGGCGGGAAAAGGACAGGACGACCCAGTGGACCGGCAGTCCAAAGGGCCGAAACGCAGGCAG CCGCGCGTCAGCCAAGCCAACCGGACGCACGGGCCAAGGCACACAGAGGAATCCGGAGCGGAACCGCAGACCCAGAGCATCAGGCGACGCAGAGCAACGCCCACCAGCGCACGCGAACGG ACGAAGGGACGCGCGAACGGCGGACAAAGGAGAAGGCCGG ->2 sampleName=LineB +>chr2 sampleName=LineB CCGGAACGGAGGGACCACGCGAGGGGGAGCGCCGTACCAACGACGGCGTGGACCGGCCCCCAGACACGGGAGGCAAGAGGAGGCAAGGCCGCGGAAGAAAAGAGGAACAAACGGCGCAGC CTCCGGCGAGAAGGAGAGAACCGAGCAGCCCACGCGCCGGCGCGACACGCCGATAGGACGGACAAGCCGACGCCGAGCGACGCCCGACCAGGGACACGAGGAAACACCGGAGCGCGAGGA GCAAGAAACCGACGGACAGGACCGGAAGAACGACCGCAAGACAGGGAGAACCAAAGCCCCAGACCCACCGAGACACCAGCAAGCGAGGGGGACAAAGGCGACACAGCGCCGCCGTAGGGC diff --git a/src/test/resources/smallseq/queries/LineC.fa b/src/test/resources/smallseq/queries/LineC.fa index 328a947..2265ce2 100644 --- a/src/test/resources/smallseq/queries/LineC.fa +++ b/src/test/resources/smallseq/queries/LineC.fa @@ -1,4 +1,4 @@ ->1 sampleName=LineC +>chr1 sampleName=LineC GCGCGGGGACCGAGAAACCCGGCGGGGCAGGACGAACCGGGCGAGAACGGACAACCCCCCGCAACGACGGCACCAACCACCCCACGAACGCCGGGGACCCACTCGCGAAGGGAGCCAAGA GAACGAGCGACGACGACACCGCCTAAACGGCACCCCCGCGACGAACCAGGCCGACACGGACCGCACCCAAACGCCGACGACCAGCGGACGAAGAACCGCAACTCCGCAGATGGAAAACCA CGACGCCAAGAAAAACAGGGCAGAGAGCCGAGGCAGGAACCGAAGGAAACACCAAAAAGGACGGGAGGGCCCCCGCAGGGCGGGCGCACCCGAGACACACCCAACAGAAACCCACCGGAT @@ -312,7 +312,7 @@ CAAGGAAAACACAGCCAACGGAGAACCCACACGCAAAGGACATAAGCGAAAAGAGCGGGCCAAGACCGAAGACACCACAG ACACACGGACCGCAAAAACCAGGCACAGAAGGAGGGCACACAGCCACAAAGGGGAAAAGGCCGGCAAAAAGCGGACCAGGTGGGAAAAGGACAGGACGACCCAGCGGACCGGCTGGCCAA 
AGGGCCGAAACGCAGGCAGCCGCGCGCCAGCCAAGCCAACCTGACGCACGGGCCAAGGCACACAGAGGAAACCGGAGCGGAACCGCAGACCCAGAGCACCATGCGACGCAGAGCAACGCC CACCAGCGCACGCGAACGGACGAAGGGACGCGCGAACGGCGGACAAAGGAGAAGGCCGG ->2 sampleName=LineC +>chr2 sampleName=LineC CCGGAACGGAGGGACCACGCGAGGGGGAGCGCCGCACCAACGACGGCGCGGACCGGCCCCCAGACACGGGAGGCAAGAGGAGGCAATGCCGCGGAAGAAAAGAGGAACAAACGGCGCAGC CGCCGGCGAGAAGGAGAGAACCGAGCAGCCCACGCGCCGGCGCGACACGCCGACAGGACGGACAAGCCGACGCCGAGCGACGCCCGACCAGGGACACGAGGAAACACCGGAGCGCGAGGA GCAAGAAACCGACGGACAGGACCGGAAGAACGACCGCAAGACAGGGAGAACCAAATCCCCAGACCCACCGAGACACCAGCAAGCGAGGGGGACAAAGGCGTCACAGCGCCGCCGCAGGGC From 7e6bceaaadc2b8dabf0e7a4a95da9162b79aa527 Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 19 May 2026 17:05:55 -0500 Subject: [PATCH 15/18] Fix trailing sequence tests --- .../integration/OrchestrateE2ETest.kt | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index 3103e17..f74fb88 100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -327,19 +327,39 @@ class OrchestrateE2ETest { "Every formatted FASTA must exist on disk and be non-empty" ) // seqkit should have honored --line-width 60; sample one file and check. + // Multi-contig FASTAs have one trailing (possibly short) line per contig, + // so we must split interior vs trailing lines by walking the file rather + // than flattening all sequence lines and dropping only the last one. val sample = formattedFiles.first() - val sequenceLineLengths = sample.readLines() - .filterNot { it.startsWith(">") || it.isBlank() } - .map { it.length } - if (sequenceLineLengths.isNotEmpty()) { - // All but the final line of each contig must be exactly 60 chars. 
- val nonFinalLines = sequenceLineLengths.dropLast(1) + val allLines = sample.readLines() + val interiorLineLengths = mutableListOf() + val trailingLineLengths = mutableListOf() + for (i in allLines.indices) { + val line = allLines[i] + if (line.startsWith(">") || line.isBlank()) continue + val next = allLines.getOrNull(i + 1) + val isTrailingForContig = next == null || + next.startsWith(">") || + next.isBlank() + if (isTrailingForContig) { + trailingLineLengths.add(line.length) + } else { + interiorLineLengths.add(line.length) + } + } + if (interiorLineLengths.isNotEmpty()) { assertTrue( - nonFinalLines.all { it == 60 } || nonFinalLines.isEmpty(), - "All non-trailing sequence lines should be 60 chars wide in $sample; " + - "saw widths=${sequenceLineLengths.distinct().sorted()}" + interiorLineLengths.all { it == 60 }, + "All interior sequence lines should be 60 chars wide in $sample; " + + "saw interior widths=${interiorLineLengths.distinct().sorted()}, " + + "trailing widths=${trailingLineLengths.distinct().sorted()}" ) } + assertTrue( + trailingLineLengths.all { it in 1..60 }, + "Trailing sequence lines must be 1..60 chars wide in $sample; " + + "saw trailing widths=${trailingLineLengths.distinct().sorted()}" + ) // --------------------------------------------------------------- // Log file contract: each pipeline step writes its own log file. 
From 73d3d8387d439e763f78f8f60272b141c13f514b Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Tue, 19 May 2026 18:22:33 -0500 Subject: [PATCH 16/18] Add missing steps --- .../commands/BuildSplineKnots.kt | 6 +- .../commands/ConvertRopebwt2Ps4g.kt | 22 +- .../net/maizegenetics/commands/Orchestrate.kt | 374 +++++++++++++++++- .../maizegenetics/commands/RopeBwtChrIndex.kt | 2 +- .../net/maizegenetics/commands/RopeBwtMem.kt | 42 +- .../net/maizegenetics/utils/FileUtils.kt | 2 +- 6 files changed, 405 insertions(+), 43 deletions(-) diff --git a/src/main/kotlin/net/maizegenetics/commands/BuildSplineKnots.kt b/src/main/kotlin/net/maizegenetics/commands/BuildSplineKnots.kt index 9bcafc7..db1d8b9 100644 --- a/src/main/kotlin/net/maizegenetics/commands/BuildSplineKnots.kt +++ b/src/main/kotlin/net/maizegenetics/commands/BuildSplineKnots.kt @@ -19,9 +19,9 @@ import kotlin.io.path.* class BuildSplineKnots : CliktCommand(name = "build-spline-knots") { companion object { - private const val LOG_FILE_NAME = "13_build_spline_knots.log" + private const val LOG_FILE_NAME = "14_build_spline_knots.log" private const val OUTPUT_DIR = "output" - private const val SPLINE_KNOTS_RESULTS_DIR = "13_spline_knots_results" + private const val SPLINE_KNOTS_RESULTS_DIR = "14_spline_knots_results" private const val DEFAULT_VCF_TYPE = "hvcf" private const val DEFAULT_MIN_INDEL_LENGTH = 10 private const val DEFAULT_NUM_BPS_PER_KNOT = 50000 @@ -49,7 +49,7 @@ class BuildSplineKnots : CliktCommand(name = "build-spline-knots") { private val outputDirOption by option( "--output-dir", "-o", - help = "Output directory to write the spline knots to (default: work_dir/output/13_spline_knots_results)" + help = "Output directory to write the spline knots to (default: work_dir/output/14_spline_knots_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private val minIndelLength by option( diff --git a/src/main/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4g.kt 
b/src/main/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4g.kt index 3c11499..2c221ad 100644 --- a/src/main/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4g.kt +++ b/src/main/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4g.kt @@ -18,9 +18,11 @@ import kotlin.system.exitProcess class ConvertRopebwt2Ps4g : CliktCommand(name = "convert-ropebwt2ps4g") { companion object { - private const val LOG_FILE_NAME = "14_convert_ropebwt2ps4g.log" + private const val LOG_FILE_NAME = "15_convert_ropebwt2ps4g.log" private const val OUTPUT_DIR = "output" - private const val CONVERT_RESULTS_DIR = "14_convert_ropebwt2ps4g_results" + private const val CONVERT_RESULTS_DIR = "15_convert_ropebwt2ps4g_results" + private const val UPSTREAM_BED_DIR = "13_ropebwt_mem_results" + private const val UPSTREAM_SPLINE_DIR = "14_spline_knots_results" private const val PS4G_FILE_PATHS_FILE = "ps4g_file_paths.txt" private const val DEFAULT_MIN_MEM_LENGTH = 135 private const val DEFAULT_MAX_NUM_HITS = 16 @@ -42,12 +44,12 @@ class ConvertRopebwt2Ps4g : CliktCommand(name = "convert-ropebwt2ps4g") { private val outputDirOption by option( "--output-dir", "-o", - help = "Output directory for PS4G files (default: work_dir/output/14_convert_ropebwt2ps4g_results)" + help = "Output directory for PS4G files (default: work_dir/output/15_convert_ropebwt2ps4g_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private val splineKnotDirOption by option( "--spline-knot-dir", "-s", - help = "Directory containing spline knots from step 13 (auto-detected if not specified)" + help = "Directory containing spline knots from step 14 (auto-detected if not specified)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private val minMemLength by option( @@ -63,14 +65,14 @@ class ConvertRopebwt2Ps4g : CliktCommand(name = "convert-ropebwt2ps4g") { .default(DEFAULT_MAX_NUM_HITS) private fun collectBedFiles(): List { - // If no input specified, try to auto-detect from step 12 + // If 
no input specified, try to auto-detect from step 13 (ropebwt-mem) val actualInput = bedInput ?: run { - logger.info("No BED input specified, attempting to auto-detect from step 12") + logger.info("No BED input specified, attempting to auto-detect from step 13") FileUtils.autoDetectStepOutput( workDir, - "12_ropebwt_mem_results", + UPSTREAM_BED_DIR, logger, - "Please specify --bed-input or ensure step 12 (ropebwt-mem) has been run" + "Please specify --bed-input or ensure step 13 (ropebwt-mem) has been run" ) } @@ -85,9 +87,9 @@ class ConvertRopebwt2Ps4g : CliktCommand(name = "convert-ropebwt2ps4g") { private fun findSplineKnotDir(): Path { return FileUtils.autoDetectStepOutput( workDir, - "13_spline_knots_results", + UPSTREAM_SPLINE_DIR, logger, - "Please specify --spline-knot-dir manually or ensure step 13 (build-spline-knots) has been run" + "Please specify --spline-knot-dir manually or ensure step 14 (build-spline-knots) has been run" ) } diff --git a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt index e1fc13b..f328a9f 100644 --- a/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt +++ b/src/main/kotlin/net/maizegenetics/commands/Orchestrate.kt @@ -33,7 +33,10 @@ data class PipelineConfig( val generate_recombined_sequences: GenerateRecombinedSequencesConfig? = null, val format_recombined_fastas: FormatRecombinedFastasConfig? = null, val mutated_maf_to_gvcf: MutatedMafToGvcfConfig? = null, - val rope_bwt_chr_index: RopeBwtChrIndexConfig? = null + val rope_bwt_chr_index: RopeBwtChrIndexConfig? = null, + val ropebwt_mem: RopebwtMemConfig? = null, + val build_spline_knots: BuildSplineKnotsConfig? = null, + val convert_ropebwt2ps4g: ConvertRopebwt2Ps4gConfig? = null ) data class AlignAssembliesConfig( @@ -136,6 +139,33 @@ data class RopeBwtChrIndexConfig( val output: String? 
= null // Optional: Custom output directory ) +data class RopebwtMemConfig( + val fastq_input: String, // Required: FASTQ file, directory, or text list (no upstream auto-gen) + val index_file: String? = null, // Optional: .fmd index (defaults to step 12 output) + val l_value: Int? = null, // Optional: -l (defaults to 2 x FASTA count from step 12 keyfile) + val p_value: Int? = null, // Optional: -p (default: 168) + val threads: Int? = null, // Optional: number of threads (default: 1) + val output: String? = null // Optional: Custom output directory +) + +data class BuildSplineKnotsConfig( + val vcf_dir: String? = null, // Optional: VCF directory (defaults to step 11 mutated GVCFs) + val vcf_type: String? = null, // Optional: "hvcf" or "gvcf" (default: "hvcf") + val min_indel_length: Int? = null, // Optional: gVCF only + val num_bps_per_knot: Int? = null, // Optional: knot density + val contig_list: String? = null, // Optional: comma-separated chromosomes + val random_seed: Int? = null, // Optional: deterministic downsampling seed + val output: String? = null // Optional: Custom output directory +) + +data class ConvertRopebwt2Ps4gConfig( + val bed_input: String? = null, // Optional: BED file/dir/list (defaults to step 13) + val spline_knot_dir: String? = null, // Optional: spline knot dir (defaults to step 14) + val min_mem_length: Int? = null, // Optional: minimum MEM length threshold + val max_num_hits: Int? = null, // Optional: maximum haplotype hits per alignment + val output: String? = null // Optional: Custom output directory +) + class Orchestrate : CliktCommand(name = "orchestrate") { companion object { private const val LOG_FILE_NAME = "00_orchestrate.log" @@ -446,6 +476,49 @@ class Orchestrate : CliktCommand(name = "orchestrate") { ) } else null + // Parse ropebwt_mem - fastq_input is required when the section is present + @Suppress("UNCHECKED_CAST") + val ropebwtMemMap = configMap["ropebwt_mem"] as? 
Map + val ropebwtMem = if (configMap.containsKey("ropebwt_mem")) { + RopebwtMemConfig( + fastq_input = ropebwtMemMap?.get("fastq_input") as? String + ?: throw IllegalArgumentException("ropebwt_mem.fastq_input is required"), + index_file = ropebwtMemMap["index_file"] as? String, + l_value = ropebwtMemMap["l_value"] as? Int, + p_value = ropebwtMemMap["p_value"] as? Int, + threads = ropebwtMemMap["threads"] as? Int, + output = ropebwtMemMap["output"] as? String + ) + } else null + + // Parse build_spline_knots - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val buildSplineKnotsMap = configMap["build_spline_knots"] as? Map + val buildSplineKnots = if (configMap.containsKey("build_spline_knots")) { + BuildSplineKnotsConfig( + vcf_dir = buildSplineKnotsMap?.get("vcf_dir") as? String, + vcf_type = buildSplineKnotsMap?.get("vcf_type") as? String, + min_indel_length = buildSplineKnotsMap?.get("min_indel_length") as? Int, + num_bps_per_knot = buildSplineKnotsMap?.get("num_bps_per_knot") as? Int, + contig_list = buildSplineKnotsMap?.get("contig_list") as? String, + random_seed = buildSplineKnotsMap?.get("random_seed") as? Int, + output = buildSplineKnotsMap?.get("output") as? String + ) + } else null + + // Parse convert_ropebwt2ps4g - check if key exists (even with empty/null value means "run with defaults") + @Suppress("UNCHECKED_CAST") + val convertRopebwt2Ps4gMap = configMap["convert_ropebwt2ps4g"] as? Map + val convertRopebwt2Ps4g = if (configMap.containsKey("convert_ropebwt2ps4g")) { + ConvertRopebwt2Ps4gConfig( + bed_input = convertRopebwt2Ps4gMap?.get("bed_input") as? String, + spline_knot_dir = convertRopebwt2Ps4gMap?.get("spline_knot_dir") as? String, + min_mem_length = convertRopebwt2Ps4gMap?.get("min_mem_length") as? Int, + max_num_hits = convertRopebwt2Ps4gMap?.get("max_num_hits") as? Int, + output = convertRopebwt2Ps4gMap?.get("output") as? 
String + ) + } else null + return PipelineConfig( work_dir = workDir, run_steps = runSteps, @@ -460,7 +533,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { generate_recombined_sequences = generateRecombinedSequences, format_recombined_fastas = formatRecombinedFastas, mutated_maf_to_gvcf = mutatedMafToGvcf, - rope_bwt_chr_index = ropeBwtChrIndex + rope_bwt_chr_index = ropeBwtChrIndex, + ropebwt_mem = ropebwtMem, + build_spline_knots = buildSplineKnots, + convert_ropebwt2ps4g = convertRopebwt2Ps4g ) } catch (e: Exception) { logger.error("Failed to parse configuration file: ${e.message}", e) @@ -505,6 +581,38 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } logger.info("") + // Pre-flight sanity check: pinning a single sample_name in step 11 while + // step 12 is auto-generating its keyfile from FASTA basenames guarantees + // a name mismatch between step-14 spline knots (keyed by VCF sample name) + // and step-13 BED contigs (keyed by step-12 keyfile sample names). PHG's + // convert-ropebwt2ps4g-file silently drops every record in that case, so + // step 15 produces a 0-row PS4G. Warn loudly when both conditions hold. + val step11SampleNamePinned = + config.mutated_maf_to_gvcf?.sample_name != null && + shouldRunStep("mutated_maf_to_gvcf", config) + val step12AutoKeyfile = + config.rope_bwt_chr_index != null && + config.rope_bwt_chr_index.keyfile == null && + shouldRunStep("rope_bwt_chr_index", config) + if (step11SampleNamePinned && step12AutoKeyfile) { + logger.warn("=".repeat(80)) + logger.warn( + "WARNING: mutated_maf_to_gvcf.sample_name is pinned to " + + "'${config.mutated_maf_to_gvcf!!.sample_name}', but rope_bwt_chr_index " + + "is auto-generating its keyfile from FASTA basenames. This will collapse " + + "every mutated gVCF into a single VCF sample, so step-14 spline knots will " + + "be keyed by '${config.mutated_maf_to_gvcf.sample_name}' while step-13 BED " + + "contigs will be keyed by FASTA basenames (e.g. '0', '1'). 
" + + "PHG convert-ropebwt2ps4g-file will then silently drop every record and " + + "step 15 will produce an empty PS4G." + ) + logger.warn( + "Recommended fix: omit mutated_maf_to_gvcf.sample_name so each gVCF is sampled " + + "by its MAF basename, which matches the auto-generated step-12 keyfile." + ) + logger.warn("=".repeat(80)) + } + // Track outputs between steps var mafFilePaths: Path? = null var gvcfOutputDir: Path? = null @@ -519,7 +627,10 @@ class Orchestrate : CliktCommand(name = "orchestrate") { var recombinedFastasDir: Path? = null var formattedFastasDir: Path? = null var mutatedMafFilePaths: Path? = null // MAF file paths from step 10 (align_mutated_assemblies) + var mutatedGvcfOutputDir: Path? = null // Mutated GVCF output directory from step 11 var ropeBwtIndexDir: Path? = null // RopeBWT index output directory from step 12 + var ropeBwtMemOutputDir: Path? = null // BED output directory from step 13 (ropebwt_mem) + var splineKnotsOutputDir: Path? = null // Spline-knots output directory from step 14 (build_spline_knots) try { // Step 1: Align Assemblies (if configured and should run) @@ -1327,10 +1438,11 @@ class Orchestrate : CliktCommand(name = "orchestrate") { // Determine output directory (custom or default) - resolve to absolute path // Always use step 11 output directory by default (not MafToGvcf's default) - val mutatedGvcfOutputDir = (config.mutated_maf_to_gvcf.output_dir?.let { + val step11OutputDir = (config.mutated_maf_to_gvcf.output_dir?.let { Path.of(it).toAbsolutePath().normalize() } ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) .toAbsolutePath().normalize() + mutatedGvcfOutputDir = step11OutputDir // Determine output file if specified - resolve to absolute path val outputFile = config.mutated_maf_to_gvcf.output_file?.let { @@ -1339,13 +1451,13 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("Reference FASTA: $step11RefFasta") logger.info("MAF input: $mafInput") - logger.info("Output directory: 
$mutatedGvcfOutputDir") + logger.info("Output directory: $step11OutputDir") val args = buildList { add("--work-dir=$workDir") add("--reference-file=$step11RefFasta") add("--maf-file=$mafInput") - add("--output-dir=$mutatedGvcfOutputDir") // Always pass output dir to ensure step 11 location + add("--output-dir=$step11OutputDir") // Always pass output dir to ensure step 11 location if (outputFile != null) { add("--output-file=$outputFile") } @@ -1357,8 +1469,8 @@ class Orchestrate : CliktCommand(name = "orchestrate") { MafToGvcf().parse(args) restoreOrchestratorLogging(workDir) - if (!mutatedGvcfOutputDir.exists()) { - throw RuntimeException("Expected mutated GVCF output directory not found: $mutatedGvcfOutputDir") + if (!step11OutputDir.exists()) { + throw RuntimeException("Expected mutated GVCF output directory not found: $step11OutputDir") } logger.info("Step 11 completed successfully") @@ -1366,6 +1478,18 @@ class Orchestrate : CliktCommand(name = "orchestrate") { } else { if (config.mutated_maf_to_gvcf != null) { logger.info("Skipping mutated-maf-to-gvcf (not in run_steps)") + + // Check custom output location first, then default + val previousMutatedGvcfDir = (config.mutated_maf_to_gvcf.output_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: workDir.resolve("output").resolve("11_mutated_gvcf_results")) + .toAbsolutePath().normalize() + if (previousMutatedGvcfDir.exists()) { + mutatedGvcfOutputDir = previousMutatedGvcfDir + logger.info("Using previous mutated-maf-to-gvcf outputs: $mutatedGvcfOutputDir") + } else { + logger.warn("Previous mutated-maf-to-gvcf outputs not found. 
Downstream steps may fail.") + } } else { logger.info("Skipping mutated-maf-to-gvcf (not configured)") } @@ -1499,6 +1623,242 @@ class Orchestrate : CliktCommand(name = "orchestrate") { logger.info("") } + // Step 13: RopeBWT MEM Alignment (if configured and should run) + if (config.ropebwt_mem != null && shouldRunStep("ropebwt_mem", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 13: RopeBWT MEM Alignment") + logger.info("=".repeat(80)) + + // fastq_input is required when this step is configured + val fastqInput = Path.of(config.ropebwt_mem.fastq_input).toAbsolutePath().normalize() + if (!fastqInput.exists()) { + throw RuntimeException("Cannot run ropebwt-mem: FASTQ input not found at $fastqInput") + } + + // Resolve optional index_file override (defaults to .fmd discovered in step 12 output by RopeBwtMem itself) + val indexFileOverride = config.ropebwt_mem.index_file?.let { + Path.of(it).toAbsolutePath().normalize() + } + + // Determine output directory (custom or default) - resolve to absolute path + val customOutput = config.ropebwt_mem.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val step13OutputDir = (customOutput ?: workDir.resolve("output").resolve("13_ropebwt_mem_results")) + .toAbsolutePath().normalize() + + logger.info("FASTQ input: $fastqInput") + if (indexFileOverride != null) { + logger.info("Index file (override): $indexFileOverride") + } else if (ropeBwtIndexDir != null) { + logger.info("Index will be auto-detected from step 12 output: $ropeBwtIndexDir") + } + + val args = buildList { + add("--work-dir=$workDir") + add("--fastq-input=$fastqInput") + if (indexFileOverride != null) { + add("--index-file=$indexFileOverride") + } + if (config.ropebwt_mem.l_value != null) { + add("--l-value=${config.ropebwt_mem.l_value}") + } + if (config.ropebwt_mem.p_value != null) { + add("--p-value=${config.ropebwt_mem.p_value}") + } + if (config.ropebwt_mem.threads != null) { + add("--threads=${config.ropebwt_mem.threads}") + } + if 
(customOutput != null) { + add("--output-dir=$customOutput") + } + } + + RopeBwtMem().parse(args) + restoreOrchestratorLogging(workDir) + + ropeBwtMemOutputDir = step13OutputDir + + if (!ropeBwtMemOutputDir.exists()) { + throw RuntimeException("Expected ropebwt-mem output directory not found: $ropeBwtMemOutputDir") + } + + logger.info("Step 13 completed successfully") + logger.info("") + } else { + if (config.ropebwt_mem != null) { + logger.info("Skipping ropebwt-mem (not in run_steps)") + + val customOutput = config.ropebwt_mem.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val previousMemDir = (customOutput ?: workDir.resolve("output").resolve("13_ropebwt_mem_results")) + .toAbsolutePath().normalize() + if (previousMemDir.exists()) { + ropeBwtMemOutputDir = previousMemDir + logger.info("Using previous ropebwt-mem outputs: $ropeBwtMemOutputDir") + } else { + logger.warn("Previous ropebwt-mem outputs not found. Downstream steps may fail.") + } + } else { + logger.info("Skipping ropebwt-mem (not configured)") + } + logger.info("") + } + + // Step 14: Build Spline Knots (if configured and should run) + if (config.build_spline_knots != null && shouldRunStep("build_spline_knots", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 14: Build Spline Knots") + logger.info("=".repeat(80)) + + // Determine VCF input directory. Prefer config.vcf_dir; otherwise + // chain from step 11 (mutated GVCFs) when available. 
+ val vcfDir = config.build_spline_knots.vcf_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: mutatedGvcfOutputDir + if (vcfDir == null) { + throw RuntimeException("Cannot run build-spline-knots: no VCF input available (specify 'vcf_dir' in config or run mutated-maf-to-gvcf first)") + } + if (!vcfDir.exists()) { + throw RuntimeException("Cannot run build-spline-knots: VCF input directory not found at $vcfDir") + } + logger.info("VCF directory: $vcfDir") + + // Default to "gvcf" when chaining from step 11 (which produces gVCFs) + // and the user hasn't pinned a vcf_type explicitly. + val vcfType = config.build_spline_knots.vcf_type + ?: if (config.build_spline_knots.vcf_dir == null) "gvcf" else null + if (vcfType != null) { + logger.info("VCF type: $vcfType") + } + + // Determine output directory (custom or default) + val customOutput = config.build_spline_knots.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val step14OutputDir = (customOutput ?: workDir.resolve("output").resolve("14_spline_knots_results")) + .toAbsolutePath().normalize() + + val args = buildList { + add("--work-dir=$workDir") + add("--vcf-dir=$vcfDir") + if (vcfType != null) { + add("--vcf-type=$vcfType") + } + if (config.build_spline_knots.min_indel_length != null) { + add("--min-indel-length=${config.build_spline_knots.min_indel_length}") + } + if (config.build_spline_knots.num_bps_per_knot != null) { + add("--num-bps-per-knot=${config.build_spline_knots.num_bps_per_knot}") + } + if (config.build_spline_knots.contig_list != null) { + add("--contig-list=${config.build_spline_knots.contig_list}") + } + if (config.build_spline_knots.random_seed != null) { + add("--random-seed=${config.build_spline_knots.random_seed}") + } + if (customOutput != null) { + add("--output-dir=$customOutput") + } + } + + BuildSplineKnots().parse(args) + restoreOrchestratorLogging(workDir) + + splineKnotsOutputDir = step14OutputDir + + if (!splineKnotsOutputDir.exists()) { + throw 
RuntimeException("Expected build-spline-knots output directory not found: $splineKnotsOutputDir") + } + + logger.info("Step 14 completed successfully") + logger.info("") + } else { + if (config.build_spline_knots != null) { + logger.info("Skipping build-spline-knots (not in run_steps)") + + val customOutput = config.build_spline_knots.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val previousSplineDir = (customOutput ?: workDir.resolve("output").resolve("14_spline_knots_results")) + .toAbsolutePath().normalize() + if (previousSplineDir.exists()) { + splineKnotsOutputDir = previousSplineDir + logger.info("Using previous build-spline-knots outputs: $splineKnotsOutputDir") + } else { + logger.warn("Previous build-spline-knots outputs not found. Downstream steps may fail.") + } + } else { + logger.info("Skipping build-spline-knots (not configured)") + } + logger.info("") + } + + // Step 15: Convert RopeBWT to PS4G (if configured and should run) + if (config.convert_ropebwt2ps4g != null && shouldRunStep("convert_ropebwt2ps4g", config)) { + logger.info("=".repeat(80)) + logger.info("STEP 15: Convert RopeBWT to PS4G") + logger.info("=".repeat(80)) + + // BED input: explicit override, else chain from step 13 + val bedInput = config.convert_ropebwt2ps4g.bed_input?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: ropeBwtMemOutputDir + if (bedInput == null) { + throw RuntimeException("Cannot run convert-ropebwt2ps4g: no BED input available (specify 'bed_input' in config or run ropebwt-mem first)") + } + logger.info("BED input: $bedInput") + + // Spline knots: explicit override, else chain from step 14 + val splineKnotDir = config.convert_ropebwt2ps4g.spline_knot_dir?.let { + Path.of(it).toAbsolutePath().normalize() + } ?: splineKnotsOutputDir + if (splineKnotDir == null) { + throw RuntimeException("Cannot run convert-ropebwt2ps4g: no spline-knot directory available (specify 'spline_knot_dir' in config or run build-spline-knots first)") + } + 
logger.info("Spline knot directory: $splineKnotDir") + + // Determine output directory (custom or default) + val customOutput = config.convert_ropebwt2ps4g.output?.let { + Path.of(it).toAbsolutePath().normalize() + } + val step15OutputDir = (customOutput ?: workDir.resolve("output").resolve("15_convert_ropebwt2ps4g_results")) + .toAbsolutePath().normalize() + + val args = buildList { + add("--work-dir=$workDir") + add("--bed-input=$bedInput") + add("--spline-knot-dir=$splineKnotDir") + if (config.convert_ropebwt2ps4g.min_mem_length != null) { + add("--min-mem-length=${config.convert_ropebwt2ps4g.min_mem_length}") + } + if (config.convert_ropebwt2ps4g.max_num_hits != null) { + add("--max-num-hits=${config.convert_ropebwt2ps4g.max_num_hits}") + } + if (customOutput != null) { + add("--output-dir=$customOutput") + } + } + + ConvertRopebwt2Ps4g().parse(args) + restoreOrchestratorLogging(workDir) + + if (!step15OutputDir.exists()) { + throw RuntimeException("Expected convert-ropebwt2ps4g output directory not found: $step15OutputDir") + } + + logger.info("Step 15 completed successfully") + logger.info("") + } else { + if (config.convert_ropebwt2ps4g != null) { + logger.info("Skipping convert-ropebwt2ps4g (not in run_steps)") + } else { + logger.info("Skipping convert-ropebwt2ps4g (not configured)") + } + logger.info("") + } + // Pipeline completed successfully logger.info("=".repeat(80)) logger.info("PIPELINE COMPLETED SUCCESSFULLY!") diff --git a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt index be81f3f..71afc53 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RopeBwtChrIndex.kt @@ -19,7 +19,7 @@ import kotlin.system.exitProcess class RopeBwtChrIndex : CliktCommand(name = "rope-bwt-chr-index") { companion object { - private const val LOG_FILE_NAME = "11_rope_bwt_chr_index.log" + private const val LOG_FILE_NAME = 
"12_rope_bwt_chr_index.log" private const val ROPE_BWT_RESULTS_DIR = "12_rope_bwt_index_results" private const val KEYFILE_NAME = "phg_keyfile.txt" private const val DEFAULT_INDEX_PREFIX = "phgIndex" diff --git a/src/main/kotlin/net/maizegenetics/commands/RopeBwtMem.kt b/src/main/kotlin/net/maizegenetics/commands/RopeBwtMem.kt index a646be9..9b732c3 100644 --- a/src/main/kotlin/net/maizegenetics/commands/RopeBwtMem.kt +++ b/src/main/kotlin/net/maizegenetics/commands/RopeBwtMem.kt @@ -18,8 +18,9 @@ import kotlin.system.exitProcess class RopeBwtMem : CliktCommand(name = "ropebwt-mem") { companion object { - private const val LOG_FILE_NAME = "12_ropebwt_mem.log" - private const val ROPEBWT_MEM_RESULTS_DIR = "12_ropebwt_mem_results" + private const val LOG_FILE_NAME = "13_ropebwt_mem.log" + private const val ROPEBWT_MEM_RESULTS_DIR = "13_ropebwt_mem_results" + private const val UPSTREAM_INDEX_DIR = "12_rope_bwt_index_results" private const val BED_FILE_PATHS_FILE = "bed_file_paths.txt" private const val DEFAULT_P_VALUE = 168 private const val KEYFILE_NAME = "phg_keyfile.txt" @@ -40,12 +41,12 @@ class RopeBwtMem : CliktCommand(name = "ropebwt-mem") { private val indexFile by option( "--index-file", "-i", - help = "Path to the .fmd index file from rope-bwt-chr-index (auto-detected from step 11 if not specified)" + help = "Path to the .fmd index file from rope-bwt-chr-index (auto-detected from step 12 if not specified)" ).path(mustExist = false, canBeFile = true, canBeDir = false) private val lValue by option( "--l-value", "-l", - help = "The -l parameter value (auto-calculated as 2 * number of FASTA samples from step 11 if not specified)" + help = "The -l parameter value (auto-calculated as 2 * number of FASTA samples from step 12 if not specified)" ).int() private val pValue by option( @@ -62,7 +63,7 @@ class RopeBwtMem : CliktCommand(name = "ropebwt-mem") { private val outputDirOption by option( "--output-dir", "-o", - help = "Custom output directory (default: 
work_dir/output/12_ropebwt_mem_results)" + help = "Custom output directory (default: work_dir/output/13_ropebwt_mem_results)" ).path(mustExist = false, canBeFile = false, canBeDir = true) private fun collectFastqFiles(): List { @@ -75,21 +76,20 @@ class RopeBwtMem : CliktCommand(name = "ropebwt-mem") { } private fun calculateLValue(): Int { - // Try to find keyfile from step 11 - val step11OutputDir = workDir.resolve("output").resolve("11_rope_bwt_index_results") - val keyfilePath = step11OutputDir.resolve(KEYFILE_NAME) + // Try to find keyfile from rope-bwt-chr-index (step 12) + val upstreamDir = workDir.resolve("output").resolve(UPSTREAM_INDEX_DIR) + val keyfilePath = upstreamDir.resolve(KEYFILE_NAME) if (!keyfilePath.exists()) { logger.error("Cannot auto-calculate -l value: keyfile not found at $keyfilePath") - logger.error("Please specify --l-value manually or ensure step 11 (rope-bwt-chr-index) has been run") + logger.error("Please specify --l-value manually or ensure step 12 (rope-bwt-chr-index) has been run") exitProcess(1) } - logger.info("Reading keyfile from step 11: $keyfilePath") - val lines = keyfilePath.readLines() - - // Count lines excluding header - val fastaCount = lines.size - 1 + logger.info("Reading keyfile from step 12: $keyfilePath") + // Keyfile written by RopeBwtChrIndex / Orchestrate has no header: every + // non-blank line is `\t`. Count those directly. 
+ val fastaCount = keyfilePath.readLines().count { it.isNotBlank() } if (fastaCount <= 0) { logger.error("Keyfile has no FASTA entries: $keyfilePath") exitProcess(1) @@ -101,25 +101,25 @@ class RopeBwtMem : CliktCommand(name = "ropebwt-mem") { } private fun findIndexFile(): Path { - val step11OutputDir = workDir.resolve("output").resolve("11_rope_bwt_index_results") + val upstreamDir = workDir.resolve("output").resolve(UPSTREAM_INDEX_DIR) - if (!step11OutputDir.exists()) { - logger.error("Cannot auto-detect index file: step 11 output directory not found at $step11OutputDir") - logger.error("Please specify --index-file manually or ensure step 11 (rope-bwt-chr-index) has been run") + if (!upstreamDir.exists()) { + logger.error("Cannot auto-detect index file: step 12 output directory not found at $upstreamDir") + logger.error("Please specify --index-file manually or ensure step 12 (rope-bwt-chr-index) has been run") exitProcess(1) } // Look for .fmd files in the directory - val fmdFiles = step11OutputDir.listDirectoryEntries("*.fmd") + val fmdFiles = upstreamDir.listDirectoryEntries("*.fmd") if (fmdFiles.isEmpty()) { - logger.error("Cannot auto-detect index file: no .fmd files found in $step11OutputDir") + logger.error("Cannot auto-detect index file: no .fmd files found in $upstreamDir") logger.error("Please specify --index-file manually") exitProcess(1) } if (fmdFiles.size > 1) { - logger.warn("Multiple .fmd files found in $step11OutputDir") + logger.warn("Multiple .fmd files found in $upstreamDir") logger.warn("Using the first one: ${fmdFiles[0].fileName}") } diff --git a/src/main/kotlin/net/maizegenetics/utils/FileUtils.kt b/src/main/kotlin/net/maizegenetics/utils/FileUtils.kt index 762506d..dfe76cb 100644 --- a/src/main/kotlin/net/maizegenetics/utils/FileUtils.kt +++ b/src/main/kotlin/net/maizegenetics/utils/FileUtils.kt @@ -169,7 +169,7 @@ object FileUtils { * Auto-detects output from a previous pipeline step * * @param workDir The working directory - * @param 
stepDirName The step directory name (e.g., "12_ropebwt_mem_results") + * @param stepDirName The step directory name (e.g., "13_ropebwt_mem_results") * @param logger Logger for error messages * @param customMessage Optional custom error message * @return The detected directory path From 5e2ffb4e5aeffdb57f1f40f598f42bed13063b2a Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 22 May 2026 08:30:33 -0500 Subject: [PATCH 17/18] Add integration tests for 13-15 --- .../commands/ConvertRopebwt2Ps4gUnitTest.kt | 174 +++++++++++ .../commands/RopeBwtChrIndexUnitTest.kt | 131 ++++++++ .../commands/RopeBwtMemUnitTest.kt | 178 +++++++++++ .../integration/OrchestrateE2ETest.kt | 285 ++++++++++++++++-- 4 files changed, 749 insertions(+), 19 deletions(-) create mode 100644 src/test/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4gUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/RopeBwtChrIndexUnitTest.kt create mode 100644 src/test/kotlin/net/maizegenetics/commands/RopeBwtMemUnitTest.kt diff --git a/src/test/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4gUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4gUnitTest.kt new file mode 100644 index 0000000..4367553 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/ConvertRopebwt2Ps4gUnitTest.kt @@ -0,0 +1,174 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.io.File +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.exists +import kotlin.io.path.readLines +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [ConvertRopebwt2Ps4g]. 
Verifies the PHG CLI invocation, + * auto-detection of step-13 BED outputs and step-14 spline knots, and the + * ps4g_file_paths.txt manifest -- without actually running phg. + */ +class ConvertRopebwt2Ps4gUnitTest { + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + + /** Pretend step 13 already ran: drop BED files into the expected directory. */ + private fun stubUpstreamBeds(workDir: Path, sampleNames: List): Path { + val bedDir = workDir.resolve("output/13_ropebwt_mem_results") + .also { it.createDirectories() } + sampleNames.forEach { name -> + bedDir.resolve("$name.bed").writeText("chr1\t0\t10\n") + } + return bedDir + } + + /** Pretend step 14 already ran: create the spline-knots directory. */ + private fun stubUpstreamSplineKnots(workDir: Path): Path { + return workDir.resolve("output/14_spline_knots_results") + .also { + it.createDirectories() + it.resolve("chr1.knots").writeText("0\t100\n") + } + } + + /** + * RecordingProcessExecutor that simulates PHG's side-effect of creating + * a `.ps4g` file in the output dir on success. 
+ */ + private fun phgSucceedingExecutor() = RecordingProcessExecutor(defaultExitCode = 0) { inv -> + val outputDir = inv.command.dropWhile { it != "--output-dir" }.getOrNull(1)?.let { File(it) } + val bedArg = inv.command.dropWhile { it != "--ropebwt-bed" }.getOrNull(1)?.let { File(it) } + if (outputDir != null && bedArg != null) { + outputDir.mkdirs() + File(outputDir, "${bedArg.nameWithoutExtension}.ps4g").writeText("PS4G\n") + } + 0 + } + + @Test + fun autoDetectsBedAndSplineDirAndWritesPs4gPaths(@TempDir workDir: Path) { + stubPhgBinary(workDir) + stubUpstreamBeds(workDir, listOf("sampleA", "sampleB")) + stubUpstreamSplineKnots(workDir) + + val executor = phgSucceedingExecutor() + ProcessRunner.withExecutor(executor) { + ConvertRopebwt2Ps4g().parse( + listOf( + "--work-dir", workDir.toString() + ) + ) + } + + // Two BED inputs -> two phg invocations, each pointing at the + // auto-detected step-13 and step-14 directories. + assertEquals(2, executor.invocations.size) + val expectedSplineDir = workDir.resolve("output/14_spline_knots_results") + .toAbsolutePath().toString() + val expectedOutputDir = workDir.resolve("output/15_convert_ropebwt2ps4g_results") + .toAbsolutePath().toString() + executor.invocations.forEach { inv -> + assertTrue(inv.command.first().endsWith("phg")) + assertEquals("convert-ropebwt2ps4g-file", inv.command[1]) + assertEquals(expectedSplineDir, inv.argAfter("--spline-knot-dir")) + assertEquals(expectedOutputDir, inv.argAfter("--output-dir")) + assertEquals("135", inv.argAfter("--min-mem-length")) + assertEquals("16", inv.argAfter("--max-num-hits")) + } + + // ps4g_file_paths.txt should enumerate the two PS4G outputs. 
+ val pathsFile = workDir.resolve("output/15_convert_ropebwt2ps4g_results/ps4g_file_paths.txt") + assertTrue(pathsFile.exists(), "ps4g_file_paths.txt should be written") + val lines = pathsFile.readLines().filter { it.isNotBlank() } + assertEquals(2, lines.size) + assertTrue(lines.all { it.endsWith(".ps4g") }) + assertTrue(lines.any { it.endsWith("/sampleA.ps4g") }) + assertTrue(lines.any { it.endsWith("/sampleB.ps4g") }) + } + + @Test + fun customMemAndHitsParametersAreForwarded(@TempDir workDir: Path) { + stubPhgBinary(workDir) + stubUpstreamBeds(workDir, listOf("only")) + stubUpstreamSplineKnots(workDir) + + val executor = phgSucceedingExecutor() + ProcessRunner.withExecutor(executor) { + ConvertRopebwt2Ps4g().parse( + listOf( + "--work-dir", workDir.toString(), + "--min-mem-length", "148", + "--max-num-hits", "50" + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("148", inv.argAfter("--min-mem-length")) + assertEquals("50", inv.argAfter("--max-num-hits")) + } + + @Test + fun explicitBedAndSplineDirOverridesAreUsed(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // Custom locations -- nothing under output/. 
+ val customBedDir = workDir.resolve("my_beds").also { it.createDirectories() } + customBedDir.resolve("custom.bed").writeText("chr1\t0\t5\n") + val customSplineDir = workDir.resolve("my_spline").also { + it.createDirectories() + it.resolve("chr1.knots").writeText("0\t100\n") + } + val customOut = workDir.resolve("my_ps4g_out") + + val executor = phgSucceedingExecutor() + ProcessRunner.withExecutor(executor) { + ConvertRopebwt2Ps4g().parse( + listOf( + "--work-dir", workDir.toString(), + "--bed-input", customBedDir.toString(), + "--spline-knot-dir", customSplineDir.toString(), + "--output-dir", customOut.toString() + ) + ) + } + + val inv = executor.invocations.single() + assertEquals( + customSplineDir.toAbsolutePath().toString(), + inv.argAfter("--spline-knot-dir") + ) + assertEquals( + customOut.toAbsolutePath().toString(), + inv.argAfter("--output-dir") + ) + assertTrue( + customOut.resolve("custom.ps4g").exists(), + "PS4G output should land under --output-dir, not the default step-15 path" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/RopeBwtChrIndexUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/RopeBwtChrIndexUnitTest.kt new file mode 100644 index 0000000..422b8c6 --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/RopeBwtChrIndexUnitTest.kt @@ -0,0 +1,131 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.exists +import kotlin.io.path.readLines +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [RopeBwtChrIndex]. 
Verifies the PHG CLI invocation, the + * auto-generated keyfile contents, and the --delete-fmr-index presence-flag + * wiring -- all without actually shelling out to phg. + */ +class RopeBwtChrIndexUnitTest { + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Create a fake PHG layout (bin/phg) inside [workDir] so the command's + * [net.maizegenetics.utils.ValidationUtils.validatePhgSetup] passes. + */ + private fun stubPhgBinary(workDir: Path): Path { + val phgDir = workDir.resolve("src/phg_v2/bin") + phgDir.createDirectories() + val phg = phgDir.resolve("phg") + phg.writeText("#!/bin/sh\nexit 0\n") + phg.toFile().setExecutable(true) + return phg + } + + private fun writeFasta(path: Path, contig: String = ">chr1\nACGT\n") { + path.parent?.createDirectories() + path.writeText(contig) + } + + @Test + fun fastaInputAutoGeneratesKeyfileAndInvokesPhg(@TempDir workDir: Path) { + stubPhgBinary(workDir) + val fastaDir = workDir.resolve("fastas").also { it.createDirectories() } + writeFasta(fastaDir.resolve("sampleA.fa")) + writeFasta(fastaDir.resolve("sampleB.fa")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + RopeBwtChrIndex().parse( + listOf( + "--work-dir", workDir.toString(), + "--fasta-input", fastaDir.toString(), + "--threads", "4" + ) + ) + } + + assertEquals(1, executor.invocations.size, "phg should be invoked exactly once") + val inv = executor.invocations.single() + assertTrue(inv.command.first().endsWith("phg")) + assertEquals("rope-bwt-chr-index", inv.command[1]) + assertEquals("4", inv.argAfter("--threads")) + assertEquals("phgIndex", inv.argAfter("--index-file-prefix")) + + // --delete-fmr-index is a presence flag and must be absent by default. + assertTrue(!inv.command.contains("--delete-fmr-index")) + + // Output directory matches the v0.2 step-12 layout. 
+ val expectedOutput = workDir.resolve("output/12_rope_bwt_index_results") + assertEquals(expectedOutput.toAbsolutePath().toString(), inv.argAfter("--output-dir")) + assertTrue(expectedOutput.exists(), "Output directory should be created") + + // The keyfile must be generated next to the index files, contain one + // line per FASTA, and have no header (downstream `ropebwt-mem` reads + // it as raw `\t` rows). + val keyfile = expectedOutput.resolve("phg_keyfile.txt") + assertTrue(keyfile.exists(), "Auto-generated keyfile should exist") + val keyLines = keyfile.readLines().filter { it.isNotBlank() } + assertEquals(2, keyLines.size, "Two FASTAs -> two keyfile rows") + assertTrue(keyLines.all { it.split("\t").size == 2 }, "Each row is pathsample") + val sampleNames = keyLines.map { it.split("\t")[1] } + assertEquals(setOf("sampleA", "sampleB"), sampleNames.toSet()) + } + + @Test + fun providedKeyfileIsForwardedWithoutRegeneration(@TempDir workDir: Path) { + stubPhgBinary(workDir) + + // Real on-disk FASTA so validateKeyfile passes. 
+ val fastaDir = workDir.resolve("fastas").also { it.createDirectories() } + val fastaA = fastaDir.resolve("a.fa").also { writeFasta(it) } + + val outputDir = workDir.resolve("phg_index") + val keyfile = workDir.resolve("custom_keyfile.txt").apply { + writeText("${fastaA.toAbsolutePath()}\tcustomSample\n") + } + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + RopeBwtChrIndex().parse( + listOf( + "--work-dir", workDir.toString(), + "--keyfile", keyfile.toString(), + "--output-dir", outputDir.toString(), + "--index-file-prefix", "myIndex", + "--delete-fmr-index" + ) + ) + } + + val inv = executor.invocations.single() + assertEquals(keyfile.toAbsolutePath().toString(), inv.argAfter("--keyfile")) + assertEquals(outputDir.toAbsolutePath().toString(), inv.argAfter("--output-dir")) + assertEquals("myIndex", inv.argAfter("--index-file-prefix")) + assertTrue(inv.command.contains("--delete-fmr-index")) + + // Should NOT have written its own keyfile alongside the index when one + // was passed in explicitly. 
+ assertTrue( + !outputDir.resolve("phg_keyfile.txt").exists(), + "Provided keyfile must not trigger an auto-generated one" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/commands/RopeBwtMemUnitTest.kt b/src/test/kotlin/net/maizegenetics/commands/RopeBwtMemUnitTest.kt new file mode 100644 index 0000000..4e6145d --- /dev/null +++ b/src/test/kotlin/net/maizegenetics/commands/RopeBwtMemUnitTest.kt @@ -0,0 +1,178 @@ +package net.maizegenetics.commands + +import com.github.ajalt.clikt.core.parse +import net.maizegenetics.utils.ProcessRunner +import net.maizegenetics.utils.testing.RecordingProcessExecutor +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.io.TempDir +import java.nio.file.Path +import kotlin.io.path.createDirectories +import kotlin.io.path.exists +import kotlin.io.path.readLines +import kotlin.io.path.writeText +import kotlin.test.assertEquals +import kotlin.test.assertTrue + +/** + * Unit tests for [RopeBwtMem]. Each test installs a [RecordingProcessExecutor] + * so we can assert on the exact `pixi run ropebwt3 mem ...` invocation line, + * the auto-detected -l value, and the per-sample BED layout, without + * actually running ropebwt3. 
+ */ +class RopeBwtMemUnitTest { + + @AfterEach + fun cleanup() { + ProcessRunner.resetExecutor() + } + + /** + * Lay out a fake "upstream" `12_rope_bwt_index_results/` directory: + * - a .fmd index file so findIndexFile() succeeds + * - a no-header keyfile so calculateLValue() returns 2 * N + */ + private fun stubUpstreamRopeBwtIndex(workDir: Path, sampleCount: Int = 3): Path { + val upstream = workDir.resolve("output/12_rope_bwt_index_results") + .also { it.createDirectories() } + upstream.resolve("phgIndex.fmd").writeText("FMD") + val keyfile = upstream.resolve("phg_keyfile.txt") + val lines = (1..sampleCount).map { i -> "/dev/null/sample$i.fa\tsample$i" } + keyfile.writeText(lines.joinToString("\n")) + return upstream + } + + private fun writeFastq(path: Path) { + path.parent?.createDirectories() + path.writeText( + """ + @read1 + ACGTACGTAC + + + !!!!!!!!!! + + """.trimIndent() + ) + } + + @Test + fun ropebwtMemAutoDetectsIndexAndLValue(@TempDir workDir: Path) { + stubUpstreamRopeBwtIndex(workDir, sampleCount = 4) + + val fastqDir = workDir.resolve("fastqs").also { it.createDirectories() } + writeFastq(fastqDir.resolve("sampleA.fq")) + writeFastq(fastqDir.resolve("sampleB.fq")) + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + RopeBwtMem().parse( + listOf( + "--work-dir", workDir.toString(), + "--fastq-input", fastqDir.toString(), + "--threads", "8" + ) + ) + } + + // Two FASTQ files -> two ropebwt3 invocations. + assertEquals(2, executor.invocations.size, "One ropebwt3 mem call per FASTQ") + + // Every invocation should target the auto-detected .fmd index, use the + // calculated -l value (2 x 4 = 8), and forward --threads / -p defaults. 
+ val expectedIndex = workDir.resolve("output/12_rope_bwt_index_results/phgIndex.fmd") + .toAbsolutePath().toString() + executor.invocations.forEach { inv -> + // RecordingProcessExecutor receives the command AFTER ProcessRunner + // strips the leading "pixi run" prefix when SEQ_SIM_SKIP_PIXI_PREFIX + // is enabled. Support both shapes so the test is independent of + // that env var. + val cmd = inv.command + val firstTool = if (cmd.size >= 2 && cmd[0] == "pixi" && cmd[1] == "run") cmd[2] else cmd[0] + assertEquals("ropebwt3", firstTool, "First non-pixi token should be ropebwt3") + assertTrue(cmd.contains("mem"), "Subcommand should be `mem`") + assertEquals("8", inv.argAfter("-t")) + assertEquals("8", inv.argAfter("-l")) + assertEquals("168", inv.argAfter("-p")) + // Index path is one of the positional args near the end. + assertTrue(cmd.contains(expectedIndex), "Index path should appear: $expectedIndex") + } + + // BED outputs should land in the step-13 dir, one per sample, with the `_ropebwt.bed` suffix. + val expectedDir = workDir.resolve("output/13_ropebwt_mem_results") + assertTrue(expectedDir.exists(), "Step 13 output dir should be created") + assertTrue( + expectedDir.resolve("sampleA_ropebwt.bed").exists(), + "sampleA BED file should be created (RecordingProcessExecutor stubs outputFile)" + ) + assertTrue( + expectedDir.resolve("sampleB_ropebwt.bed").exists(), + "sampleB BED file should be created" + ) + + // bed_file_paths.txt must enumerate every successful BED in absolute form. + val bedPathsFile = expectedDir.resolve("bed_file_paths.txt") + assertTrue(bedPathsFile.exists()) + val bedLines = bedPathsFile.readLines().filter { it.isNotBlank() } + assertEquals(2, bedLines.size) + assertTrue(bedLines.all { it.endsWith("_ropebwt.bed") }) + } + + @Test + fun explicitLValueAndIndexOverrideAutoDetect(@TempDir workDir: Path) { + // No upstream index dir -- we're overriding both explicitly. 
+ workDir.createDirectories() + // RopeBwtMem still requires the working directory to exist (for + // ValidationUtils.validateWorkingDirectory). + + val fastq = workDir.resolve("only.fq").also { writeFastq(it) } + val explicitIndex = workDir.resolve("custom_index.fmd").apply { writeText("FMD") } + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + RopeBwtMem().parse( + listOf( + "--work-dir", workDir.toString(), + "--fastq-input", fastq.toString(), + "--index-file", explicitIndex.toString(), + "--l-value", "33", + "--p-value", "250", + "--threads", "2" + ) + ) + } + + val inv = executor.invocations.single() + assertEquals("2", inv.argAfter("-t")) + assertEquals("33", inv.argAfter("-l")) + assertEquals("250", inv.argAfter("-p")) + assertTrue( + inv.command.contains(explicitIndex.toAbsolutePath().toString()), + "Explicit index path should be forwarded verbatim" + ) + } + + @Test + fun customOutputDirIsRespected(@TempDir workDir: Path) { + stubUpstreamRopeBwtIndex(workDir, sampleCount = 1) + val fastq = workDir.resolve("solo.fastq").also { writeFastq(it) } + val customOut = workDir.resolve("custom_bed_out") + + val executor = RecordingProcessExecutor(defaultExitCode = 0) + ProcessRunner.withExecutor(executor) { + RopeBwtMem().parse( + listOf( + "--work-dir", workDir.toString(), + "--fastq-input", fastq.toString(), + "--output-dir", customOut.toString() + ) + ) + } + + // BED file lands in the custom output directory, not the default. 
+ assertTrue(customOut.resolve("solo_ropebwt.bed").exists()) + assertTrue( + !workDir.resolve("output/13_ropebwt_mem_results").exists(), + "Default output dir should NOT be created when --output-dir is provided" + ) + } +} diff --git a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt index f74fb88..e50f6a2 100644 --- a/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt +++ b/src/test/kotlin/net/maizegenetics/integration/OrchestrateE2ETest.kt @@ -15,7 +15,7 @@ import kotlin.test.assertTrue /** * End-to-end test: run `orchestrate` against the smallseq test fixtures - * through every pipeline step (1-9) and assert that each step's expected + * through every pipeline step (1-15) and assert that each step's expected * outputs are produced. * * Only runs inside the seq-sim-dev container (SEQ_SIM_IN_CONTAINER=1). @@ -45,13 +45,74 @@ class OrchestrateE2ETest { } /** - * Full pipeline (steps 1-9) E2E: align_assemblies -> maf_to_gvcf -> - * downsample_gvcf -> convert_to_fasta -> pick_crossovers -> - * create_chain_files -> convert_coordinates -> - * generate_recombined_sequences -> format_recombined_fastas. + * Synthesize a tiny but well-formed FASTQ file from one of the smallseq + * query FASTAs. We slide a fixed-width window across the concatenated + * sequence so every read is a true substring of a parent assembly -- + * giving step 13 (`ropebwt3 mem`) a meaningful match rate against the + * step-12 PHG index without checking external fixture files into the + * repo. * - * Validates that every step's expected outputs are produced and that - * the orchestrator chains them together correctly end-to-end. + * Phred+33 quality is set to `I` (Q40) across the board; ropebwt3 does + * not match on quality scores, but it still reads every field of a record, + * so we keep the FASTQ syntactically valid. 
+ */ + private fun synthesizeFastqFromFasta( + sourceFasta: Path, + target: Path, + readLength: Int = 150, + numReads: Int = 200, + sampleName: String = "synthetic" + ) { + val sequence = buildString { + sourceFasta.toFile().useLines { lines -> + lines.forEach { line -> + if (!line.startsWith(">") && line.isNotBlank()) append(line.trim()) + } + } + } + require(sequence.length >= readLength) { + "Source FASTA $sourceFasta is shorter than the requested read length ($readLength)" + } + + val step = ((sequence.length - readLength) / numReads).coerceAtLeast(1) + val quality = "I".repeat(readLength) + + target.parent?.createDirectories() + target.toFile().bufferedWriter().use { out -> + var written = 0 + var offset = 0 + while (written < numReads && offset + readLength <= sequence.length) { + val read = sequence.substring(offset, offset + readLength) + out.write("@${sampleName}_read${written + 1}\n") + out.write("$read\n") + out.write("+\n") + out.write("$quality\n") + written += 1 + offset += step + } + } + } + + /** + * Full pipeline (steps 1-15) E2E. Validates that every step's expected + * outputs are produced and that the orchestrator chains them together + * correctly end-to-end: + * + * 1. align_assemblies -> 01_anchorwave_results/ + * 2. maf_to_gvcf -> 02_gvcf_results/ + * 3. downsample_gvcf -> 03_downsample_results/ + * 4. convert_to_fasta -> 04_fasta_results/ + * 5. pick_crossovers -> 05_crossovers_results/ + * 6. create_chain_files -> 06_chain_results/ + * 7. convert_coordinates -> 07_coordinates_results/ + * 8. generate_recombined_sequences -> 08_recombined_sequences/ + * 9. format_recombined_fastas -> 09_formatted_fastas/ + * 10. align_mutated_assemblies -> 10_mutated_alignment_results/ + * 11. mutated_maf_to_gvcf -> 11_mutated_gvcf_results/ + * 12. rope_bwt_chr_index -> 12_rope_bwt_index_results/ + * 13. ropebwt_mem -> 13_ropebwt_mem_results/ + * 14. build_spline_knots -> 14_spline_knots_results/ + * 15. 
convert_ropebwt2ps4g -> 15_convert_ropebwt2ps4g_results/ * * Uses a persistent working directory under `build/test-output/` (not * [org.junit.jupiter.api.io.TempDir]) so intermediate pipeline outputs @@ -60,25 +121,23 @@ class OrchestrateE2ETest { * test hermetic. */ @Test - fun orchestrateRunsFullPipelineStepsOneThroughNine() { - // Steps 1-9 require: PHG + AnchorWave (step 1), biokotlin-tools - // (step 2), MLImpute (steps 3-4 and the python scripts that back - // pick_crossovers / convert_coordinates / generate_recombined_sequences), - // and seqkit (step 9). The orchestrator's auto-run of - // setup-environment populates biokotlin-tools and MLImpute on first - // run; the PHGv2 binary is picked up from SEQ_SIM_PHG_DIR. + fun orchestrateRunsFullPipelineStepsOneThroughFifteen() { + // Every PHG-backed step needs the phg binary + anchorwave on PATH; + // steps 13/15 additionally need `ropebwt3` (provided by the PHGv2 + // conda env that the dev container activates). Steps 3-4 / 5-9 rely + // on MLImpute + biokotlin-tools + seqkit which the orchestrator's + // auto-run of setup-environment installs on first run. IntegrationGuard.requirePhg() IntegrationGuard.requireAnchorwave() IntegrationGuard.logContainerMemoryBudget() - val workDir = persistentWorkDir("orchestrate-steps-1-9") + val workDir = persistentWorkDir("orchestrate-steps-1-15") println(">>> Persisting full-pipeline E2E outputs at: $workDir") // pick_crossovers requires an EVEN number of assemblies (they're // paired for crossover simulation). smallseq ships 3 query FASTAs - // (LineA/LineB/LineC) and each input flows through to exactly one - // downsampled GVCF + one FASTA, so we feed only 2 queries through - // the pipeline to keep the assembly count even end-to-end. + // (LineA/LineB/LineC); we feed only LineA + LineB so the crossover + // pairing succeeds end-to-end. 
val queryListFile = workDir.resolve("queries.txt") queryListFile.writeText( listOf( @@ -87,6 +146,17 @@ class OrchestrateE2ETest { ).joinToString("\n") { it.toString() } + "\n" ) + // Synthesize a tiny FASTQ for step 13 from LineA. Reads are true + // substrings of LineA so they have a high chance of matching the + // step-12 PHG index (which is built over the recombined founder + // FASTAs, themselves stitched from LineA/LineB segments). + val fastqDir = workDir.resolve("fastq_input").also { it.createDirectories() } + synthesizeFastqFromFasta( + sourceFasta = smallseqRoot.resolve("queries/LineA.fa"), + target = fastqDir.resolve("synthA.fq"), + sampleName = "synthA" + ) + val configPath = workDir.resolve("pipeline.yaml") configPath.writeText( """ @@ -102,6 +172,12 @@ class OrchestrateE2ETest { - convert_coordinates - generate_recombined_sequences - format_recombined_fastas + - align_mutated_assemblies + - mutated_maf_to_gvcf + - rope_bwt_chr_index + - ropebwt_mem + - build_spline_knots + - convert_ropebwt2ps4g align_assemblies: ref_gff: "${smallseqRoot.resolve("anchors.gff")}" @@ -134,6 +210,36 @@ class OrchestrateE2ETest { format_recombined_fastas: line_width: 60 threads: 2 + + align_mutated_assemblies: + threads: 2 + + # Intentionally omit sample_name: pinning a single sample name across + # multiple mutated MAFs collapses every gVCF (and thus every spline + # knot gamete) into one name, which then cannot match the per-FASTA + # sample names that step 12 (rope-bwt-chr-index) bakes into the BWT + # index. Leaving sample_name unset makes MafToGvcf derive each gVCF's + # sample name from the MAF basename (0, 1, ...), which lines up with + # the auto-generated step-12 keyfile and lets step 15 produce a + # non-empty PS4G. 
+ mutated_maf_to_gvcf: {} + + rope_bwt_chr_index: + threads: 2 + delete_fmr_index: true + + ropebwt_mem: + fastq_input: "${fastqDir.toString()}" + threads: 2 + + build_spline_knots: + vcf_type: "gvcf" + num_bps_per_knot: 1000 + random_seed: 42 + + convert_ropebwt2ps4g: + min_mem_length: 50 + max_num_hits: 32 """.trimIndent() ) @@ -361,6 +467,139 @@ class OrchestrateE2ETest { "saw trailing widths=${trailingLineLengths.distinct().sorted()}" ) + // --------------------------------------------------------------- + // Step 10: align_mutated_assemblies -> 10_mutated_alignment_results/ + // --------------------------------------------------------------- + val step10Dir = workDir.resolve("output/10_mutated_alignment_results").toFile() + assertTrue(step10Dir.exists() && step10Dir.isDirectory, "Step 10 output directory must exist") + val mutatedMafPaths = File(step10Dir, "maf_file_paths.txt") + assertTrue( + mutatedMafPaths.exists() && mutatedMafPaths.length() > 0, + "Step 10's maf_file_paths.txt must be non-empty" + ) + val mutatedMafFiles = mutatedMafPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(mutatedMafFiles.isNotEmpty(), "Step 10 should produce at least one mutated MAF") + assertTrue( + mutatedMafFiles.all { it.exists() && it.length() > 0 }, + "Every mutated MAF listed must exist and be non-empty" + ) + + // --------------------------------------------------------------- + // Step 11: mutated_maf_to_gvcf -> 11_mutated_gvcf_results/ + // --------------------------------------------------------------- + val step11Dir = workDir.resolve("output/11_mutated_gvcf_results").toFile() + assertTrue(step11Dir.exists() && step11Dir.isDirectory, "Step 11 output directory must exist") + val mutatedGvcfPathsFile = File(step11Dir, "gvcf_file_paths.txt") + assertTrue(mutatedGvcfPathsFile.exists(), "Step 11 gvcf_file_paths.txt must exist") + val mutatedGvcfFiles = mutatedGvcfPathsFile.readLines().filter { it.isNotBlank() }.map { File(it) } + 
assertTrue(mutatedGvcfFiles.isNotEmpty(), "Step 11 should produce at least one mutated GVCF") + assertTrue( + mutatedGvcfFiles.all { it.exists() && it.length() > 0 }, + "Every mutated GVCF listed must exist on disk and be non-empty" + ) + assertTrue( + mutatedGvcfFiles.all { it.name.endsWith(".g.vcf.gz") }, + "Every mutated GVCF should be biokotlin-compressed (.g.vcf.gz)" + ) + + // --------------------------------------------------------------- + // Step 12: rope_bwt_chr_index -> 12_rope_bwt_index_results/ + // --------------------------------------------------------------- + val step12Dir = workDir.resolve("output/12_rope_bwt_index_results").toFile() + assertTrue(step12Dir.exists() && step12Dir.isDirectory, "Step 12 output directory must exist") + val keyfile = File(step12Dir, "phg_keyfile.txt") + assertTrue( + keyfile.exists() && keyfile.length() > 0, + "Step 12 must auto-generate a keyfile next to the .fmd index" + ) + val keyfileLines = keyfile.readLines().filter { it.isNotBlank() } + assertTrue(keyfileLines.isNotEmpty(), "Keyfile must contain at least one row") + assertTrue( + keyfileLines.all { it.split("\t").size == 2 }, + "Auto-generated keyfile rows are \\t (no header)" + ) + val fmdFiles = step12Dir.listFiles { f -> f.name.endsWith(".fmd") }?.toList().orEmpty() + assertTrue( + fmdFiles.isNotEmpty() && fmdFiles.all { it.length() > 0 }, + "Step 12 must produce at least one non-empty .fmd index file" + ) + + // --------------------------------------------------------------- + // Step 13: ropebwt_mem -> 13_ropebwt_mem_results/ + // --------------------------------------------------------------- + val step13Dir = workDir.resolve("output/13_ropebwt_mem_results").toFile() + assertTrue(step13Dir.exists() && step13Dir.isDirectory, "Step 13 output directory must exist") + val bedPaths = File(step13Dir, "bed_file_paths.txt") + assertTrue(bedPaths.exists(), "bed_file_paths.txt must exist after ropebwt-mem") + val bedFiles = bedPaths.readLines().filter { 
it.isNotBlank() }.map { File(it) } + assertTrue(bedFiles.isNotEmpty(), "Step 13 should produce at least one BED file") + assertTrue( + bedFiles.all { it.exists() }, + "Every BED listed must exist on disk" + ) + assertTrue( + bedFiles.all { it.name.endsWith("_ropebwt.bed") }, + "Step 13 names BED outputs as _ropebwt.bed" + ) + + // --------------------------------------------------------------- + // Step 14: build_spline_knots -> 14_spline_knots_results/ + // --------------------------------------------------------------- + val step14Dir = workDir.resolve("output/14_spline_knots_results").toFile() + assertTrue(step14Dir.exists() && step14Dir.isDirectory, "Step 14 output directory must exist") + val splineFiles = step14Dir.listFiles { f -> f.isFile && f.length() > 0 }?.toList().orEmpty() + assertTrue( + splineFiles.isNotEmpty(), + "Step 14 should drop at least one non-empty spline-knot file (got: ${step14Dir.listFiles()?.map { it.name } ?: emptyList()})" + ) + + // --------------------------------------------------------------- + // Step 15: convert_ropebwt2ps4g -> 15_convert_ropebwt2ps4g_results/ + // --------------------------------------------------------------- + val step15Dir = workDir.resolve("output/15_convert_ropebwt2ps4g_results").toFile() + assertTrue(step15Dir.exists() && step15Dir.isDirectory, "Step 15 output directory must exist") + val ps4gPaths = File(step15Dir, "ps4g_file_paths.txt") + assertTrue( + ps4gPaths.exists(), + "Step 15 must write ps4g_file_paths.txt (synthesized FASTQ is built from " + + "LineA so the BED produced in step 13 always has matches against the index)" + ) + val ps4gFiles = ps4gPaths.readLines().filter { it.isNotBlank() }.map { File(it) } + assertTrue(ps4gFiles.isNotEmpty(), "ps4g_file_paths.txt must list at least one PS4G") + assertTrue( + ps4gFiles.all { it.exists() }, + "Every PS4G file listed in ps4g_file_paths.txt must exist on disk" + ) + assertTrue( + ps4gFiles.all { it.name.endsWith(".ps4g") }, + "Step 15 names PS4G 
outputs with the .ps4g extension" + ) + + // The presence of a PS4G file alone isn't enough: PHG happily writes an + // empty PS4G when every BED contig fails to resolve against the spline + // knots (e.g. when step-11 sample names don't match the step-12 keyfile + // sample names). Assert that at least one PS4G has a positive + // #TotalUniqueCounts header AND at least one data row beyond the + // gameteSet/refContig/refPosBinned/count header line. + val ps4gWithData = ps4gFiles.filter { ps4g -> + val lines = ps4g.readLines() + val totalUnique = lines + .firstOrNull { it.startsWith("#TotalUniqueCounts:") } + ?.substringAfter(":") + ?.trim() + ?.toLongOrNull() ?: 0L + val dataRows = lines.count { it.isNotBlank() && !it.startsWith("#") } - 1 + totalUnique > 0 && dataRows > 0 + } + assertTrue( + ps4gWithData.isNotEmpty(), + "At least one PS4G must contain alignment data; every PS4G is empty " + + "(saw files=${ps4gFiles.map { it.name }}). This usually means step-11 " + + "gVCF sample names don't match the step-12 keyfile sample names, so " + + "PHG's convert-ropebwt2ps4g-file couldn't resolve any BED contig " + + "against the step-14 spline knots." + ) + // --------------------------------------------------------------- // Log file contract: each pipeline step writes its own log file. // --------------------------------------------------------------- @@ -377,7 +616,15 @@ class OrchestrateE2ETest { "06_create_chain_files.log", "07_convert_coordinates.log", "08_generate_recombined_sequences.log", - "09_format_recombined_fastas.log" + "09_format_recombined_fastas.log", + "10_align_mutated_assemblies.log", + // Step 11 reuses MafToGvcf, which writes its own LOG_FILE_NAME + // ("02_maf_to_gvcf.log"); that file is already covered above and + // gets appended to when the orchestrator drives step 11 too. 
+ "12_rope_bwt_chr_index.log", + "13_ropebwt_mem.log", + "14_build_spline_knots.log", + "15_convert_ropebwt2ps4g.log" ).forEach { expected -> assertTrue( expected in logNames, From 73c11ed1fcaa4554e812b7a258544b8d436003cd Mon Sep 17 00:00:00 2001 From: Brandon Monier Date: Fri, 22 May 2026 11:43:01 -0500 Subject: [PATCH 18/18] Add "free disk space" step --- .github/workflows/pr-check.yml | 16 ++++++++++++++++ build.gradle.kts | 2 +- docker/Dockerfile.dev | 10 +++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml index 192c0e9..716f8f1 100644 --- a/.github/workflows/pr-check.yml +++ b/.github/workflows/pr-check.yml @@ -115,6 +115,22 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Free runner disk space + # ubuntu-latest ships with only ~14 GB free, which is not enough to + # build the seq-sim-dev image (pixi env + micromamba env + PHGv2 + + # JDK 21 + buildx layers all materialize on disk simultaneously). + # This reclaims ~25-30 GB by removing pre-installed Android SDK, + # .NET, GHC, and large apt packages we don't use. + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false # keep JDKs/node we use elsewhere + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: false # keep buildx daemon images + swap-storage: true + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/build.gradle.kts b/build.gradle.kts index 9fe16ef..d078de2 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "net.maizegenetics" -version = "0.2.10" +version = "0.3.0" repositories { mavenCentral() diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 3786c3c..5567db5 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -120,10 +120,18 @@ COPY src/main/resources/pixi.toml /opt/seq-sim-prebuilt/pixi.toml # runtime `pixi install` is just hardlinks/copies from a warm cache. 
ENV PIXI_CACHE_DIR=/var/cache/pixi ENV RATTLER_CACHE_DIR=/var/cache/pixi +# The realized env at /opt/seq-sim-prebuilt/.pixi/envs/default is only used +# to warm /var/cache/pixi -- it is never executed at runtime because the +# container sets SEQ_SIM_SKIP_PIXI_PREFIX=1 (see ProcessRunner.kt), which +# routes all pipeline tool calls through the micromamba phgv2-conda env on +# PATH instead. Delete it after the install so we don't ship ~2-3 GB of +# duplicated python/anchorwave/minimap2/ropebwt3 binaries in the image +# layer. The /var/cache/pixi cache is what seeds the runtime workdir. RUN mkdir -p /var/cache/pixi && \ cd /opt/seq-sim-prebuilt && \ pixi install --manifest-path pixi.toml && \ - test -d /opt/seq-sim-prebuilt/.pixi/envs/default + test -d /opt/seq-sim-prebuilt/.pixi/envs/default && \ + rm -rf /opt/seq-sim-prebuilt/.pixi # --------------------------------------------------------------------------- # Pre-download PHGv2 latest release so `setup-environment` can skip the