"""Command-line interface for ancify.

Usage::

    ancify init [-o config.yaml]                # generate a template config
    ancify project -c config.yaml               # Phase 1: project alignments
    ancify call -c config.yaml                  # Phase 2: call ancestral states
    ancify evaluate -c config.yaml              # Phase 3: evaluate calls
    ancify run -c config.yaml                   # run all phases
    ancify train -c config.yaml [-o model.lgb]  # train ML model

The package can also be invoked as ``python -m ancify <cmd> ...``.
"""
import argparse
import logging
import sys
import textwrap
from .config import load_config
# Template configuration written verbatim by ``ancify init``.
# The text is YAML, so the indentation inside the literal is significant:
# ``inner``/``outer`` are nested under ``outgroups`` and each list entry
# carries a ``name`` and an ``alignment`` key.
EXAMPLE_CONFIG = textwrap.dedent("""\
# ── Ancestral Allele Polarization Pipeline ──────────────────────
# Adapt this template for your focal species and outgroups.
# Informal label for the focal species (used only in log messages).
focal_species: my_species
# Tab-separated file with at least two columns:
# chromosome_name <TAB> length
# Additional columns are ignored.
chromosome_lengths: chromLens.txt
# Optional: restrict to specific chromosomes.
# If omitted, every chromosome in the lengths file is processed.
# chromosomes:
# - chr1
# - chr2
# - chrX
outgroups:
  # Inner outgroup: one or more closely related species.
  # A majority vote determines the inner consensus.
  inner:
    - name: close_species_1
      alignment: focal.closeSpecies1.net.axt.gz
    - name: close_species_2
      alignment: focal.closeSpecies2.net.axt.gz
  # Outer outgroup: one or more distantly related species.
  # Serves as an independent confirmation of the inner call.
  outer:
    - name: distant_species
      alignment: focal.distantSpecies.net.axt.gz
# Working directory for intermediate projected sequences.
work_dir: .
# Directory for final ancestral FASTA files.
output_dir: ./ancestral_calls
# Minimum allele count to accept a majority-vote consensus.
min_inner_freq: 1
min_outer_freq: 1
# Number of parallel worker processes.
num_cpus: 4
# ── Ancestral inference method ────────────────────────────────────
# "voting" (default): two-tier inner/outer outgroup voting.
# "parsimony": Fitch parsimony on a phylogenetic tree.
# "likelihood": Felsenstein pruning with a substitution model.
#
# method: parsimony
#
# Newick tree topology (required when method is "parsimony" or "likelihood").
# Can be an inline Newick string or a path to a .nwk file.
# Leaf names must match outgroup 'name' fields.
# tree: "(((close_species_1:0.008,close_species_2:0.008):0.002,distant_species:0.009):0.020,outgroup2:0.038)"
# ── Likelihood options (only when method: likelihood) ─────────────
# method: likelihood
# tree: "(((close_species_1:0.008,close_species_2:0.008):0.002,distant_species:0.009):0.020,outgroup2:0.038)"
# substitution_model: HKY85 # JC69 (default), K80, HKY85, or GTR
# model_kappa: 2.0 # transition/transversion ratio (K80, HKY85)
# model_base_freqs: [0.3, 0.2, 0.2, 0.3] # equilibrium freqs (HKY85, GTR)
# model_rates: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # exchangeability rates (GTR only)
# likelihood_high_threshold: 0.8 # posterior >= this → uppercase (high conf)
# likelihood_low_threshold: 0.5 # posterior >= this → lowercase (low conf)
# Optional evaluation block. Both sub-sections are independent;
# include either, both, or neither.
#
# Patterns support two placeholders:
# {chrom} -- full chromosome name (e.g. "chr1")
# {chrom_id} -- name without leading "chr" (e.g. "1")
#
# evaluation:
#   reference_dir: ./reference_ancestral/
#   reference_pattern: "ancestor_{chrom_id}.fa"
#   vcf_dir: ./vcf/
#   vcf_pattern: "variants.{chrom}.vcf.gz"
""")
def _setup_logging(verbose=False):
level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
def cmd_init(args):
    """Write the template configuration file (``ancify init``).

    Args:
        args: Parsed CLI namespace. ``args.output`` is the destination path;
            when it is unset or empty, ``config.yaml`` is used.
    """
    out = args.output or "config.yaml"
    # The template contains non-ASCII characters (box-drawing rules, arrows),
    # so pin the encoding instead of relying on the platform default.
    with open(out, "w", encoding="utf-8") as f:
        f.write(EXAMPLE_CONFIG)
    print(f"Template configuration written to {out}")
def cmd_project(args):
    """Run Phase 1, projecting outgroup alignments (``ancify project``).

    Args:
        args: Parsed CLI namespace. ``args.config`` is the path handed to
            ``load_config``; ``args.num_cpus`` optionally overrides the
            ``num_cpus`` value of the loaded configuration.
    """
    cfg = load_config(args.config)
    # An absent (None) or zero -n leaves the configured value untouched.
    if args.num_cpus:
        cfg.num_cpus = args.num_cpus
    # Imported inside the command rather than at module import time.
    from .project import run_projection

    run_projection(cfg)
def cmd_call(args):
    """Run Phase 2, calling ancestral states (``ancify call``).

    Args:
        args: Parsed CLI namespace. ``args.config`` is the path handed to
            ``load_config``; ``args.num_cpus`` optionally overrides the
            ``num_cpus`` value of the loaded configuration.
    """
    cfg = load_config(args.config)
    # An absent (None) or zero -n leaves the configured value untouched.
    if args.num_cpus:
        cfg.num_cpus = args.num_cpus
    # Imported inside the command rather than at module import time.
    from .ancestral import run_ancestral_calling

    run_ancestral_calling(cfg)
def cmd_evaluate(args):
    """Run Phase 3, evaluating ancestral calls (``ancify evaluate``).

    Args:
        args: Parsed CLI namespace. ``args.config`` is the path handed to
            ``load_config``; ``args.num_cpus`` optionally overrides the
            ``num_cpus`` value of the loaded configuration.
    """
    cfg = load_config(args.config)
    # An absent (None) or zero -n leaves the configured value untouched.
    if args.num_cpus:
        cfg.num_cpus = args.num_cpus
    # Imported inside the command rather than at module import time.
    from .evaluate import run_evaluation

    run_evaluation(cfg)
def cmd_train(args):
    """Train the ML model described by the configuration (``ancify train``).

    Args:
        args: Parsed CLI namespace. ``args.config`` is the path handed to
            ``load_config``; ``args.num_cpus`` optionally overrides
            ``cfg.num_cpus``; ``args.output`` optionally overrides
            ``cfg.ml_model_path``.
    """
    cfg = load_config(args.config)
    # Command-line values win over the configuration file, but only when
    # they were actually supplied (None, 0 and "" are all ignored).
    if args.num_cpus:
        cfg.num_cpus = args.num_cpus
    if args.output:
        cfg.ml_model_path = args.output
    # Imported inside the command rather than at module import time.
    from .ml import train_from_config

    out_path = train_from_config(cfg)
    print(f"Trained ML model saved to {out_path}")
def cmd_run(args):
    """Run the pipeline end-to-end (``ancify run``).

    Projection and ancestral calling always run, in that order; evaluation
    runs afterwards only when the loaded configuration has a truthy
    ``evaluation`` attribute.

    Args:
        args: Parsed CLI namespace. ``args.config`` is the path handed to
            ``load_config``; ``args.num_cpus`` optionally overrides the
            ``num_cpus`` value of the loaded configuration.
    """
    cfg = load_config(args.config)
    # An absent (None) or zero -n leaves the configured value untouched.
    if args.num_cpus:
        cfg.num_cpus = args.num_cpus
    # Imported inside the command rather than at module import time.
    from .project import run_projection
    from .ancestral import run_ancestral_calling

    run_projection(cfg)
    run_ancestral_calling(cfg)
    if cfg.evaluation:
        from .evaluate import run_evaluation

        run_evaluation(cfg)
def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Optional list of argument strings, without the program name.
            When ``None`` (the default) argparse falls back to
            ``sys.argv[1:]``, so calling ``main()`` behaves as before.

    Raises:
        SystemExit: With status 1, after printing the help text, when no
            sub-command was given. argparse itself also exits on invalid
            arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        prog="ancify",
        description="Ancestral allele polarization using outgroup species.",
    )
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="enable debug logging")
    sub = parser.add_subparsers(dest="command")

    p = sub.add_parser("init", help="generate a template config file")
    p.add_argument("-o", "--output", help="output path (default: config.yaml)")
    p.set_defaults(func=cmd_init)

    # The pipeline phases share exactly the same options, so register them
    # from a table instead of repeating the argparse calls.
    for name, helptext, func in [
        ("project", "Phase 1: project outgroup alignments", cmd_project),
        ("call", "Phase 2: call ancestral states", cmd_call),
        ("evaluate", "Phase 3: evaluate ancestral calls", cmd_evaluate),
        ("run", "run all phases end-to-end", cmd_run),
    ]:
        p = sub.add_parser(name, help=helptext)
        p.add_argument("-c", "--config", required=True,
                       help="path to YAML configuration file")
        p.add_argument("-n", "--num-cpus", type=int,
                       help="override num_cpus from config")
        p.set_defaults(func=func)

    # "train" takes the common options plus an output path for the model.
    p = sub.add_parser("train", help="train an ML model for ancestral calling")
    p.add_argument("-c", "--config", required=True,
                   help="path to YAML configuration file")
    p.add_argument("-o", "--output",
                   help="output path for trained model (default: from config or work_dir)")
    p.add_argument("-n", "--num-cpus", type=int,
                   help="override num_cpus from config")
    p.set_defaults(func=cmd_train)

    args = parser.parse_args(argv)
    # -v is defined on the top-level parser, so the attribute always exists.
    _setup_logging(args.verbose)

    # Sub-commands are optional for argparse; treat a bare "ancify" as a
    # usage error rather than failing on the missing ``func`` attribute.
    if not args.command:
        parser.print_help()
        sys.exit(1)

    args.func(args)