Python for Genomics Sequencing — Deep Dive

Production genomics pipeline architecture

A clinical or research genomics pipeline must be reproducible, scalable, and auditable. Modern pipelines use workflow managers to orchestrate steps across compute clusters.

Sample Sheet → QC → Alignment → Duplicate Marking → Base Recalibration →
  Variant Calling → Joint Genotyping → Filtering → Annotation → Reporting

Workflow management with Snakemake

Snakemake (Python-based) defines pipelines as rules with inputs, outputs, and shell commands:

# Snakefile

SAMPLES = ["sample_A", "sample_B", "sample_C"]

rule all:
    input:
        expand("results/{sample}.annotated.vcf.gz", sample=SAMPLES)

rule trim_reads:
    input:
        r1="raw/{sample}_R1.fastq.gz",
        r2="raw/{sample}_R2.fastq.gz"
    output:
        r1="trimmed/{sample}_R1.fastq.gz",
        r2="trimmed/{sample}_R2.fastq.gz",
        json="qc/{sample}_fastp.json"
    threads: 4
    shell:
        """
        fastp -i {input.r1} -I {input.r2} \
              -o {output.r1} -O {output.r2} \
              -j {output.json} -w {threads}
        """

rule align:
    input:
        r1="trimmed/{sample}_R1.fastq.gz",
        r2="trimmed/{sample}_R2.fastq.gz",
        ref="reference/genome.fa"
    output:
        bam="aligned/{sample}.sorted.bam",
        bai="aligned/{sample}.sorted.bam.bai"
    threads: 16
    shell:
        """
        bwa-mem2 mem -t {threads} \
            -R '@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\\tPL:ILLUMINA' \
            {input.ref} {input.r1} {input.r2} | \
        samtools sort -@ 4 -o {output.bam}
        samtools index {output.bam}
        """

rule mark_duplicates:
    input:
        "aligned/{sample}.sorted.bam"
    output:
        bam="dedup/{sample}.dedup.bam",
        metrics="qc/{sample}.dup_metrics.txt"
    shell:
        """
        gatk MarkDuplicates -I {input} -O {output.bam} \
            -M {output.metrics} --CREATE_INDEX true
        """

rule call_variants:
    input:
        bam="dedup/{sample}.dedup.bam",
        ref="reference/genome.fa"
    output:
        "variants/{sample}.g.vcf.gz"
    shell:
        """
        gatk HaplotypeCaller -R {input.ref} -I {input.bam} \
            -O {output} -ERC GVCF
        """

Run with: snakemake --cores 64 --use-conda for automatic environment management.

Advanced BAM processing with pysam

Coverage analysis

import pysam
import numpy as np

def coverage_stats(bam_path: str, bed_path: str) -> dict:
    """Calculate coverage statistics over target regions."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    
    coverages = []
    regions = []
    
    with open(bed_path) as f:
        for line in f:
            chrom, start, end, *rest = line.strip().split("\t")
            start, end = int(start), int(end)
            name = rest[0] if rest else f"{chrom}:{start}-{end}"
            
            # pileup gives per-base coverage
            region_cov = np.zeros(end - start, dtype=np.int32)
            for pileup_col in bam.pileup(chrom, start, end, 
                                          truncate=True, min_mapping_quality=20):
                pos = pileup_col.reference_pos - start
                if 0 <= pos < len(region_cov):
                    region_cov[pos] = pileup_col.nsegments
            
            coverages.append(region_cov)
            regions.append({
                "name": name,
                "mean_cov": region_cov.mean(),
                "median_cov": np.median(region_cov),
                "pct_20x": (region_cov >= 20).mean() * 100,
                "pct_30x": (region_cov >= 30).mean() * 100,
            })
    
    bam.close()
    return {
        "regions": regions,
        "overall_mean": np.mean([r["mean_cov"] for r in regions]),
        "pct_target_20x": np.mean([r["pct_20x"] for r in regions]),
    }

Structural variant evidence

def find_discordant_pairs(bam_path: str, chrom: str, start: int, end: int,
                           min_insert: int = 1000) -> list[dict]:
    """Find read pairs with abnormal insert sizes (SV evidence)."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    discordant = []
    
    for read in bam.fetch(chrom, start, end):
        if (read.is_proper_pair or read.is_unmapped or 
            read.mate_is_unmapped or read.is_secondary):
            continue
        
        insert = abs(read.template_length)
        if insert > min_insert:
            discordant.append({
                "read_name": read.query_name,
                "chrom": chrom,
                "pos": read.reference_start,
                "mate_chrom": read.next_reference_name,
                "mate_pos": read.next_reference_start,
                "insert_size": insert,
                "orientation": ("F" if not read.is_reverse else "R") +
                              ("F" if not read.mate_is_reverse else "R"),
            })
    
    bam.close()
    return discordant

Variant filtering and quality control

GATK VQSR alternative: hard filtering in Python

from cyvcf2 import VCF, Writer

def hard_filter_variants(input_vcf: str, output_vcf: str):
    """Apply GATK-recommended hard filters when VQSR is not possible."""
    vcf_in = VCF(input_vcf)
    vcf_in.add_filter_to_header({
        "ID": "LowQual", "Description": "QD<2 or FS>60 or MQ<40 or SOR>3"
    })
    
    vcf_out = Writer(output_vcf, vcf_in)
    
    stats = {"total": 0, "pass": 0, "filtered": 0}
    
    for variant in vcf_in:
        stats["total"] += 1
        
        qd = variant.INFO.get("QD", 999)
        fs = variant.INFO.get("FS", 0)
        mq = variant.INFO.get("MQ", 999)
        sor = variant.INFO.get("SOR", 0)
        
        filters = []
        
        if variant.is_snp:
            if qd < 2: filters.append("LowQD")
            if fs > 60: filters.append("HighFS")
            if mq < 40: filters.append("LowMQ")
            if sor > 3: filters.append("HighSOR")
        else:  # Indels
            if qd < 2: filters.append("LowQD")
            if fs > 200: filters.append("HighFS")
            if sor > 10: filters.append("HighSOR")
        
        if filters:
            variant.FILTER = filters
            stats["filtered"] += 1
        else:
            stats["pass"] += 1
        
        vcf_out.write_record(variant)
    
    vcf_out.close()
    vcf_in.close()
    return stats

Population frequency filtering

import pandas as pd

def classify_variants(annotated_df: pd.DataFrame) -> pd.DataFrame:
    """Classify variants by clinical relevance."""
    df = annotated_df.copy()
    
    # Frequency filters
    df["is_rare"] = df["gnomAD_AF"].fillna(0) < 0.01
    df["is_ultra_rare"] = df["gnomAD_AF"].fillna(0) < 0.0001
    
    # Consequence severity
    severe = {"frameshift_variant", "stop_gained", "splice_donor_variant",
              "splice_acceptor_variant", "start_lost"}
    moderate = {"missense_variant", "inframe_deletion", "inframe_insertion"}
    
    df["severity"] = df["consequence"].apply(
        lambda c: "HIGH" if c in severe else "MODERATE" if c in moderate else "LOW"
    )
    
    # Prioritization score
    df["priority"] = (
        df["is_rare"].astype(int) * 2 +
        df["severity"].map({"HIGH": 3, "MODERATE": 2, "LOW": 0}) +
        df["CADD_phred"].fillna(0).gt(20).astype(int) * 2 +
        df["ClinVar_CLNSIG"].str.contains("Pathogenic", na=False).astype(int) * 5
    )
    
    return df.sort_values("priority", ascending=False)

Population genetics with scikit-allel

For population-scale analysis across hundreds or thousands of genomes:

import allel
import numpy as np

# Load VCF into scikit-allel
callset = allel.read_vcf("population.vcf.gz", 
                          fields=["variants/CHROM", "variants/POS",
                                  "calldata/GT", "samples"])

gt = allel.GenotypeArray(callset["calldata/GT"])
samples = callset["samples"]

# Allele frequencies
ac = gt.count_alleles()
af = ac.to_frequencies()

# Principal Component Analysis for population structure
# Filter to common, biallelic SNPs
is_biallelic = ac.max_allele() == 1
is_common = af[:, 1] > 0.05
mask = is_biallelic & is_common

gt_filtered = gt[mask]
gn = gt_filtered.to_n_alt()  # Convert to alt allele counts (0, 1, 2)

# PCA
coords, model = allel.pca(gn, n_components=10, scaler="patterson")

import matplotlib.pyplot as plt
plt.scatter(coords[:, 0], coords[:, 1], c=population_labels, cmap="tab10", s=10)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Population Structure")
plt.savefig("pca_population.png", dpi=150)

Genome-wide association study (GWAS) setup

def gwas_scan(genotypes: np.ndarray, phenotype: np.ndarray, 
              covariates: np.ndarray) -> np.ndarray:
    """Simple linear regression GWAS scan."""
    from scipy import stats
    
    n_variants = genotypes.shape[0]
    p_values = np.zeros(n_variants)
    
    for i in range(n_variants):
        geno = genotypes[i]
        valid = ~np.isnan(geno) & ~np.isnan(phenotype)
        
        if valid.sum() < 30:
            p_values[i] = 1.0
            continue
        
        # Residualize phenotype on covariates
        X = np.column_stack([covariates[valid], geno[valid]])
        slope, intercept, r, p, se = stats.linregress(geno[valid], phenotype[valid])
        p_values[i] = p
    
    return p_values

# Manhattan plot
def manhattan_plot(chroms, positions, p_values, threshold=5e-8):
    fig, ax = plt.subplots(figsize=(16, 6))
    neg_log_p = -np.log10(p_values)
    
    colors = ["#1f77b4", "#aec7e8"]
    x_offset = 0
    x_ticks = []
    
    for i, chrom in enumerate(sorted(set(chroms))):
        mask = chroms == chrom
        x = positions[mask] + x_offset
        ax.scatter(x, neg_log_p[mask], c=colors[i % 2], s=2, alpha=0.5)
        x_ticks.append((x.min() + x.max()) / 2)
        x_offset = x.max() + 1e6
    
    ax.axhline(-np.log10(threshold), color="red", linestyle="--", alpha=0.5)
    ax.set_ylabel("-log10(p-value)")
    ax.set_xlabel("Chromosome")
    plt.savefig("manhattan.png", dpi=200, bbox_inches="tight")

Long-read sequencing analysis

Long reads from Nanopore and PacBio enable structural variant detection and haplotype phasing:

def detect_structural_variants_from_long_reads(bam_path: str, 
                                                 min_sv_size: int = 50) -> list[dict]:
    """Detect SVs from CIGAR strings of long-read alignments."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    svs = []
    
    for read in bam:
        if read.is_secondary or read.is_supplementary or read.mapping_quality < 20:
            continue
        
        ref_pos = read.reference_start
        
        for op, length in read.cigartuples:
            if op == 2 and length >= min_sv_size:  # Deletion
                svs.append({
                    "type": "DEL",
                    "chrom": read.reference_name,
                    "start": ref_pos,
                    "end": ref_pos + length,
                    "size": length,
                    "read": read.query_name,
                })
            elif op == 1 and length >= min_sv_size:  # Insertion
                svs.append({
                    "type": "INS",
                    "chrom": read.reference_name,
                    "pos": ref_pos,
                    "size": length,
                    "read": read.query_name,
                })
            
            if op in (0, 2, 3, 7, 8):  # Operations that consume reference
                ref_pos += length
    
    bam.close()
    return svs

Tradeoffs

Pipeline componentProduction choiceAlternativeWhen to switch
Workflow managerSnakemakeNextflow (nf-core)Large team, cloud-native
Aligner (short)BWA-MEM2DRAGEN (FPGA)Clinical latency requirements
Aligner (long)minimap2WinnowmapRepetitive regions
Variant callerGATK HaplotypeCallerDeepVariantHigher accuracy needed
VCF parsingcyvcf2pysam.VariantFileWhen already using pysam
Population geneticsscikit-allelPLINK (via subprocess)Standard GWAS workflows

The one thing to remember: A production Python genomics pipeline chains Snakemake for workflow management, pysam/cyvcf2 for data access, GATK/DeepVariant for variant calling, and scikit-allel for population analysis — and the key engineering challenge is managing the terabyte-scale data flows while maintaining reproducibility across thousands of samples.

pythongenomicsbioinformatics

See Also