Python for Genomics Sequencing — Deep Dive

Build production genomics pipelines in Python with pysam, Snakemake workflow management, variant filtering, and population-scale analysis using scikit-allel.

Production genomics pipeline architecture

A clinical or research genomics pipeline must be reproducible, scalable, and auditable. Modern pipelines use workflow managers to orchestrate steps across compute clusters.

Sample Sheet → QC → Alignment → Duplicate Marking → Base Recalibration →
  Variant Calling → Joint Genotyping → Filtering → Annotation → Reporting

Workflow management with Snakemake

Snakemake (Python-based) defines pipelines as rules with inputs, outputs, and shell commands:

# Snakefile

SAMPLES = ["sample_A", "sample_B", "sample_C"]

rule all:
    input:
        expand("results/{sample}.annotated.vcf.gz", sample=SAMPLES)

rule trim_reads:
    input:
        r1="raw/{sample}_R1.fastq.gz",
        r2="raw/{sample}_R2.fastq.gz"
    output:
        r1="trimmed/{sample}_R1.fastq.gz",
        r2="trimmed/{sample}_R2.fastq.gz",
        json="qc/{sample}_fastp.json"
    threads: 4
    shell:
        """
        fastp -i {input.r1} -I {input.r2} \
              -o {output.r1} -O {output.r2} \
              -j {output.json} -w {threads}
        """

rule align:
    input:
        r1="trimmed/{sample}_R1.fastq.gz",
        r2="trimmed/{sample}_R2.fastq.gz",
        ref="reference/genome.fa"
    output:
        bam="aligned/{sample}.sorted.bam",
        bai="aligned/{sample}.sorted.bam.bai"
    threads: 16
    shell:
        """
        bwa-mem2 mem -t {threads} \
            -R '@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\\tPL:ILLUMINA' \
            {input.ref} {input.r1} {input.r2} | \
        samtools sort -@ 4 -o {output.bam}
        samtools index {output.bam}
        """

rule mark_duplicates:
    input:
        "aligned/{sample}.sorted.bam"
    output:
        bam="dedup/{sample}.dedup.bam",
        metrics="qc/{sample}.dup_metrics.txt"
    shell:
        """
        gatk MarkDuplicates -I {input} -O {output.bam} \
            -M {output.metrics} --CREATE_INDEX true
        """

rule call_variants:
    input:
        bam="dedup/{sample}.dedup.bam",
        ref="reference/genome.fa"
    output:
        "variants/{sample}.g.vcf.gz"
    shell:
        """
        gatk HaplotypeCaller -R {input.ref} -I {input.bam} \
            -O {output} -ERC GVCF
        """

Run with: snakemake --cores 64 --use-conda for automatic environment management.

Advanced BAM processing with pysam

Coverage analysis

import pysam
import numpy as np

def coverage_stats(bam_path: str, bed_path: str) -> dict:
    """Calculate coverage statistics over target regions."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    
    coverages = []
    regions = []
    
    with open(bed_path) as f:
        for line in f:
            chrom, start, end, *rest = line.strip().split("\t")
            start, end = int(start), int(end)
            name = rest[0] if rest else f"{chrom}:{start}-{end}"
            
            # pileup gives per-base coverage
            region_cov = np.zeros(end - start, dtype=np.int32)
            for pileup_col in bam.pileup(chrom, start, end, 
                                          truncate=True, min_mapping_quality=20):
                pos = pileup_col.reference_pos - start
                if 0 <= pos < len(region_cov):
                    region_cov[pos] = pileup_col.nsegments
            
            coverages.append(region_cov)
            regions.append({
                "name": name,
                "mean_cov": region_cov.mean(),
                "median_cov": np.median(region_cov),
                "pct_20x": (region_cov >= 20).mean() * 100,
                "pct_30x": (region_cov >= 30).mean() * 100,
            })
    
    bam.close()
    return {
        "regions": regions,
        "overall_mean": np.mean([r["mean_cov"] for r in regions]),
        "pct_target_20x": np.mean([r["pct_20x"] for r in regions]),
    }

Structural variant evidence

def find_discordant_pairs(bam_path: str, chrom: str, start: int, end: int,
                           min_insert: int = 1000) -> list[dict]:
    """Find read pairs with abnormal insert sizes (SV evidence)."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    discordant = []
    
    for read in bam.fetch(chrom, start, end):
        if (read.is_proper_pair or read.is_unmapped or 
            read.mate_is_unmapped or read.is_secondary):
            continue
        
        insert = abs(read.template_length)
        if insert > min_insert:
            discordant.append({
                "read_name": read.query_name,
                "chrom": chrom,
                "pos": read.reference_start,
                "mate_chrom": read.next_reference_name,
                "mate_pos": read.next_reference_start,
                "insert_size": insert,
                "orientation": ("F" if not read.is_reverse else "R") +
                              ("F" if not read.mate_is_reverse else "R"),
            })
    
    bam.close()
    return discordant

Variant filtering and quality control

GATK VQSR alternative: hard filtering in Python

from cyvcf2 import VCF, Writer

def hard_filter_variants(input_vcf: str, output_vcf: str):
    """Apply GATK-recommended hard filters when VQSR is not possible."""
    vcf_in = VCF(input_vcf)
    vcf_in.add_filter_to_header({
        "ID": "LowQual", "Description": "QD<2 or FS>60 or MQ<40 or SOR>3"
    })
    
    vcf_out = Writer(output_vcf, vcf_in)
    
    stats = {"total": 0, "pass": 0, "filtered": 0}
    
    for variant in vcf_in:
        stats["total"] += 1
        
        qd = variant.INFO.get("QD", 999)
        fs = variant.INFO.get("FS", 0)
        mq = variant.INFO.get("MQ", 999)
        sor = variant.INFO.get("SOR", 0)
        
        filters = []
        
        if variant.is_snp:
            if qd < 2: filters.append("LowQD")
            if fs > 60: filters.append("HighFS")
            if mq < 40: filters.append("LowMQ")
            if sor > 3: filters.append("HighSOR")
        else:  # Indels
            if qd < 2: filters.append("LowQD")
            if fs > 200: filters.append("HighFS")
            if sor > 10: filters.append("HighSOR")
        
        if filters:
            variant.FILTER = filters
            stats["filtered"] += 1
        else:
            stats["pass"] += 1
        
        vcf_out.write_record(variant)
    
    vcf_out.close()
    vcf_in.close()
    return stats

Population frequency filtering

import pandas as pd

def classify_variants(annotated_df: pd.DataFrame) -> pd.DataFrame:
    """Classify variants by clinical relevance."""
    df = annotated_df.copy()
    
    # Frequency filters
    df["is_rare"] = df["gnomAD_AF"].fillna(0) < 0.01
    df["is_ultra_rare"] = df["gnomAD_AF"].fillna(0) < 0.0001
    
    # Consequence severity
    severe = {"frameshift_variant", "stop_gained", "splice_donor_variant",
              "splice_acceptor_variant", "start_lost"}
    moderate = {"missense_variant", "inframe_deletion", "inframe_insertion"}
    
    df["severity"] = df["consequence"].apply(
        lambda c: "HIGH" if c in severe else "MODERATE" if c in moderate else "LOW"
    )
    
    # Prioritization score
    df["priority"] = (
        df["is_rare"].astype(int) * 2 +
        df["severity"].map({"HIGH": 3, "MODERATE": 2, "LOW": 0}) +
        df["CADD_phred"].fillna(0).gt(20).astype(int) * 2 +
        df["ClinVar_CLNSIG"].str.contains("Pathogenic", na=False).astype(int) * 5
    )
    
    return df.sort_values("priority", ascending=False)

Population genetics with scikit-allel

For population-scale analysis across hundreds or thousands of genomes:

import allel
import numpy as np

# Load VCF into scikit-allel
callset = allel.read_vcf("population.vcf.gz", 
                          fields=["variants/CHROM", "variants/POS",
                                  "calldata/GT", "samples"])

gt = allel.GenotypeArray(callset["calldata/GT"])
samples = callset["samples"]

# Allele frequencies
ac = gt.count_alleles()
af = ac.to_frequencies()

# Principal Component Analysis for population structure
# Filter to common, biallelic SNPs
is_biallelic = ac.max_allele() == 1
is_common = af[:, 1] > 0.05
mask = is_biallelic & is_common

gt_filtered = gt[mask]
gn = gt_filtered.to_n_alt()  # Convert to alt allele counts (0, 1, 2)

# PCA
coords, model = allel.pca(gn, n_components=10, scaler="patterson")

import matplotlib.pyplot as plt
plt.scatter(coords[:, 0], coords[:, 1], c=population_labels, cmap="tab10", s=10)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Population Structure")
plt.savefig("pca_population.png", dpi=150)

Genome-wide association study (GWAS) setup

def gwas_scan(genotypes: np.ndarray, phenotype: np.ndarray, 
              covariates: np.ndarray) -> np.ndarray:
    """Simple linear regression GWAS scan."""
    from scipy import stats
    
    n_variants = genotypes.shape[0]
    p_values = np.zeros(n_variants)
    
    for i in range(n_variants):
        geno = genotypes[i]
        valid = ~np.isnan(geno) & ~np.isnan(phenotype)
        
        if valid.sum() < 30:
            p_values[i] = 1.0
            continue
        
        # Residualize phenotype on covariates
        X = np.column_stack([covariates[valid], geno[valid]])
        slope, intercept, r, p, se = stats.linregress(geno[valid], phenotype[valid])
        p_values[i] = p
    
    return p_values

# Manhattan plot
def manhattan_plot(chroms, positions, p_values, threshold=5e-8):
    fig, ax = plt.subplots(figsize=(16, 6))
    neg_log_p = -np.log10(p_values)
    
    colors = ["#1f77b4", "#aec7e8"]
    x_offset = 0
    x_ticks = []
    
    for i, chrom in enumerate(sorted(set(chroms))):
        mask = chroms == chrom
        x = positions[mask] + x_offset
        ax.scatter(x, neg_log_p[mask], c=colors[i % 2], s=2, alpha=0.5)
        x_ticks.append((x.min() + x.max()) / 2)
        x_offset = x.max() + 1e6
    
    ax.axhline(-np.log10(threshold), color="red", linestyle="--", alpha=0.5)
    ax.set_ylabel("-log10(p-value)")
    ax.set_xlabel("Chromosome")
    plt.savefig("manhattan.png", dpi=200, bbox_inches="tight")

Long-read sequencing analysis

Long reads from Nanopore and PacBio enable structural variant detection and haplotype phasing:

def detect_structural_variants_from_long_reads(bam_path: str, 
                                                 min_sv_size: int = 50) -> list[dict]:
    """Detect SVs from CIGAR strings of long-read alignments."""
    bam = pysam.AlignmentFile(bam_path, "rb")
    svs = []
    
    for read in bam:
        if read.is_secondary or read.is_supplementary or read.mapping_quality < 20:
            continue
        
        ref_pos = read.reference_start
        
        for op, length in read.cigartuples:
            if op == 2 and length >= min_sv_size:  # Deletion
                svs.append({
                    "type": "DEL",
                    "chrom": read.reference_name,
                    "start": ref_pos,
                    "end": ref_pos + length,
                    "size": length,
                    "read": read.query_name,
                })
            elif op == 1 and length >= min_sv_size:  # Insertion
                svs.append({
                    "type": "INS",
                    "chrom": read.reference_name,
                    "pos": ref_pos,
                    "size": length,
                    "read": read.query_name,
                })
            
            if op in (0, 2, 3, 7, 8):  # Operations that consume reference
                ref_pos += length
    
    bam.close()
    return svs

Tradeoffs

Pipeline component	Production choice	Alternative	When to switch
Workflow manager	Snakemake	Nextflow (nf-core)	Large team, cloud-native
Aligner (short)	BWA-MEM2	DRAGEN (FPGA)	Clinical latency requirements
Aligner (long)	minimap2	Winnowmap	Repetitive regions
Variant caller	GATK HaplotypeCaller	DeepVariant	Higher accuracy needed
VCF parsing	cyvcf2	pysam.VariantFile	When already using pysam
Population genetics	scikit-allel	PLINK (via subprocess)	Standard GWAS workflows

The one thing to remember: A production Python genomics pipeline chains Snakemake for workflow management, pysam/cyvcf2 for data access, GATK/DeepVariant for variant calling, and scikit-allel for population analysis — and the key engineering challenge is managing the terabyte-scale data flows while maintaining reproducibility across thousands of samples.

pythongenomicsbioinformatics