Python for Genomics Sequencing — Deep Dive
Production genomics pipeline architecture
A clinical or research genomics pipeline must be reproducible, scalable, and auditable. Modern pipelines use workflow managers to orchestrate steps across compute clusters.
Sample Sheet → QC → Alignment → Duplicate Marking → Base Recalibration →
Variant Calling → Joint Genotyping → Filtering → Annotation → Reporting
Workflow management with Snakemake
Snakemake (Python-based) defines pipelines as rules with inputs, outputs, and shell commands:
# Snakefile
SAMPLES = ["sample_A", "sample_B", "sample_C"]
rule all:
input:
expand("results/{sample}.annotated.vcf.gz", sample=SAMPLES)
rule trim_reads:
input:
r1="raw/{sample}_R1.fastq.gz",
r2="raw/{sample}_R2.fastq.gz"
output:
r1="trimmed/{sample}_R1.fastq.gz",
r2="trimmed/{sample}_R2.fastq.gz",
json="qc/{sample}_fastp.json"
threads: 4
shell:
"""
fastp -i {input.r1} -I {input.r2} \
-o {output.r1} -O {output.r2} \
-j {output.json} -w {threads}
"""
rule align:
input:
r1="trimmed/{sample}_R1.fastq.gz",
r2="trimmed/{sample}_R2.fastq.gz",
ref="reference/genome.fa"
output:
bam="aligned/{sample}.sorted.bam",
bai="aligned/{sample}.sorted.bam.bai"
threads: 16
shell:
"""
bwa-mem2 mem -t {threads} \
-R '@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\\tPL:ILLUMINA' \
{input.ref} {input.r1} {input.r2} | \
samtools sort -@ 4 -o {output.bam}
samtools index {output.bam}
"""
rule mark_duplicates:
input:
"aligned/{sample}.sorted.bam"
output:
bam="dedup/{sample}.dedup.bam",
metrics="qc/{sample}.dup_metrics.txt"
shell:
"""
gatk MarkDuplicates -I {input} -O {output.bam} \
-M {output.metrics} --CREATE_INDEX true
"""
rule call_variants:
input:
bam="dedup/{sample}.dedup.bam",
ref="reference/genome.fa"
output:
"variants/{sample}.g.vcf.gz"
shell:
"""
gatk HaplotypeCaller -R {input.ref} -I {input.bam} \
-O {output} -ERC GVCF
"""
Run with: snakemake --cores 64 --use-conda for automatic environment management.
Advanced BAM processing with pysam
Coverage analysis
import pysam
import numpy as np
def coverage_stats(bam_path: str, bed_path: str) -> dict:
"""Calculate coverage statistics over target regions."""
bam = pysam.AlignmentFile(bam_path, "rb")
coverages = []
regions = []
with open(bed_path) as f:
for line in f:
chrom, start, end, *rest = line.strip().split("\t")
start, end = int(start), int(end)
name = rest[0] if rest else f"{chrom}:{start}-{end}"
# pileup gives per-base coverage
region_cov = np.zeros(end - start, dtype=np.int32)
for pileup_col in bam.pileup(chrom, start, end,
truncate=True, min_mapping_quality=20):
pos = pileup_col.reference_pos - start
if 0 <= pos < len(region_cov):
region_cov[pos] = pileup_col.nsegments
coverages.append(region_cov)
regions.append({
"name": name,
"mean_cov": region_cov.mean(),
"median_cov": np.median(region_cov),
"pct_20x": (region_cov >= 20).mean() * 100,
"pct_30x": (region_cov >= 30).mean() * 100,
})
bam.close()
return {
"regions": regions,
"overall_mean": np.mean([r["mean_cov"] for r in regions]),
"pct_target_20x": np.mean([r["pct_20x"] for r in regions]),
}
Structural variant evidence
def find_discordant_pairs(bam_path: str, chrom: str, start: int, end: int,
min_insert: int = 1000) -> list[dict]:
"""Find read pairs with abnormal insert sizes (SV evidence)."""
bam = pysam.AlignmentFile(bam_path, "rb")
discordant = []
for read in bam.fetch(chrom, start, end):
if (read.is_proper_pair or read.is_unmapped or
read.mate_is_unmapped or read.is_secondary):
continue
insert = abs(read.template_length)
if insert > min_insert:
discordant.append({
"read_name": read.query_name,
"chrom": chrom,
"pos": read.reference_start,
"mate_chrom": read.next_reference_name,
"mate_pos": read.next_reference_start,
"insert_size": insert,
"orientation": ("F" if not read.is_reverse else "R") +
("F" if not read.mate_is_reverse else "R"),
})
bam.close()
return discordant
Variant filtering and quality control
GATK VQSR alternative: hard filtering in Python
from cyvcf2 import VCF, Writer
def hard_filter_variants(input_vcf: str, output_vcf: str):
"""Apply GATK-recommended hard filters when VQSR is not possible."""
vcf_in = VCF(input_vcf)
vcf_in.add_filter_to_header({
"ID": "LowQual", "Description": "QD<2 or FS>60 or MQ<40 or SOR>3"
})
vcf_out = Writer(output_vcf, vcf_in)
stats = {"total": 0, "pass": 0, "filtered": 0}
for variant in vcf_in:
stats["total"] += 1
qd = variant.INFO.get("QD", 999)
fs = variant.INFO.get("FS", 0)
mq = variant.INFO.get("MQ", 999)
sor = variant.INFO.get("SOR", 0)
filters = []
if variant.is_snp:
if qd < 2: filters.append("LowQD")
if fs > 60: filters.append("HighFS")
if mq < 40: filters.append("LowMQ")
if sor > 3: filters.append("HighSOR")
else: # Indels
if qd < 2: filters.append("LowQD")
if fs > 200: filters.append("HighFS")
if sor > 10: filters.append("HighSOR")
if filters:
variant.FILTER = filters
stats["filtered"] += 1
else:
stats["pass"] += 1
vcf_out.write_record(variant)
vcf_out.close()
vcf_in.close()
return stats
Population frequency filtering
import pandas as pd
def classify_variants(annotated_df: pd.DataFrame) -> pd.DataFrame:
"""Classify variants by clinical relevance."""
df = annotated_df.copy()
# Frequency filters
df["is_rare"] = df["gnomAD_AF"].fillna(0) < 0.01
df["is_ultra_rare"] = df["gnomAD_AF"].fillna(0) < 0.0001
# Consequence severity
severe = {"frameshift_variant", "stop_gained", "splice_donor_variant",
"splice_acceptor_variant", "start_lost"}
moderate = {"missense_variant", "inframe_deletion", "inframe_insertion"}
df["severity"] = df["consequence"].apply(
lambda c: "HIGH" if c in severe else "MODERATE" if c in moderate else "LOW"
)
# Prioritization score
df["priority"] = (
df["is_rare"].astype(int) * 2 +
df["severity"].map({"HIGH": 3, "MODERATE": 2, "LOW": 0}) +
df["CADD_phred"].fillna(0).gt(20).astype(int) * 2 +
df["ClinVar_CLNSIG"].str.contains("Pathogenic", na=False).astype(int) * 5
)
return df.sort_values("priority", ascending=False)
Population genetics with scikit-allel
For population-scale analysis across hundreds or thousands of genomes:
import allel
import numpy as np
# Load VCF into scikit-allel
callset = allel.read_vcf("population.vcf.gz",
fields=["variants/CHROM", "variants/POS",
"calldata/GT", "samples"])
gt = allel.GenotypeArray(callset["calldata/GT"])
samples = callset["samples"]
# Allele frequencies
ac = gt.count_alleles()
af = ac.to_frequencies()
# Principal Component Analysis for population structure
# Filter to common, biallelic SNPs
is_biallelic = ac.max_allele() == 1
is_common = af[:, 1] > 0.05
mask = is_biallelic & is_common
gt_filtered = gt[mask]
gn = gt_filtered.to_n_alt() # Convert to alt allele counts (0, 1, 2)
# PCA
coords, model = allel.pca(gn, n_components=10, scaler="patterson")
import matplotlib.pyplot as plt
plt.scatter(coords[:, 0], coords[:, 1], c=population_labels, cmap="tab10", s=10)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Population Structure")
plt.savefig("pca_population.png", dpi=150)
Genome-wide association study (GWAS) setup
def gwas_scan(genotypes: np.ndarray, phenotype: np.ndarray,
covariates: np.ndarray) -> np.ndarray:
"""Simple linear regression GWAS scan."""
from scipy import stats
n_variants = genotypes.shape[0]
p_values = np.zeros(n_variants)
for i in range(n_variants):
geno = genotypes[i]
valid = ~np.isnan(geno) & ~np.isnan(phenotype)
if valid.sum() < 30:
p_values[i] = 1.0
continue
# Residualize phenotype on covariates
X = np.column_stack([covariates[valid], geno[valid]])
slope, intercept, r, p, se = stats.linregress(geno[valid], phenotype[valid])
p_values[i] = p
return p_values
# Manhattan plot
def manhattan_plot(chroms, positions, p_values, threshold=5e-8):
fig, ax = plt.subplots(figsize=(16, 6))
neg_log_p = -np.log10(p_values)
colors = ["#1f77b4", "#aec7e8"]
x_offset = 0
x_ticks = []
for i, chrom in enumerate(sorted(set(chroms))):
mask = chroms == chrom
x = positions[mask] + x_offset
ax.scatter(x, neg_log_p[mask], c=colors[i % 2], s=2, alpha=0.5)
x_ticks.append((x.min() + x.max()) / 2)
x_offset = x.max() + 1e6
ax.axhline(-np.log10(threshold), color="red", linestyle="--", alpha=0.5)
ax.set_ylabel("-log10(p-value)")
ax.set_xlabel("Chromosome")
plt.savefig("manhattan.png", dpi=200, bbox_inches="tight")
Long-read sequencing analysis
Long reads from Nanopore and PacBio enable structural variant detection and haplotype phasing:
def detect_structural_variants_from_long_reads(bam_path: str,
min_sv_size: int = 50) -> list[dict]:
"""Detect SVs from CIGAR strings of long-read alignments."""
bam = pysam.AlignmentFile(bam_path, "rb")
svs = []
for read in bam:
if read.is_secondary or read.is_supplementary or read.mapping_quality < 20:
continue
ref_pos = read.reference_start
for op, length in read.cigartuples:
if op == 2 and length >= min_sv_size: # Deletion
svs.append({
"type": "DEL",
"chrom": read.reference_name,
"start": ref_pos,
"end": ref_pos + length,
"size": length,
"read": read.query_name,
})
elif op == 1 and length >= min_sv_size: # Insertion
svs.append({
"type": "INS",
"chrom": read.reference_name,
"pos": ref_pos,
"size": length,
"read": read.query_name,
})
if op in (0, 2, 3, 7, 8): # Operations that consume reference
ref_pos += length
bam.close()
return svs
Tradeoffs
| Pipeline component | Production choice | Alternative | When to switch |
|---|---|---|---|
| Workflow manager | Snakemake | Nextflow (nf-core) | Large team, cloud-native |
| Aligner (short) | BWA-MEM2 | DRAGEN (FPGA) | Clinical latency requirements |
| Aligner (long) | minimap2 | Winnowmap | Repetitive regions |
| Variant caller | GATK HaplotypeCaller | DeepVariant | Higher accuracy needed |
| VCF parsing | cyvcf2 | pysam.VariantFile | When already using pysam |
| Population genetics | scikit-allel | PLINK (via subprocess) | Standard GWAS workflows |
The one thing to remember: A production Python genomics pipeline chains Snakemake for workflow management, pysam/cyvcf2 for data access, GATK/DeepVariant for variant calling, and scikit-allel for population analysis — and the key engineering challenge is managing the terabyte-scale data flows while maintaining reproducibility across thousands of samples.
See Also
- Python Biopython Bioinformatics How Python helps scientists read the instruction manual hidden inside every living thing's DNA.
- Python Clinical Trial Analysis How Python helps scientists figure out whether a new medicine actually works by crunching the numbers from clinical trials.
- Python Drug Interaction Modeling How Python helps scientists figure out which medicines are safe to take together and which combinations could be dangerous.
- Python Medical Image Analysis How Python helps doctors see inside your body more clearly by teaching computers to read X-rays, MRIs, and CT scans.
- Python Pandemic Modeling How Python helps scientists predict the spread of diseases like COVID-19 and plan the best ways to slow them down.