NVIDIA Jetson Nano ML with Python — Deep Dive

Build production ML pipelines on Jetson Nano with TensorRT, DeepStream, CUDA memory management, and real-time video inference in Python.

Environment Setup

JetPack Installation

Flash JetPack to your Jetson using NVIDIA’s SDK Manager or the SD card image:

# Check JetPack version
head -1 /etc/nv_tegra_release
# Typically on a Nano: R32 (release), REVISION: 7.x, GCID: ..., BOARD: t210ref

# Verify CUDA
nvcc --version
# release 10.2 (JetPack 4.x, the last JetPack line that supports the original Nano)

# Check available GPU memory
python3 -c "
import subprocess
# tegrastats streams samples until it is killed, so subprocess.run() with a
# timeout would raise TimeoutExpired instead of returning any output.
# Read a single sample line and then stop the process explicitly.
proc = subprocess.Popen(['tegrastats'], stdout=subprocess.PIPE, text=True)
try:
    print(proc.stdout.readline()[:200])
finally:
    proc.terminate()
    proc.wait()
"

Installing PyTorch for Jetson

NVIDIA provides pre-built PyTorch wheels for ARM + CUDA:

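# Don't `pip3 install torch` from PyPI — the standard aarch64 wheels are built without CUDA
# Use NVIDIA's builds instead. NOTE: the wheel must match your JetPack and Python version.
# The jp/v512 build below targets JetPack 5.1.2 (Python 3.8); the original Nano stops at
# JetPack 4.6.x (Python 3.6), so pick the matching wheel from NVIDIA's "PyTorch for Jetson" forum thread.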
wget https://developer.download.nvidia.com/compute/redist/jp/v512/pytorch/torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl

pip3 install torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl

# Verify GPU access
python3 -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))"
# True
# NVIDIA Tegra X1

TensorRT Optimization Pipeline

Converting PyTorch to TensorRT

import torch
import tensorrt as trt

# Step 1: Export PyTorch model to ONNX
model = load_your_trained_model()
model.eval().cuda()

dummy_input = torch.randn(1, 3, 224, 224).cuda()
# Export with a fixed (1, 3, 224, 224) input. A dynamic batch axis would
# require a TensorRT optimization profile at build time and an explicit
# binding shape at inference time; without them the engine build fails.
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
)

# Step 2: Build TensorRT engine
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)

with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        # Print every parser diagnostic before aborting.
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("ONNX parsing failed")

config = builder.create_builder_config()
workspace_bytes = 1 << 28  # 256 MB
if hasattr(config, "set_memory_pool_limit"):
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
else:
    # Older TensorRT releases (before 8.4) expose the workspace limit as an
    # attribute instead of a memory-pool setting.
    config.max_workspace_size = workspace_bytes

# Enable FP16 for ~2x speedup on Jetson Nano (only where the GPU supports it)
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)

# Build the engine (this takes minutes — cache the result)
engine_bytes = builder.build_serialized_network(network, config)
if engine_bytes is None:
    # A failed build returns None; fail loudly instead of writing a bad file.
    raise RuntimeError("TensorRT engine build failed")

with open("model.engine", "wb") as f:
    f.write(engine_bytes)

Running TensorRT Inference

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np

# Load engine
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)

with open("model.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

if engine is None:
    # Deserialization reports failure by returning None (e.g. the engine was
    # built with a different TensorRT version or for a different GPU).
    raise RuntimeError("Failed to deserialize model.engine")

context = engine.create_execution_context()

# Allocate GPU buffers
input_shape = (1, 3, 224, 224)
output_shape = (1, 1000)

# np.prod() returns a NumPy integer; mem_alloc() needs a plain Python int.
d_input = cuda.mem_alloc(int(np.prod(input_shape)) * 4)   # float32
d_output = cuda.mem_alloc(int(np.prod(output_shape)) * 4)
stream = cuda.Stream()

def infer(input_data: np.ndarray) -> np.ndarray:
    """Run a single inference on the TensorRT engine.

    Args:
        input_data: Array holding exactly ``prod(input_shape)`` elements.
            It is converted to contiguous float32 before upload.

    Returns:
        A new float32 array of shape ``output_shape`` with the engine output.

    Raises:
        ValueError: If ``input_data`` does not contain the number of elements
            the preallocated device input buffer was sized for.
    """
    # Keep a named reference to the contiguous float32 copy so it stays alive
    # until the stream has been synchronized.
    host_input = np.ascontiguousarray(input_data, dtype=np.float32).ravel()

    expected_size = int(np.prod(input_shape))
    if host_input.size != expected_size:
        # d_input has a fixed size; a larger array would overrun it.
        raise ValueError(
            f"Expected {expected_size} input elements for shape {input_shape}, "
            f"got {host_input.size}"
        )

    output = np.empty(output_shape, dtype=np.float32)

    # Transfer input to GPU
    cuda.memcpy_htod_async(d_input, host_input, stream)

    # Execute
    context.execute_async_v2(
        bindings=[int(d_input), int(d_output)],
        stream_handle=stream.handle
    )

    # Transfer output back, then wait for all queued work on the stream
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()

    return output

Real-Time Video Pipeline

Camera + Inference with GStreamer

GStreamer handles hardware-accelerated camera capture, scaling, and color conversion on Jetson, freeing the CPU for other tasks:

import cv2
import numpy as np
import time

# GStreamer pipeline for a CSI camera: nvarguscamerasrc captures into NVMM
# memory and nvvidconv scales/converts before the frame reaches OpenCV as BGR.
gst_pipeline = (
    "nvarguscamerasrc sensor-id=0 ! "
    "video/x-raw(memory:NVMM), width=1280, height=720, "
    "framerate=30/1, format=NV12 ! "
    "nvvidconv flip-method=0 ! "
    "video/x-raw, width=640, height=480, format=BGRx ! "
    "videoconvert ! "
    "video/x-raw, format=BGR ! appsink drop=1"
)

cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)

if not cap.isOpened():
    raise RuntimeError("Failed to open camera")

frame_count = 0
start_time = time.time()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess for model
        # NOTE: frames arrive in BGR channel order. If the model was trained
        # on RGB images, convert first with
        # cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) and apply the same
        # normalization that was used during training.
        input_tensor = cv2.resize(frame, (224, 224))
        input_tensor = input_tensor.astype(np.float32) / 255.0
        input_tensor = np.transpose(input_tensor, (2, 0, 1))  # HWC -> CHW
        input_tensor = np.expand_dims(input_tensor, 0)         # Add batch dim

        # Run TensorRT inference
        output = infer(input_tensor)
        class_id = np.argmax(output[0])

        # Overlay result
        cv2.putText(frame, f"Class: {class_id}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        frame_count += 1
        if frame_count % 100 == 0:
            # Average FPS since the loop started
            fps = frame_count / (time.time() - start_time)
            print(f"FPS: {fps:.1f}")
except KeyboardInterrupt:
    # Ctrl+C is the normal way to stop this otherwise endless loop.
    pass
finally:
    # Always release the camera, including on Ctrl+C or an inference error.
    cap.release()

DeepStream for Multi-Stream Processing

NVIDIA’s DeepStream SDK handles multiple video streams with GPU-accelerated inference:

import gi
gi.require_version("Gst", "1.0")
from gi.repository import Gst, GLib

Gst.init(None)

# DeepStream pipeline handles:
# - Hardware video decode (NVDEC)
# - Batched inference (multiple streams -> one GPU batch)
# - Tracking, analytics, and output

pipeline_str = """
    filesrc location=video.mp4 !
    qtdemux ! h264parse ! nvv4l2decoder !
    m.sink_0 nvstreammux name=m batch-size=1
        width=1920 height=1080 !
    nvinfer config-file-path=config_infer_primary.txt !
    nvvideoconvert ! nvdsosd !
    nvvideoconvert ! nv3dsink
"""

pipeline = Gst.parse_launch(pipeline_str)
loop = GLib.MainLoop()


def _on_bus_message(bus, message):
    """Stop the main loop when the pipeline reaches end-of-stream or fails."""
    if message.type == Gst.MessageType.EOS:
        loop.quit()
    elif message.type == Gst.MessageType.ERROR:
        err, debug = message.parse_error()
        print(f"Pipeline error: {err} ({debug})")
        loop.quit()


# Without a bus watch the main loop would keep running after the file ends
# or after an element reports an error.
bus = pipeline.get_bus()
bus.add_signal_watch()
bus.connect("message", _on_bus_message)

pipeline.set_state(Gst.State.PLAYING)

try:
    loop.run()
except KeyboardInterrupt:
    pass
finally:
    # Return the pipeline to NULL so its elements release their resources.
    pipeline.set_state(Gst.State.NULL)

Memory Management on 4 GB

Monitoring Memory

import subprocess

def get_jetson_stats() -> str:
    """Return one raw sample line from ``tegrastats``.

    Starts ``tegrastats`` with a 1000 ms interval, reads the first line it
    prints, and stops the process. The line is returned unparsed.

    Returns:
        The first line written by ``tegrastats`` (empty string if the
        process closed its output without printing anything).
    """
    # The context manager closes the pipe and waits for the child on exit,
    # so the terminated process is reaped instead of lingering as a zombie.
    with subprocess.Popen(
        ["tegrastats", "--interval", "1000"],
        stdout=subprocess.PIPE,
        text=True
    ) as proc:
        try:
            return proc.stdout.readline()
        finally:
            # tegrastats never exits by itself; stop it after one sample.
            proc.terminate()

# Or use jtop (pip install jetson-stats)
# jtop provides a htop-like interface for Jetson

Memory Optimization Strategies

# 1. Run headless — saves ~400 MB
# sudo systemctl set-default multi-user.target

# 2. Increase swap (JetPack default is often 2 GB)
# sudo fallocate -l 4G /swapfile
# sudo chmod 600 /swapfile
# sudo mkswap /swapfile
# sudo swapon /swapfile

# 3. Use FP16 everywhere
# NOTE(review): .half() is called on both objects, so this fragment assumes
# `model` and `input_tensor` are PyTorch objects — not the NumPy input_tensor
# built in the camera loop. Confirm against the surrounding code.
model = model.half()  # Convert model to FP16
input_tensor = input_tensor.half()

# 4. Limit TensorRT workspace
# NOTE(review): `config` is presumably the TensorRT builder config created
# during the engine build; set_memory_pool_limit needs a TensorRT release
# that provides it (older releases use config.max_workspace_size) — verify.
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 26)  # 64 MB

# 5. Process frames in-place to avoid copies
# (480, 640, 3) uint8 is the height x width x channels layout of the
# 640x480 BGR frames requested in the camera pipeline.
frame_buffer = np.zeros((480, 640, 3), dtype=np.uint8)
# Reuse this buffer instead of allocating new arrays each frame

Power Mode Configuration

# Check current power mode
sudo nvpmodel -q

# Set 5W mode (2 CPU cores, lower clocks)
sudo nvpmodel -m 1

# Set 10W mode (4 CPU cores, higher clocks)
sudo nvpmodel -m 0

# Maximize clocks within current power mode
sudo jetson_clocks

INT8 Calibration for Maximum Performance

import tensorrt as trt
import numpy as np

class CalibrationDataset:
    """Holds preloaded float32 calibration batches and a read cursor.

    Attributes are ``batches`` (list of arrays, loaded eagerly in the
    constructor) and ``batch_idx`` (index of the next batch to hand out).
    """

    def __init__(self, data_dir, batch_size=8, num_batches=50):
        self.batches = self._load_batches(data_dir, batch_size, num_batches)
        self.batch_idx = 0

    def _load_batches(self, data_dir, batch_size, num_batches):
        """Load ``num_batches`` batches from ``data_dir`` as float32 arrays."""
        # One helper call per requested batch; each result is cast to float32.
        return [
            load_and_preprocess_batch(data_dir, batch_size).astype(np.float32)
            for _ in range(num_batches)
        ]

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds preloaded batches to TensorRT.

    Args:
        dataset: Object exposing ``batches`` (a non-empty list of NumPy
            arrays of identical byte size) and a mutable ``batch_idx``.
        cache_file: Path used to read and write the calibration cache.

    Raises:
        ValueError: If ``dataset`` has no batches, or (from ``get_batch``)
            if a batch does not match the size of the first batch.
    """

    def __init__(self, dataset, cache_file="calibration.cache"):
        super().__init__()
        if not dataset.batches:
            raise ValueError("Calibration dataset contains no batches")
        self.dataset = dataset
        self.cache_file = cache_file
        # One device buffer, sized for the first batch, is reused for all.
        self._batch_nbytes = dataset.batches[0].nbytes
        self.d_input = cuda.mem_alloc(self._batch_nbytes)

    def get_batch_size(self):
        """Return the size of the first dimension of the first batch."""
        return self.dataset.batches[0].shape[0]

    def get_batch(self, names):
        """Upload the next batch and return its device pointer list.

        Returns ``None`` once every batch has been consumed.
        """
        if self.dataset.batch_idx >= len(self.dataset.batches):
            return None
        # Force a C-contiguous layout so the raw byte copy matches the
        # logical element order of the batch.
        batch = np.ascontiguousarray(
            self.dataset.batches[self.dataset.batch_idx]
        )
        if batch.nbytes != self._batch_nbytes:
            # The device buffer has a fixed size; refuse to over- or under-fill.
            raise ValueError(
                f"Calibration batch {self.dataset.batch_idx} has "
                f"{batch.nbytes} bytes, expected {self._batch_nbytes}"
            )
        cuda.memcpy_htod(self.d_input, batch)
        self.dataset.batch_idx += 1
        return [int(self.d_input)]

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache exists."""
        try:
            with open(self.cache_file, "rb") as f:
                return f.read()
        except FileNotFoundError:
            return None

    def write_calibration_cache(self, cache):
        """Write the calibration cache to ``cache_file``."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)

# Use in engine build
# Only request INT8 where the GPU has a fast INT8 path. The original Jetson
# Nano's Maxwell GPU does not, so on that board the build stays at FP16.
if builder.platform_has_fast_int8:
    config.set_flag(trt.BuilderFlag.INT8)
    config.int8_calibrator = Int8Calibrator(CalibrationDataset("cal_data/"))
else:
    print("Fast INT8 is not available on this GPU; skipping INT8 calibration")

Production Deployment Checklist

The one thing to remember: Production Jetson Nano ML combines TensorRT engine optimization, GStreamer hardware-accelerated video, aggressive memory management (headless + FP16 + swap), and power mode tuning — the gap between a working demo and a reliable deployed system is entirely in these optimizations.

Tags: python, machine-learning, edge-computing