NVIDIA Jetson Nano ML with Python — Deep Dive
Environment Setup
JetPack Installation
Flash JetPack to your Jetson using NVIDIA’s SDK Manager or the SD card image:
# Check the L4T release that the installed JetPack is based on
head -n 1 /etc/nv_tegra_release
# Typically: R35 (rev 4.1), GCID: ..., BOARD: t210ref
# NOTE(review): R35.x belongs to the JetPack 5 line, while BOARD t210ref is the
# original Jetson Nano, which is served by JetPack 4.x (reports R32.x) — confirm
# the expected output against the board you are actually flashing.
# Verify CUDA
nvcc --version
# cuda_11.4
# NOTE(review): JetPack 4.x ships CUDA 10.2; cuda_11.4 matches JetPack 5 — confirm.
# Check current memory / GPU utilisation.
# tegrastats keeps printing samples until it is killed, so waiting for it to
# exit (subprocess.run with a timeout) only ever raises TimeoutExpired. Read a
# single sample line instead and then stop the process.
python3 -c "
import subprocess
with subprocess.Popen(['tegrastats', '--interval', '1000'],
                      stdout=subprocess.PIPE, text=True) as proc:
    try:
        print(proc.stdout.readline()[:200])
    finally:
        proc.terminate()
"
Installing PyTorch for Jetson
NVIDIA provides pre-built PyTorch wheels for ARM + CUDA:
# Don't install the generic PyPI package (`pip install torch`) — the standard
# wheels don't include CUDA for ARM.
# Use NVIDIA's builds:
# NOTE(review): the "jp/v512" path segment and the "cp38" tag indicate a wheel
# built for JetPack 5.1.2 and CPython 3.8 — confirm that this matches the
# JetPack release and Python version installed on your board before using it.
wget https://developer.download.nvidia.com/compute/redist/jp/v512/pytorch/torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl
pip3 install torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl
# Verify GPU access: prints whether CUDA is usable, then the name of device 0
python3 -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))"
# Expected output:
# True
# NVIDIA Tegra X1
TensorRT Optimization Pipeline
Converting PyTorch to TensorRT
import torch
import tensorrt as trt
# Step 1: Export the PyTorch model to ONNX
model = load_your_trained_model()
model.eval().cuda()
dummy_input = torch.randn(1, 3, 224, 224).cuda()
# The batch dimension is exported with a fixed size of 1. A dynamic batch axis
# would additionally require an optimization profile on the builder config and
# an explicit input shape on the execution context at inference time; without
# them TensorRT cannot build the engine. The inference code allocates buffers
# for a single (1, 3, 224, 224) input anyway, so a static shape is sufficient.
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    opset_version=13,
    input_names=["input"],
    output_names=["output"],
)

# Step 2: Build the TensorRT engine
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)
with open("model.onnx", "rb") as f:
    if not parser.parse(f.read()):
        # Print every parser diagnostic before giving up
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("ONNX parsing failed")

config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 28)  # 256 MB
# Enable FP16 for ~2x speedup on Jetson Nano
config.set_flag(trt.BuilderFlag.FP16)

# Build the engine (this takes minutes — cache the result)
engine_bytes = builder.build_serialized_network(network, config)
# The builder reports failure by returning None rather than raising; stop here
# instead of writing an unusable engine file.
if engine_bytes is None:
    raise RuntimeError("TensorRT engine build failed")
with open("model.engine", "wb") as f:
    f.write(engine_bytes)
Running TensorRT Inference
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
# Load the serialized engine and create an execution context
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
with open("model.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())
# Deserialization signals failure by returning None (e.g. an engine built with
# a different TensorRT version); fail with a clear message instead of an
# AttributeError on the next line.
if engine is None:
    raise RuntimeError("Failed to deserialize model.engine")
context = engine.create_execution_context()

# Allocate GPU buffers
input_shape = (1, 3, 224, 224)
output_shape = (1, 1000)
# np.prod returns a NumPy integer, which pycuda's mem_alloc does not accept;
# convert to a built-in int before computing the byte count.
d_input = cuda.mem_alloc(int(np.prod(input_shape)) * 4)  # float32 = 4 bytes
d_output = cuda.mem_alloc(int(np.prod(output_shape)) * 4)
stream = cuda.Stream()
def infer(input_data: np.ndarray) -> np.ndarray:
    """Run a single inference on the TensorRT engine.

    Uses the module-level ``context``, ``stream``, ``d_input`` and ``d_output``
    objects created when the engine was loaded.

    Args:
        input_data: Array holding exactly ``prod(input_shape)`` values. It is
            converted to C-contiguous float32 before upload.

    Returns:
        A float32 array of shape ``output_shape`` with the raw model output.

    Raises:
        ValueError: If ``input_data`` does not contain the expected number of
            elements for the pre-allocated device buffer.
        RuntimeError: If TensorRT fails to enqueue the inference.
    """
    # The device buffer was sized for float32 elements of `input_shape`. Coerce
    # dtype and layout so exactly that many bytes are copied, and keep the
    # host array referenced until the stream has been synchronized.
    host_input = np.ascontiguousarray(input_data, dtype=np.float32)
    expected_size = int(np.prod(input_shape))
    if host_input.size != expected_size:
        raise ValueError(
            f"expected {expected_size} input values, got {host_input.size}"
        )
    output = np.empty(output_shape, dtype=np.float32)

    # Transfer input to GPU
    cuda.memcpy_htod_async(d_input, host_input, stream)
    # Execute
    enqueued = context.execute_async_v2(
        bindings=[int(d_input), int(d_output)],
        stream_handle=stream.handle,
    )
    if not enqueued:
        # Drain the pending upload before reporting the failure
        stream.synchronize()
        raise RuntimeError("TensorRT execution failed")
    # Transfer output back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    stream.synchronize()
    return output
Real-Time Video Pipeline
Camera + Inference with GStreamer
GStreamer handles hardware-accelerated camera capture, scaling, and color conversion on Jetson, freeing the CPU for other tasks:
import cv2
import numpy as np
import time
# GStreamer pipeline for the CSI camera: capture in NVMM (device) memory,
# scale/convert with nvvidconv, then hand BGR frames to OpenCV via appsink.
gst_pipeline = (
    "nvarguscamerasrc sensor-id=0 ! "
    "video/x-raw(memory:NVMM), width=1280, height=720, "
    "framerate=30/1, format=NV12 ! "
    "nvvidconv flip-method=0 ! "
    "video/x-raw, width=640, height=480, format=BGRx ! "
    "videoconvert ! "
    "video/x-raw, format=BGR ! appsink drop=1"
)
cap = cv2.VideoCapture(gst_pipeline, cv2.CAP_GSTREAMER)
if not cap.isOpened():
    raise RuntimeError("Failed to open camera")

frame_count = 0
# Monotonic clock: the FPS figure must not jump if the wall clock is adjusted
start_time = time.monotonic()
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess for model
        # NOTE(review): frames stay in BGR order and are only scaled to [0, 1];
        # confirm the model was trained with the same channel order and
        # normalisation.
        input_tensor = cv2.resize(frame, (224, 224))
        input_tensor = input_tensor.astype(np.float32) / 255.0
        input_tensor = np.transpose(input_tensor, (2, 0, 1))  # HWC -> CHW
        input_tensor = np.expand_dims(input_tensor, 0)  # Add batch dim

        # Run TensorRT inference
        output = infer(input_tensor)
        class_id = np.argmax(output[0])

        # Overlay result (the annotated frame is not displayed or saved here)
        cv2.putText(frame, f"Class: {class_id}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        frame_count += 1
        if frame_count % 100 == 0:
            fps = frame_count / (time.monotonic() - start_time)
            print(f"FPS: {fps:.1f}")
finally:
    # Always release the camera, including on Ctrl+C or an inference error,
    # so the capture pipeline is shut down and the sensor can be reopened.
    cap.release()
DeepStream for Multi-Stream Processing
NVIDIA’s DeepStream SDK handles multiple video streams with GPU-accelerated inference:
import gi
gi.require_version("Gst", "1.0")
from gi.repository import Gst, GLib
Gst.init(None)

# DeepStream pipeline handles:
# - Hardware video decode (NVDEC)
# - Batched inference (multiple streams -> one GPU batch)
# - Tracking, analytics, and output
pipeline_str = """
filesrc location=video.mp4 !
qtdemux ! h264parse ! nvv4l2decoder !
m.sink_0 nvstreammux name=m batch-size=1
width=1920 height=1080 !
nvinfer config-file-path=config_infer_primary.txt !
nvvideoconvert ! nvdsosd !
nvvideoconvert ! nv3dsink
"""
pipeline = Gst.parse_launch(pipeline_str)
loop = GLib.MainLoop()


def _on_bus_message(bus, message):
    """Quit the main loop when the pipeline reaches end-of-stream or fails."""
    if message.type == Gst.MessageType.EOS:
        loop.quit()
    elif message.type == Gst.MessageType.ERROR:
        err, debug = message.parse_error()
        print(f"Pipeline error: {err} ({debug})")
        loop.quit()


# Without a bus watch the main loop would keep running after the file has
# finished playing or after an element has reported an error.
bus = pipeline.get_bus()
bus.add_signal_watch()
bus.connect("message", _on_bus_message)

pipeline.set_state(Gst.State.PLAYING)
try:
    loop.run()
finally:
    # Return the pipeline to NULL so its elements release their resources
    pipeline.set_state(Gst.State.NULL)
Memory Management on 4 GB
Monitoring Memory
import subprocess
def get_jetson_stats(interval_ms: int = 1000) -> str:
    """Return one raw sample line printed by ``tegrastats``.

    The line is returned unparsed, exactly as read from the tool's standard
    output (including the trailing newline, if any).

    Args:
        interval_ms: Sampling interval passed to ``tegrastats --interval``,
            in milliseconds. Must be positive.

    Returns:
        The first line written by ``tegrastats``.

    Raises:
        ValueError: If ``interval_ms`` is not positive.
        FileNotFoundError: If the ``tegrastats`` executable is not available.
    """
    if interval_ms <= 0:
        raise ValueError("interval_ms must be a positive number of milliseconds")
    # The context manager closes the pipe and waits for the child on exit, so
    # the terminated process is reaped instead of being left as a zombie.
    with subprocess.Popen(
        ["tegrastats", "--interval", str(interval_ms)],
        stdout=subprocess.PIPE,
        text=True,
    ) as proc:
        try:
            return proc.stdout.readline()
        finally:
            # tegrastats never exits on its own; stop it after one sample
            proc.terminate()
# Or use jtop (pip install jetson-stats)
# jtop provides a htop-like interface for Jetson
Memory Optimization Strategies
# Illustrative fragments: `model`, `input_tensor`, `config`, `trt` and `np` are
# expected to come from the surrounding application code.
# 1. Run headless — saves ~400 MB
# sudo systemctl set-default multi-user.target
# 2. Increase swap (JetPack default is often 2 GB)
# sudo fallocate -l 4G /swapfile
# sudo chmod 600 /swapfile
# sudo mkswap /swapfile
# sudo swapon /swapfile
# 3. Use FP16 everywhere
# NOTE(review): .half() is presumably the PyTorch method, so `model` and
# `input_tensor` are assumed to be a torch module and tensor here (a NumPy
# array would need .astype(np.float16) instead) — confirm against the caller.
model = model.half() # Convert model to FP16
input_tensor = input_tensor.half()
# 4. Limit TensorRT workspace
# `config` is assumed to be the TensorRT builder config used for the engine build.
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 26) # 64 MB
# 5. Process frames in-place to avoid copies
# Pre-allocated 480x640 3-channel uint8 buffer.
frame_buffer = np.zeros((480, 640, 3), dtype=np.uint8)
# Reuse this buffer instead of allocating new arrays each frame
Power Mode Configuration
# Query the currently active power mode
sudo nvpmodel -q
# Switch to mode 1: 5W (2 CPU cores, lower clocks)
sudo nvpmodel -m 1
# Switch to mode 0: 10W (4 CPU cores, higher clocks)
sudo nvpmodel -m 0
# Maximize clocks within the current power mode
sudo jetson_clocks
INT8 Calibration for Maximum Performance
import tensorrt as trt
import numpy as np
class CalibrationDataset:
    """Preloaded list of calibration batches plus a read cursor.

    Attributes are plain and public because the calibrator reads them directly:
    ``batches`` is the list of float32 arrays and ``batch_idx`` is the index of
    the next batch to hand out (starts at 0).
    """

    def __init__(self, data_dir, batch_size=8, num_batches=50):
        """Load ``num_batches`` batches of ``batch_size`` items from ``data_dir``."""
        self.batches = self._load_batches(data_dir, batch_size, num_batches)
        self.batch_idx = 0

    def _load_batches(self, data_dir, batch_size, num_batches):
        """Return ``num_batches`` preprocessed batches as float32 arrays.

        Each batch comes from ``load_and_preprocess_batch(data_dir, batch_size)``
        and is copied into a C-contiguous float32 array. The bytes are later
        uploaded to the GPU verbatim, so a transposed or otherwise strided
        array would reach the device in the wrong element order.
        """
        return [
            np.array(
                load_and_preprocess_batch(data_dir, batch_size),
                dtype=np.float32,
                order="C",
            )
            for _ in range(num_batches)
        ]
class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """Entropy calibrator that feeds batches from a dataset object to TensorRT.

    Relies on ``cuda`` (``pycuda.driver``) being imported in the enclosing
    module. The dataset must expose ``batches`` (a non-empty list of arrays)
    and ``batch_idx`` (the index of the next batch to serve).
    """

    def __init__(self, dataset, cache_file="calibration.cache"):
        """Store the dataset and cache path and allocate one device buffer.

        The device buffer is sized from the first batch and reused for every
        batch that is uploaded.
        """
        super().__init__()
        self.dataset = dataset
        self.cache_file = cache_file
        first_batch = dataset.batches[0]
        self.d_input = cuda.mem_alloc(first_batch.nbytes)

    def get_batch_size(self):
        """Return the length of the first batch's leading dimension."""
        first_batch = self.dataset.batches[0]
        return first_batch.shape[0]

    def get_batch(self, names):
        """Upload the next batch and return its device pointer.

        Returns a one-element list holding the device address, or ``None``
        once every batch has been served.
        """
        source = self.dataset
        position = source.batch_idx
        if position < len(source.batches):
            cuda.memcpy_htod(self.d_input, source.batches[position])
            source.batch_idx += 1
            return [int(self.d_input)]
        return None

    def read_calibration_cache(self):
        """Return the cached calibration bytes, or ``None`` if no cache file exists."""
        try:
            cache_fh = open(self.cache_file, "rb")
        except FileNotFoundError:
            return None
        with cache_fh:
            return cache_fh.read()

    def write_calibration_cache(self, cache):
        """Write the calibration data handed over by TensorRT to the cache file."""
        with open(self.cache_file, "wb") as cache_fh:
            cache_fh.write(cache)
# Use in engine build: enable INT8 and attach a calibrator that reads its
# batches from the "cal_data/" directory.
config.set_flag(trt.BuilderFlag.INT8)
calibration_data = CalibrationDataset("cal_data/")
config.int8_calibrator = Int8Calibrator(calibration_data)
Production Deployment Checklist
- Model optimized with TensorRT (FP16 minimum, INT8 if accuracy allows)
- Power mode set appropriately (5W for battery, 10W for mains power)
- Running headless to maximize available RAM
- Swap configured (4-8 GB for safety)
- Thermal monitoring with automatic throttle detection
- Watchdog process to restart inference on OOM or crash
- Camera pipeline uses GStreamer hardware decode (not OpenCV software decode)
- Model loading cached — TensorRT engine build is slow, serialize and reload
- systemd service for auto-start on boot
- Logging inference latency P95/P99 for anomaly detection
The one thing to remember: Production Jetson Nano ML combines TensorRT engine optimization, GStreamer hardware-accelerated video, aggressive memory management (headless + FP16 + swap), and power mode tuning — the gap between a working demo and a reliable deployed system is entirely in these optimizations.
See Also
- Python Coral TPU Inference — Why a tiny USB stick can make AI predictions faster than a powerful laptop — and how Python programmers use it.
- Python Edge Impulse Integration — How a friendly online platform helps Python developers teach tiny devices to hear, see, and feel — without being an AI expert.
- Python TFLite Edge Deployment — How Python developers shrink smart AI brains to fit inside tiny devices like phones, cameras, and sensors.
- Activation Functions — Why neural networks need these tiny mathematical functions — and how ReLU's simplicity accidentally made deep learning possible.
- AI Agents Architecture — How AI systems go from answering questions to actually doing things — the design patterns that turn language models into autonomous agents that browse, code, and plan.