TensorFlow TensorBoard — Deep Dive
Architecture of TensorBoard
TensorBoard consists of a Python backend that reads event files and a TypeScript/Polymer frontend that renders dashboards. Event files use Protocol Buffer serialization with a sequential format:
[length (uint64)][masked CRC-32C of length][data][masked CRC-32C of data]
Each record contains a tf.Event proto with a wall-clock timestamp, step number, and one of several payload types (summary, graph_def, session_log). The backend serves these via a REST API that the frontend consumes.
Understanding this architecture matters when you need to write custom logging, parse logs programmatically, or deploy TensorBoard in production environments.
Advanced Logging Techniques
Custom Training Loop Integration
import tensorflow as tf
# Separate log directories and summary writers for training and validation.
train_log_dir, val_log_dir = "logs/gradient_tape/train", "logs/gradient_tape/val"
train_writer, val_writer = (
    tf.summary.create_file_writer(log_dir)
    for log_dir in (train_log_dir, val_log_dir)
)
@tf.function
def train_step(model, optimizer, x, y, step):
    """Run one optimization step and write training summaries for it.

    Summaries (loss, learning rate, per-variable gradient histograms and norms,
    per-variable weight histograms) are written through ``train_writer``.

    Args:
        model: Callable model exposing ``trainable_variables``.
        optimizer: Optimizer exposing ``apply_gradients`` and ``learning_rate``.
        x: Input batch fed to the model.
        y: Targets passed to ``loss_fn`` (defined outside this block).
        step: Step value under which the summaries are recorded.
            NOTE(review): a plain Python int here is likely to make
            ``tf.function`` retrace for every new value; passing an int64
            tensor should avoid that -- confirm against the callers.

    Returns:
        The loss computed for this batch.
    """
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_fn(y, predictions)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    # Log detailed training information
    with train_writer.as_default(step=step):
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", optimizer.learning_rate)

        # Gradient statistics per layer
        for var, grad in zip(variables, gradients):
            # tape.gradient yields None for a variable the loss does not depend
            # on; there is nothing to summarise for it.
            if grad is None:
                continue
            tf.summary.histogram(f"gradients/{var.name}", grad)
            tf.summary.scalar(f"gradient_norm/{var.name}", tf.norm(grad))

        # Weight statistics
        for var in variables:
            tf.summary.histogram(f"weights/{var.name}", var)

    return loss
Logging Images with Annotations
def log_predictions(model, test_images, test_labels, step):
    """Log a 4x4 grid of annotated test predictions as one image summary.

    Each cell shows an image titled with its predicted and true class name,
    green when they agree and red otherwise. Batches with fewer than 16 samples
    leave the remaining cells blank instead of failing on a missing index.

    Relies on ``plt``, ``io``, ``class_names`` and ``val_writer`` being
    available in the enclosing scope.

    Args:
        model: Callable model returning per-class scores of shape
            (batch, classes) -- assumed from the ``argmax`` over axis 1.
        test_images: Indexable batch of images accepted by ``imshow``.
        test_labels: Indexable batch of integer class ids.
        step: Step value under which the image summary is recorded.
    """
    predictions = model(test_images, training=False)
    pred_labels = tf.argmax(predictions, axis=1)
    num_samples = len(test_images)

    # Create annotated visualization
    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    buf = io.BytesIO()
    try:
        for i, ax in enumerate(axes.flat):
            if i < num_samples:
                predicted, actual = int(pred_labels[i]), int(test_labels[i])
                ax.imshow(test_images[i])
                ax.set_title(
                    f"Pred: {class_names[predicted]}\n"
                    f"True: {class_names[actual]}",
                    color="green" if predicted == actual else "red",
                )
            ax.axis("off")
        fig.savefig(buf, format="png")
    finally:
        # Release the figure even when drawing or encoding fails.
        plt.close(fig)

    # getvalue() returns the whole buffer regardless of the stream position.
    image = tf.image.decode_png(buf.getvalue(), channels=4)
    image = tf.expand_dims(image, 0)  # add the leading batch dimension
    with val_writer.as_default(step=step):
        tf.summary.image("predictions", image)
Logging Text and Markdown
# Log experiment configuration as a Markdown table (header, separator, one row
# per setting).
config_rows = [
    ("Learning Rate", lr),
    ("Batch Size", batch_size),
    ("Architecture", model_name),
    ("Optimizer", optimizer_name),
]
config_table = "| Parameter | Value |\n|---|---|\n" + "".join(
    f"| {label} | {value} |\n" for label, value in config_rows
)
with train_writer.as_default(step=0):
    tf.summary.text("experiment_config", config_table)
HParams Dashboard — Hyperparameter Tuning
The HParams plugin lets you log and compare hyperparameter configurations:
from tensorboard.plugins.hparams import api as hp
# Define hyperparameter space
# Search space: one HParam per tunable setting.
HP_LR = hp.HParam("learning_rate", hp.RealInterval(1e-5, 1e-1))
HP_DROPOUT = hp.HParam("dropout", hp.RealInterval(0.1, 0.5))
HP_OPTIMIZER = hp.HParam("optimizer", hp.Discrete(["adam", "sgd", "rmsprop"]))

# Summary tags of the metrics registered with the dashboard.
METRIC_ACCURACY = "epoch_accuracy"
METRIC_LOSS = "epoch_loss"

# Log experiment configuration at the root of the tuning log directory.
with tf.summary.create_file_writer("logs/hparams_tuning").as_default():
    hp.hparams_config(
        hparams=[HP_LR, HP_DROPOUT, HP_OPTIMIZER],
        metrics=[
            hp.Metric(tag, display_name=label)
            for tag, label in (
                (METRIC_ACCURACY, "Accuracy"),
                (METRIC_LOSS, "Loss"),
            )
        ],
    )
def run_experiment(run_dir, hparams):
    """Train and evaluate one hyperparameter configuration and log the trial.

    Args:
        run_dir: Directory that receives this trial's summaries.
        hparams: Mapping keyed by ``HP_LR``, ``HP_DROPOUT`` and
            ``HP_OPTIMIZER`` holding the values to use for this trial.
    """
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)  # record the values used in this trial

        model = build_model(
            dropout_rate=hparams[HP_DROPOUT],
            optimizer=hparams[HP_OPTIMIZER],
            learning_rate=hparams[HP_LR],
        )
        model.fit(
            train_data,
            epochs=20,
            validation_data=val_data,
            callbacks=[tf.keras.callbacks.TensorBoard(run_dir)],
        )

        # Write both metrics registered in hparams_config; the loss used to be
        # discarded, which left METRIC_LOSS without any logged value.
        # Assumes evaluate() returns exactly (loss, accuracy) -- confirm against
        # the metrics build_model compiles with.
        loss, accuracy = model.evaluate(test_data)
        tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
        tf.summary.scalar(METRIC_LOSS, loss, step=1)
# Run grid search
# Build the grid in a comprehension so its loop variables stay local: plain
# module-level loops would rebind `lr` and `optimizer`, names that other
# snippets in this file use for different objects.
search_grid = [
    {HP_LR: lr, HP_DROPOUT: dropout, HP_OPTIMIZER: optimizer}
    for lr in [1e-3, 1e-4, 1e-5]
    for dropout in [0.1, 0.2, 0.3]
    for optimizer in ["adam", "sgd"]
]

# One run directory per combination, numbered in grid order.
for session_num, hparams in enumerate(search_grid):
    run_name = f"run-{session_num}"
    run_experiment(f"logs/hparams_tuning/{run_name}", hparams)
The HParams dashboard shows a parallel coordinates plot and a table view, making it easy to identify which hyperparameter combinations produce the best metrics.
Embedding Projector
Visualize high-dimensional embeddings in 2D/3D using PCA, t-SNE, or UMAP:
import os

import numpy as np
from tensorboard.plugins import projector

EMBEDDING_LOG_DIR = "logs/embeddings"

# Neither np.savetxt nor open() creates missing parent directories.
os.makedirs(EMBEDDING_LOG_DIR, exist_ok=True)

# Save embeddings as tab-separated rows, one vector per line.
embedding_data = model.get_layer("embedding").get_weights()[0]
np.savetxt(
    os.path.join(EMBEDDING_LOG_DIR, "vectors.tsv"),
    embedding_data,
    delimiter="\t",
)

# Save metadata (labels for each point).
# NOTE(review): presumably `vocabulary` must list one word per embedding row,
# in row order -- confirm against how the embedding layer was built.
# An explicit encoding keeps non-ASCII words independent of the platform default.
with open(
    os.path.join(EMBEDDING_LOG_DIR, "metadata.tsv"), "w", encoding="utf-8"
) as f:
    f.writelines(f"{word}\n" for word in vocabulary)

# Configure projector: point it at the two files written above.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_path = "vectors.tsv"
embedding.metadata_path = "metadata.tsv"
projector.visualize_embeddings(EMBEDDING_LOG_DIR, config)
The projector is invaluable for NLP (visualizing word clusters), recommendation systems (understanding item similarity), and debugging classification models (checking if classes form distinct clusters).
Profiler Deep Dive
Capturing Detailed Profiles
# Profile specific batch range
profiling_options = dict(
    log_dir="./logs/profiled_run",
    histogram_freq=1,
    profile_batch="10,20",  # first and last batch of the profiled range
)
tb_callback = tf.keras.callbacks.TensorBoard(**profiling_options)
model.fit(train_data, callbacks=[tb_callback], epochs=10)
Programmatic Profiling
# Start/stop profiling manually
# The Profile context manager stops the profiler on exit, including when a
# training step raises, so a failed run cannot leave a session running.
with tf.profiler.experimental.Profile("./logs/manual_profile"):
    for step, (x, y) in enumerate(train_dataset):
        train_step(model, optimizer, x, y, step)
        if step == 50:  # steps 0 through 50 are profiled
            break
Reading the Profiler Output
The profiler generates several views:
Overview Page:
- Step-time breakdown: compute vs. input vs. host-to-device transfer
- Recommendation engine suggests specific optimizations
Trace Viewer:
- Timeline of all operations on CPU and GPU
- Shows overlap between data pipeline and model execution
- Identifies gaps where the GPU sits idle
GPU Kernel Stats:
- Time spent in each GPU kernel
- Occupancy percentage (how well each kernel uses GPU cores)
- Memory bandwidth utilization
Memory Profile:
- Peak memory allocation
- Memory timeline showing allocations and deallocations
- Identifies operations that cause memory spikes
Acting on Profiler Insights
| Profiler Finding | Action |
|---|---|
| "Input bound" warning | Add prefetch, parallel map, or cache to data pipeline |
| Low GPU occupancy | Increase batch size or use mixed precision |
| Host-to-device transfer bottleneck | Pin data to GPU memory, use prefetch_to_device |
| Memory near limit | Enable gradient checkpointing, reduce batch size |
| Many small GPU kernels | Ensure tf.function tracing, use XLA compilation |
Remote and Team TensorBoard
TensorBoard.dev (Hosted)
# Upload experiment to TensorBoard.dev
tensorboard dev upload \
    --logdir=./logs \
    --name="Experiment v2.3"
# Output: https://tensorboard.dev/experiment/xxxxxxxx/
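Shared via URL — anyone can view without installing anything. Free but public (do not upload proprietary data). Note that Google discontinued the hosted TensorBoard.dev service at the start of 2024, so the upload command above no longer works; for new projects, use a self-hosted TensorBoard as described below.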
Self-Hosted for Teams
# Serve TensorBoard on a shared server
# Listen on all interfaces, port 6006; rescan the log directory every 30 seconds.
tensorboard \
    --logdir gs://team-bucket/experiments \
    --host 0.0.0.0 \
    --port 6006 \
    --reload_interval 30
# With authentication (via reverse proxy)
# Use nginx/Caddy with OAuth2 proxy in front of TensorBoard
Docker Deployment
# NOTE(review): `latest` is a moving tag; consider pinning a specific image
# version so rebuilds are reproducible.
FROM tensorflow/tensorflow:latest
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir tensorboard
EXPOSE 6006
# Serve the mounted /logs directory on all interfaces (default port 6006).
ENTRYPOINT ["tensorboard", "--logdir=/logs", "--host=0.0.0.0"]
# docker-compose.yml
services:
  tensorboard:
    build: .
    restart: unless-stopped
    ports: ["6006:6006"]
    # Logs are mounted read-only.
    volumes: ["./logs:/logs:ro"]
Custom Plugins
Build domain-specific dashboards:
# Register a custom scalar plugin
from tensorboard.plugins.scalar import metadata as scalar_metadata
# Custom summary with plugin-specific metadata
# Tag the summary with the name of the plugin that should pick it up.
_plugin_data = tf.compat.v1.SummaryMetadata.PluginData(plugin_name="custom_plugin")
summary_metadata = tf.compat.v1.SummaryMetadata(plugin_data=_plugin_data)
For complex custom plugins, implement the backend as a subclass of `base_plugin.TBPlugin` and the frontend as a TypeScript/Polymer component. This is how teams build specialized dashboards for fairness metrics, data quality monitoring, or domain-specific visualizations.
Production Logging Patterns
Efficient Logging (Avoid Slowdowns)
# Log every N steps instead of every step
LOG_FREQUENCY = 100  # write the scalar loss every 100th step

for step, (x, y) in enumerate(train_dataset):
    # train_step (defined earlier in this file) takes the step as its fifth
    # argument; calling it without one raises TypeError.
    loss = train_step(model, optimizer, x, y, step)

    if step % LOG_FREQUENCY == 0:
        with train_writer.as_default(step=step):
            tf.summary.scalar("loss", loss)

    # Expensive logging only occasionally. Every multiple of LOG_FREQUENCY * 10
    # is also a multiple of LOG_FREQUENCY, so this check needs no nesting.
    if step % (LOG_FREQUENCY * 10) == 0:
        log_weight_histograms(model, step)
        log_gradient_norms(model, step)
Log Rotation and Cleanup
import glob
import os
import time
def cleanup_old_logs(log_dir, max_age_days=30):
    """Remove TensorBoard event files older than ``max_age_days``.

    Searches ``log_dir`` recursively for files named ``events.out.*`` and
    deletes those whose modification time lies more than ``max_age_days`` days
    in the past. Directories are never removed, even when they end up empty.

    Args:
        log_dir: Root directory to scan (``str`` or path-like). It is matched
            literally, so glob metacharacters such as ``[`` in the path do not
            change which files are found.
        max_age_days: Age threshold in days.
    """
    cutoff = time.time() - (max_age_days * 86400)  # 86400 seconds per day
    # Escape only the caller-supplied root; the suffix must remain a pattern.
    pattern = os.path.join(glob.escape(os.fspath(log_dir)), "**", "events.out.*")
    for event_file in glob.glob(pattern, recursive=True):
        # A directory can match the pattern too; os.remove only handles files.
        if os.path.isfile(event_file) and os.path.getmtime(event_file) < cutoff:
            os.remove(event_file)
Structured Experiment Organization
logs/
├── 2026-03-28_resnet50_lr0001/
│ ├── train/
│ ├── validation/
│ └── hparams/
├── 2026-03-28_resnet50_lr001/
│ ├── train/
│ ├── validation/
│ └── hparams/
└── 2026-03-29_efficientnet_lr0001/
├── train/
├── validation/
└── hparams/
Name runs with date, model, and key hyperparameters for easy filtering in the TensorBoard UI.
The one thing to remember: TensorBoard is most powerful when you go beyond basic loss curves — use the profiler to eliminate training bottlenecks, HParams for systematic tuning, and structured logging with periodic cleanup to keep experiments manageable.
See Also
- Python PyTorch Lightning Training — How PyTorch Lightning removes the boring parts of training AI models so researchers can focus on ideas instead of boilerplate.
- Python TensorFlow Custom Layers — How to teach TensorFlow new tricks by building your own custom layers, explained with a cookie cutter analogy.
- Python TensorFlow Data Pipelines — How TensorFlow feeds data to your model without wasting time, explained like a restaurant kitchen that never stops cooking.
- Python TensorFlow Keras API — Why Keras is TensorFlow's friendly front door, and how it turns complex math into simple building blocks anyone can stack together.
- Python TensorFlow Model Optimization — Why making a trained model smaller and faster matters, explained like packing a suitcase for a trip.