TensorFlow TensorBoard — Deep Dive

Advanced TensorBoard usage — custom plugins, hyperparameter tuning dashboards, embedding projector, profiler-driven optimization, and production deployment.

Architecture of TensorBoard

TensorBoard consists of a Python backend that reads event files and a TypeScript/Polymer frontend that renders dashboards. Event files use Protocol Buffer serialization with a sequential format:

[length (uint64)][masked crc32c of length][data][masked crc32c of data]

Each record contains a tf.Event proto with a wall-clock timestamp, step number, and one of several payload types (summary, graph_def, session_log). The backend serves these via a REST API that the frontend consumes.

Understanding this architecture matters when you need to write custom logging, parse logs programmatically, or deploy TensorBoard in production environments.

Advanced Logging Techniques

Custom Training Loop Integration

import tensorflow as tf

# Separate summary writers for training and validation logs.
train_log_dir, val_log_dir = "logs/gradient_tape/train", "logs/gradient_tape/val"
train_writer, val_writer = (
    tf.summary.create_file_writer(log_dir)
    for log_dir in (train_log_dir, val_log_dir)
)

@tf.function
def train_step(model, optimizer, x, y, step):
    """Run one optimization step and log loss, gradients, and weights.

    Args:
        model: Callable model exposing ``trainable_variables``.
        optimizer: Optimizer exposing ``apply_gradients`` and ``learning_rate``.
        x: Input batch passed to ``model``.
        y: Targets passed to the module-level ``loss_fn``.
        step: Global step used for every summary written here. Pass an
            int64 tensor (e.g. ``tf.constant(i, dtype=tf.int64)``); a plain
            Python int makes ``tf.function`` retrace for every new value.

    Returns:
        The loss computed for this batch.
    """
    with tf.GradientTape() as tape:
        predictions = model(x, training=True)
        loss = loss_fn(y, predictions)

    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    # Log detailed training information
    with train_writer.as_default(step=step):
        tf.summary.scalar("loss", loss)
        tf.summary.scalar("learning_rate", optimizer.learning_rate)

        # Gradient statistics per layer
        for var, grad in zip(variables, gradients):
            # tape.gradient yields None for variables that did not contribute
            # to the loss; histogram/norm would fail on None, so skip them.
            if grad is None:
                continue
            tf.summary.histogram(f"gradients/{var.name}", grad)
            tf.summary.scalar(f"gradient_norm/{var.name}", tf.norm(grad))

        # Weight statistics
        for var in variables:
            tf.summary.histogram(f"weights/{var.name}", var)

    return loss

Logging Images with Annotations

def log_predictions(model, test_images, test_labels, step):
    """Log a 4x4 grid of annotated predictions as one image summary.

    Relies on names defined outside this function: ``io``, ``plt``
    (matplotlib.pyplot), ``class_names`` and ``val_writer``.

    Args:
        model: Callable model returning per-class scores for ``test_images``.
        test_images: Indexable batch of images; at most the first 16 are drawn.
        test_labels: Indexable batch of integer class ids, aligned with
            ``test_images``.
        step: Step under which the image summary is written.
    """
    predictions = model(test_images, training=False)
    pred_labels = tf.argmax(predictions, axis=1)
    num_images = len(test_images)

    # Create annotated visualization
    fig, axes = plt.subplots(4, 4, figsize=(12, 12))
    try:
        for i, ax in enumerate(axes.flat):
            # With fewer than 16 samples the remaining cells stay blank
            # instead of indexing past the end of the batch.
            if i < num_images:
                predicted = int(pred_labels[i])
                actual = int(test_labels[i])
                ax.imshow(test_images[i])
                ax.set_title(
                    f"Pred: {class_names[predicted]}\n"
                    f"True: {class_names[actual]}",
                    color="green" if predicted == actual else "red",
                )
            ax.axis("off")

        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

    image = tf.image.decode_png(buf.getvalue(), channels=4)
    image = tf.expand_dims(image, 0)  # add the batch dimension

    with val_writer.as_default(step=step):
        tf.summary.image("predictions", image)

Logging Text and Markdown

# Build the experiment configuration as a Markdown table, then log it as text.
config_rows = [
    "| Parameter | Value |\n",
    "|---|---|\n",
    f"| Learning Rate | {lr} |\n",
    f"| Batch Size | {batch_size} |\n",
    f"| Architecture | {model_name} |\n",
    f"| Optimizer | {optimizer_name} |\n",
]
config_table = "".join(config_rows)

with train_writer.as_default(step=0):
    tf.summary.text("experiment_config", config_table)

HParams Dashboard — Hyperparameter Tuning

The HParams plugin lets you log and compare hyperparameter configurations:

from tensorboard.plugins.hparams import api as hp

# Metric tags shown as columns in the HParams dashboard
METRIC_ACCURACY = "epoch_accuracy"
METRIC_LOSS = "epoch_loss"

# Search space: one HParam per tunable setting
HP_LR = hp.HParam("learning_rate", hp.RealInterval(1e-5, 1e-1))
HP_DROPOUT = hp.HParam("dropout", hp.RealInterval(0.1, 0.5))
HP_OPTIMIZER = hp.HParam("optimizer", hp.Discrete(["adam", "sgd", "rmsprop"]))

tuned_hparams = [HP_LR, HP_DROPOUT, HP_OPTIMIZER]
tracked_metrics = [
    hp.Metric(METRIC_ACCURACY, display_name="Accuracy"),
    hp.Metric(METRIC_LOSS, display_name="Loss"),
]

# Write the experiment-level configuration once, at the root log directory.
config_writer = tf.summary.create_file_writer("logs/hparams_tuning")
with config_writer.as_default():
    hp.hparams_config(hparams=tuned_hparams, metrics=tracked_metrics)

def run_experiment(run_dir, hparams):
    """Train and evaluate one hyperparameter configuration and log the result.

    Args:
        run_dir: Log directory for this trial; receives the hparams record,
            the Keras TensorBoard callback output, and the final metrics.
        hparams: Dict keyed by ``HP_LR``, ``HP_DROPOUT`` and ``HP_OPTIMIZER``
            holding the values for this trial.
    """
    with tf.summary.create_file_writer(run_dir).as_default():
        # Record the hyperparameter values used in this trial.
        hp.hparams(hparams)

        model = build_model(
            dropout_rate=hparams[HP_DROPOUT],
            optimizer=hparams[HP_OPTIMIZER],
            learning_rate=hparams[HP_LR],
        )

        model.fit(
            train_data,
            epochs=20,
            validation_data=val_data,
            callbacks=[tf.keras.callbacks.TensorBoard(run_dir)],
        )

        # assumes build_model compiles with a single accuracy metric, so
        # evaluate() returns [loss, accuracy] — TODO confirm
        loss, accuracy = model.evaluate(test_data)

        # Both metrics are declared in hparams_config; write both so neither
        # column is left empty for this trial.
        tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
        tf.summary.scalar(METRIC_LOSS, loss, step=1)

# Exhaustive sweep over every (learning rate, dropout, optimizer) combination
grid = [
    (lr, dropout, optimizer)
    for lr in [1e-3, 1e-4, 1e-5]
    for dropout in [0.1, 0.2, 0.3]
    for optimizer in ["adam", "sgd"]
]

session_num = 0
for lr, dropout, optimizer in grid:
    hparams = {HP_LR: lr, HP_DROPOUT: dropout, HP_OPTIMIZER: optimizer}
    run_name = f"run-{session_num}"
    # Each trial logs to its own subdirectory of the tuning root.
    run_experiment(f"logs/hparams_tuning/{run_name}", hparams)
    session_num += 1

The HParams dashboard shows a parallel coordinates plot and a table view, making it easy to identify which hyperparameter combinations produce the best metrics.

Embedding Projector

Visualize high-dimensional embeddings in 2D/3D using PCA, t-SNE, or UMAP:

import os

import numpy as np
from tensorboard.plugins import projector

log_dir = "logs/embeddings"
# The directory must exist before np.savetxt/open can write into it.
os.makedirs(log_dir, exist_ok=True)

# Save embeddings (one tab-separated row per embedding vector)
embedding_data = model.get_layer("embedding").get_weights()[0]
np.savetxt(os.path.join(log_dir, "vectors.tsv"), embedding_data, delimiter="\t")

# Save metadata (labels for each point)
# NOTE(review): vocabulary is expected to have one entry per row of
# embedding_data, in the same order — confirm against how it was built.
with open(os.path.join(log_dir, "metadata.tsv"), "w", encoding="utf-8") as f:
    for word in vocabulary:
        f.write(f"{word}\n")

# Configure projector; both paths are given relative to log_dir
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_path = "vectors.tsv"
embedding.metadata_path = "metadata.tsv"
projector.visualize_embeddings(log_dir, config)

The projector is invaluable for NLP (visualizing word clusters), recommendation systems (understanding item similarity), and debugging classification models (checking if classes form distinct clusters).

Profiler Deep Dive

Capturing Detailed Profiles

# Capture a profile for a window of batches during training
profiler_options = dict(
    histogram_freq=1,
    profile_batch="10,20",  # Profile batches 10 through 20
)
tb_callback = tf.keras.callbacks.TensorBoard(
    "./logs/profiled_run", **profiler_options
)

model.fit(train_data, callbacks=[tb_callback], epochs=10)

Programmatic Profiling

# Profile a fixed window of training steps.
# The context manager starts the profiler on entry and stops it on exit,
# so the profile is finalized even if a training step raises.
with tf.profiler.experimental.Profile("./logs/manual_profile"):
    for step, (x, y) in enumerate(train_dataset):
        # Pass the step as an int64 tensor: a Python int argument would make
        # the @tf.function-decorated train_step retrace on every new value,
        # and that tracing overhead would dominate the captured profile.
        train_step(model, optimizer, x, y, tf.constant(step, dtype=tf.int64))
        if step == 50:  # steps 0 through 50 are profiled
            break

Reading the Profiler Output

The profiler generates several views:

Overview Page:

Step-time breakdown: compute vs. input vs. host-to-device transfer
Recommendation engine suggests specific optimizations

Trace Viewer:

Timeline of all operations on CPU and GPU
Shows overlap between data pipeline and model execution
Identifies gaps where the GPU sits idle

GPU Kernel Stats:

Time spent in each GPU kernel
Occupancy percentage (how well each kernel uses GPU cores)
Memory bandwidth utilization

Memory Profile:

Peak memory allocation
Memory timeline showing allocations and deallocations
Identifies operations that cause memory spikes

Acting on Profiler Insights

Profiler Finding	Action
"Input bound" warning	Add prefetch, parallel map, or cache to data pipeline
Low GPU occupancy	Increase batch size or use mixed precision
Host-to-device transfer bottleneck	Pin data to GPU memory, use `prefetch_to_device`
Memory near limit	Enable gradient checkpointing, reduce batch size
Many small GPU kernels	Ensure `tf.function` tracing, use XLA compilation

Remote and Team TensorBoard

TensorBoard.dev (Hosted)

# Upload experiment to TensorBoard.dev
# --logdir selects the local event files to upload; --name sets the
# experiment title.
# NOTE(review): confirm the `dev` subcommand is still available in the
# TensorBoard version you have installed before relying on this.
tensorboard dev upload --logdir ./logs --name "Experiment v2.3"

# Output: https://tensorboard.dev/experiment/xxxxxxxx/

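Shared via URL — anyone can view without installing anything. Free but public (do not upload proprietary data). Note: Google shut down the hosted TensorBoard.dev service on January 1, 2024, so this workflow only applies to historical setups; to share results today, use a self-hosted TensorBoard as described below.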

Self-Hosted for Teams

# Serve TensorBoard on a shared server
#   --logdir           read event files from the team's GCS bucket
#   --host=0.0.0.0     listen on all network interfaces, not just localhost
#   --port=6006        port the web UI is served on
#   --reload_interval  how often (in seconds) the backend re-scans the logdir
tensorboard --logdir=gs://team-bucket/experiments \
    --host=0.0.0.0 \
    --port=6006 \
    --reload_interval=30

# With authentication (via reverse proxy)
# Use nginx/Caddy with OAuth2 proxy in front of TensorBoard
# The command above adds no authentication of its own; with --host=0.0.0.0
# the UI is reachable by anyone who can reach this port.

Docker Deployment

# Base image; `latest` is a floating tag, so rebuilds may pick up a
# different TensorFlow version.
FROM tensorflow/tensorflow:latest

# Ensure the tensorboard CLI is installed in the image.
RUN pip install tensorboard

# Document the port TensorBoard listens on (its default, 6006).
EXPOSE 6006

# Serve the logs mounted at /logs on all interfaces so the published
# container port is reachable from the host.
ENTRYPOINT ["tensorboard", "--logdir=/logs", "--host=0.0.0.0"]

# docker-compose.yml
services:
  tensorboard:
    # Build the image from the Dockerfile in this directory.
    build: .
    ports:
      # host:container — publish the TensorBoard UI on the host's port 6006
      - "6006:6006"
    volumes:
      # Mount the local logs read-only; TensorBoard only needs to read them.
      - ./logs:/logs:ro
    # Restart automatically after crashes or reboots unless stopped manually.
    restart: unless-stopped

Custom Plugins

Build domain-specific dashboards:

# Register a custom scalar plugin
from tensorboard.plugins.scalar import metadata as scalar_metadata

# Summary metadata naming the plugin that owns the summary
plugin_data = tf.compat.v1.SummaryMetadata.PluginData(plugin_name="custom_plugin")
summary_metadata = tf.compat.v1.SummaryMetadata(plugin_data=plugin_data)

For complex custom plugins, implement the backend as a subclass of `TBPlugin` (from `tensorboard.plugins.base_plugin`) and pair it with a frontend (TypeScript/Polymer component). This is how teams build specialized dashboards for fairness metrics, data quality monitoring, or domain-specific visualizations.

Production Logging Patterns

Efficient Logging (Avoid Slowdowns)

# Log every N steps instead of every step
LOG_FREQUENCY = 100

for step, (x, y) in enumerate(train_dataset):
    # train_step takes the step as its fifth argument. Pass it as an int64
    # tensor so the @tf.function does not retrace for every new step value.
    # NOTE(review): the train_step defined earlier in this document also
    # writes summaries on every call; gate that logging too if this loop is
    # meant to control all summary writes.
    loss = train_step(model, optimizer, x, y, tf.constant(step, dtype=tf.int64))

    if step % LOG_FREQUENCY == 0:
        with train_writer.as_default(step=step):
            tf.summary.scalar("loss", loss)
            # Expensive logging only occasionally
            if step % (LOG_FREQUENCY * 10) == 0:
                log_weight_histograms(model, step)
                log_gradient_norms(model, step)

Log Rotation and Cleanup

import glob
import os
import time

def cleanup_old_logs(log_dir, max_age_days=30):
    """Remove TensorBoard event files older than ``max_age_days``.

    Searches ``log_dir`` and all of its subdirectories for files named
    ``events.out.*`` and deletes those whose modification time is older
    than the cutoff. Other files and all directories are left in place.

    Args:
        log_dir: Root directory to scan. Treated as a literal path, so
            characters such as ``[``, ``*`` or ``?`` in it are not
            interpreted as glob patterns.
        max_age_days: Age threshold in days; files modified more recently
            are kept.
    """
    cutoff = time.time() - (max_age_days * 86400)
    # Escape the root so run names like "run[1]" match themselves rather
    # than being read as a character class.
    pattern = os.path.join(glob.escape(log_dir), "**", "events.out.*")
    for event_file in glob.glob(pattern, recursive=True):
        try:
            # Skip directories that happen to match the pattern;
            # os.remove would raise on them.
            if not os.path.isfile(event_file):
                continue
            if os.path.getmtime(event_file) < cutoff:
                os.remove(event_file)
        except FileNotFoundError:
            # The file vanished between listing and removal (e.g. another
            # cleanup is running); there is nothing left to delete.
            continue

Structured Experiment Organization

logs/
├── 2026-03-28_resnet50_lr0001/
│   ├── train/
│   ├── validation/
│   └── hparams/
├── 2026-03-28_resnet50_lr001/
│   ├── train/
│   ├── validation/
│   └── hparams/
└── 2026-03-29_efficientnet_lr0001/
    ├── train/
    ├── validation/
    └── hparams/

Name runs with date, model, and key hyperparameters for easy filtering in the TensorBoard UI.

The one thing to remember: TensorBoard is most powerful when you go beyond basic loss curves — use the profiler to eliminate training bottlenecks, HParams for systematic tuning, and structured logging with periodic cleanup to keep experiments manageable.

Tags: python, machine-learning, tensorflow, visualization