Community Detection with Python — Deep Dive

Implement Louvain, Leiden, and overlapping community detection in Python with NetworkX, cdlib, and igraph — including evaluation, visualization, and scalability patterns.

Community detection with NetworkX

NetworkX provides several built-in community detection methods in networkx.algorithms.community:

import networkx as nx
from networkx.algorithms.community import (
    greedy_modularity_communities,
    louvain_communities,
    label_propagation_communities,
    girvan_newman,
)

# Generate a test graph with known community structure
G = nx.LFR_benchmark_graph(
    n=1000, tau1=3, tau2=1.5, mu=0.1,
    average_degree=10, min_community=20, seed=42,
)

# Louvain (NetworkX 3.x+)
communities_louvain = louvain_communities(G, seed=42)
print(f"Louvain: {len(communities_louvain)} communities")

# Greedy modularity
communities_greedy = greedy_modularity_communities(G)
print(f"Greedy: {len(communities_greedy)} communities")

# Label propagation
communities_lpa = list(label_propagation_communities(G))
print(f"LPA: {len(communities_lpa)} communities")

Modularity computation

from networkx.algorithms.community.quality import modularity

Q = modularity(G, communities_louvain)
print(f"Modularity: {Q:.4f}")

The Leiden algorithm with leidenalg

The Leiden algorithm requires the leidenalg package, which wraps a C++ implementation via igraph:

import igraph as ig
import leidenalg

# Convert from NetworkX to igraph
G_ig = ig.Graph.from_networkx(G)

# Run Leiden with modularity optimization
partition = leidenalg.find_partition(
    G_ig,
    leidenalg.ModularityVertexPartition,
    seed=42,
)

print(f"Leiden: {len(partition)} communities")
print(f"Modularity: {partition.modularity:.4f}")

# Access community membership
membership = partition.membership  # list: node_i -> community_id

Resolution parameter

The resolution parameter controls community granularity. Higher values find smaller communities:

# Fine-grained communities
partition_fine = leidenalg.find_partition(
    G_ig,
    leidenalg.RBConfigurationVertexPartition,
    resolution_parameter=1.5,  # default is 1.0
    seed=42,
)

# Coarse communities
partition_coarse = leidenalg.find_partition(
    G_ig,
    leidenalg.RBConfigurationVertexPartition,
    resolution_parameter=0.5,
    seed=42,
)

print(f"Fine: {len(partition_fine)} communities")
print(f"Coarse: {len(partition_coarse)} communities")

Multi-resolution analysis

Sweep the resolution parameter to understand the hierarchical community structure:

import numpy as np

resolutions = np.logspace(-1, 1, 20)
results = []

for res in resolutions:
    part = leidenalg.find_partition(
        G_ig,
        leidenalg.RBConfigurationVertexPartition,
        resolution_parameter=res,
        seed=42,
    )
    results.append({
        "resolution": res,
        "n_communities": len(part),
        "modularity": part.modularity,
        "sizes": sorted([len(c) for c in part], reverse=True),
    })

for r in results:
    print(f"res={r['resolution']:.2f}: {r['n_communities']} communities, "
          f"Q={r['modularity']:.4f}, largest={r['sizes'][0]}")

Overlapping community detection with cdlib

Real networks often have nodes belonging to multiple communities. The cdlib library provides overlapping detection methods:

from cdlib import algorithms

# DEMON: Democratic Estimate of the Modular Organization of a Network
demon_comms = algorithms.demon(G, min_com_size=10, epsilon=0.25)
print(f"DEMON: {len(demon_comms.communities)} communities")

# Check overlap: nodes in multiple communities
from collections import Counter
membership_count = Counter()
for comm in demon_comms.communities:
    for node in comm:
        membership_count[node] += 1

overlapping_nodes = sum(1 for count in membership_count.values() if count > 1)
print(f"Nodes in multiple communities: {overlapping_nodes}/{G.number_of_nodes()}")

# Angel algorithm
angel_comms = algorithms.angel(G, threshold=0.5, min_community_size=10)
print(f"ANGEL: {len(angel_comms.communities)} communities")

Evaluating overlapping communities

from cdlib import evaluation

# Internal quality metrics (no ground truth needed)
print(f"Average internal density: {evaluation.avg_internal_degree(G, demon_comms):.4f}")
print(f"Conductance: {evaluation.conductance(G, demon_comms):.4f}")

# With ground truth
ground_truth_comms = algorithms.louvain(G)  # pretend this is ground truth
nmi = evaluation.normalized_mutual_information(demon_comms, ground_truth_comms)
print(f"NMI: {nmi.score:.4f}")

Weighted and directed community detection

Weighted graphs

Most algorithms accept edge weights. Edges with higher weights are treated as stronger connections:

# Add weights to edges
for u, v in G.edges():
    G[u][v]["weight"] = np.random.uniform(0.1, 1.0)

# Louvain with weights
weighted_comms = louvain_communities(G, weight="weight", seed=42)

# Leiden with weights
G_ig_w = ig.Graph.from_networkx(G)
partition_w = leidenalg.find_partition(
    G_ig_w,
    leidenalg.ModularityVertexPartition,
    weights="weight",
    seed=42,
)

Directed graphs

For directed networks (web links, citation networks), use algorithms that respect edge direction:

DG = nx.scale_free_graph(1000, seed=42)
DG_ig = ig.Graph.from_networkx(DG)

# Leiden handles directed graphs
partition_dir = leidenalg.find_partition(
    DG_ig,
    leidenalg.ModularityVertexPartition,
    seed=42,
)

Visualization

Community-colored network plots

import matplotlib.pyplot as plt
import matplotlib.cm as cm

def plot_communities(G, communities, title="Communities"):
    pos = nx.spring_layout(G, seed=42, k=0.15)

    # Assign colors to communities
    node_colors = {}
    cmap = cm.get_cmap("tab20", len(communities))
    for i, comm in enumerate(communities):
        for node in comm:
            node_colors[node] = cmap(i)

    colors = [node_colors.get(n, (0.8, 0.8, 0.8, 1.0)) for n in G.nodes()]

    plt.figure(figsize=(12, 8))
    nx.draw_networkx_edges(G, pos, alpha=0.1, width=0.3)
    nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=20)
    plt.title(f"{title} ({len(communities)} communities)")
    plt.axis("off")
    plt.tight_layout()
    plt.savefig("communities.png", dpi=200)
    plt.close()

plot_communities(G, communities_louvain, "Louvain Communities")

Alluvial diagrams for comparing partitions

When comparing results from different algorithms or resolutions, track how nodes move between communities across methods.

Scalability

Performance comparison

Algorithm	Time complexity	10K nodes	1M nodes	100M nodes
Louvain	O(n log n)	<1s	~10s	~5 min
Leiden	O(n log n)	<1s	~15s	~8 min
Label Prop	O(m)	<1s	~5s	~2 min
Girvan-Newman	O(m²n)	~30s	infeasible	infeasible
Infomap	O(m)	<1s	~8s	~3 min

Distributed community detection

For graphs exceeding single-machine memory, use GraphX (Spark) or cuGraph (GPU):

import cugraph
import cudf

# Load edge list into GPU memory
edges_df = cudf.read_csv("edges.csv", names=["src", "dst", "weight"])
G_cu = cugraph.Graph()
G_cu.from_cudf_edgelist(edges_df, source="src", destination="dst", edge_attr="weight")

# Louvain on GPU
parts, modularity = cugraph.louvain(G_cu)
print(f"GPU Louvain: Q={modularity:.4f}, communities={parts['partition'].nunique()}")

cuGraph’s Louvain handles billion-edge graphs in minutes on a single GPU.

Statistical testing

Verify that detected communities are statistically significant:

def test_community_significance(G, communities, n_random=100):
    """Compare observed modularity against random graph null model."""
    observed_Q = modularity(G, communities)

    random_Qs = []
    for _ in range(n_random):
        G_rand = nx.configuration_model([d for _, d in G.degree()], seed=None)
        G_rand = nx.Graph(G_rand)  # remove multi-edges
        G_rand.remove_edges_from(nx.selfloop_edges(G_rand))
        rand_comms = louvain_communities(G_rand)
        random_Qs.append(modularity(G_rand, rand_comms))

    z_score = (observed_Q - np.mean(random_Qs)) / np.std(random_Qs)
    return {
        "observed_Q": observed_Q,
        "mean_random_Q": np.mean(random_Qs),
        "z_score": z_score,
        "significant": z_score > 2.0,
    }

A z-score above 2.0 indicates the community structure is unlikely to arise by chance.

One thing to remember: Use Leiden over Louvain for production workloads — it’s equally fast but guarantees well-connected communities. Always validate with modularity scores and, when possible, compare against a random null model to confirm the community structure is real.

pythongraph-theorydata-science