Community Detection with Python — Deep Dive
Community detection with NetworkX
NetworkX provides several built-in community detection methods in networkx.algorithms.community:
import networkx as nx
from networkx.algorithms.community import (
greedy_modularity_communities,
louvain_communities,
label_propagation_communities,
girvan_newman,
)
# Generate a test graph with known community structure
G = nx.LFR_benchmark_graph(
n=1000, tau1=3, tau2=1.5, mu=0.1,
average_degree=10, min_community=20, seed=42,
)
# Louvain (NetworkX 3.x+)
communities_louvain = louvain_communities(G, seed=42)
print(f"Louvain: {len(communities_louvain)} communities")
# Greedy modularity
communities_greedy = greedy_modularity_communities(G)
print(f"Greedy: {len(communities_greedy)} communities")
# Label propagation
communities_lpa = list(label_propagation_communities(G))
print(f"LPA: {len(communities_lpa)} communities")
Modularity computation
from networkx.algorithms.community.quality import modularity
Q = modularity(G, communities_louvain)
print(f"Modularity: {Q:.4f}")
The Leiden algorithm with leidenalg
The Leiden algorithm requires the leidenalg package, which wraps a C++ implementation via igraph:
import igraph as ig
import leidenalg
# Convert from NetworkX to igraph
G_ig = ig.Graph.from_networkx(G)
# Run Leiden with modularity optimization
partition = leidenalg.find_partition(
G_ig,
leidenalg.ModularityVertexPartition,
seed=42,
)
print(f"Leiden: {len(partition)} communities")
print(f"Modularity: {partition.modularity:.4f}")
# Access community membership
membership = partition.membership # list: node_i -> community_id
Resolution parameter
The resolution parameter controls community granularity. Higher values find smaller communities:
# Fine-grained communities
partition_fine = leidenalg.find_partition(
G_ig,
leidenalg.RBConfigurationVertexPartition,
resolution_parameter=1.5, # default is 1.0
seed=42,
)
# Coarse communities
partition_coarse = leidenalg.find_partition(
G_ig,
leidenalg.RBConfigurationVertexPartition,
resolution_parameter=0.5,
seed=42,
)
print(f"Fine: {len(partition_fine)} communities")
print(f"Coarse: {len(partition_coarse)} communities")
Multi-resolution analysis
Sweep the resolution parameter to understand the hierarchical community structure:
import numpy as np
resolutions = np.logspace(-1, 1, 20)
results = []
for res in resolutions:
part = leidenalg.find_partition(
G_ig,
leidenalg.RBConfigurationVertexPartition,
resolution_parameter=res,
seed=42,
)
results.append({
"resolution": res,
"n_communities": len(part),
"modularity": part.modularity,
"sizes": sorted([len(c) for c in part], reverse=True),
})
for r in results:
print(f"res={r['resolution']:.2f}: {r['n_communities']} communities, "
f"Q={r['modularity']:.4f}, largest={r['sizes'][0]}")
Overlapping community detection with cdlib
Real networks often have nodes belonging to multiple communities. The cdlib library provides overlapping detection methods:
from cdlib import algorithms
# DEMON: Democratic Estimate of the Modular Organization of a Network
demon_comms = algorithms.demon(G, min_com_size=10, epsilon=0.25)
print(f"DEMON: {len(demon_comms.communities)} communities")
# Check overlap: nodes in multiple communities
from collections import Counter
membership_count = Counter()
for comm in demon_comms.communities:
for node in comm:
membership_count[node] += 1
overlapping_nodes = sum(1 for count in membership_count.values() if count > 1)
print(f"Nodes in multiple communities: {overlapping_nodes}/{G.number_of_nodes()}")
# Angel algorithm
angel_comms = algorithms.angel(G, threshold=0.5, min_community_size=10)
print(f"ANGEL: {len(angel_comms.communities)} communities")
Evaluating overlapping communities
from cdlib import evaluation
# Internal quality metrics (no ground truth needed)
print(f"Average internal density: {evaluation.avg_internal_degree(G, demon_comms):.4f}")
print(f"Conductance: {evaluation.conductance(G, demon_comms):.4f}")
# With ground truth
ground_truth_comms = algorithms.louvain(G) # pretend this is ground truth
nmi = evaluation.normalized_mutual_information(demon_comms, ground_truth_comms)
print(f"NMI: {nmi.score:.4f}")
Weighted and directed community detection
Weighted graphs
Most algorithms accept edge weights. Edges with higher weights are treated as stronger connections:
# Add weights to edges
for u, v in G.edges():
G[u][v]["weight"] = np.random.uniform(0.1, 1.0)
# Louvain with weights
weighted_comms = louvain_communities(G, weight="weight", seed=42)
# Leiden with weights
G_ig_w = ig.Graph.from_networkx(G)
partition_w = leidenalg.find_partition(
G_ig_w,
leidenalg.ModularityVertexPartition,
weights="weight",
seed=42,
)
Directed graphs
For directed networks (web links, citation networks), use algorithms that respect edge direction:
DG = nx.scale_free_graph(1000, seed=42)
DG_ig = ig.Graph.from_networkx(DG)
# Leiden handles directed graphs
partition_dir = leidenalg.find_partition(
DG_ig,
leidenalg.ModularityVertexPartition,
seed=42,
)
Visualization
Community-colored network plots
import matplotlib.pyplot as plt
import matplotlib.cm as cm
def plot_communities(G, communities, title="Communities"):
pos = nx.spring_layout(G, seed=42, k=0.15)
# Assign colors to communities
node_colors = {}
cmap = cm.get_cmap("tab20", len(communities))
for i, comm in enumerate(communities):
for node in comm:
node_colors[node] = cmap(i)
colors = [node_colors.get(n, (0.8, 0.8, 0.8, 1.0)) for n in G.nodes()]
plt.figure(figsize=(12, 8))
nx.draw_networkx_edges(G, pos, alpha=0.1, width=0.3)
nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=20)
plt.title(f"{title} ({len(communities)} communities)")
plt.axis("off")
plt.tight_layout()
plt.savefig("communities.png", dpi=200)
plt.close()
plot_communities(G, communities_louvain, "Louvain Communities")
Alluvial diagrams for comparing partitions
When comparing results from different algorithms or resolutions, track how nodes move between communities across methods.
Scalability
Performance comparison
| Algorithm | Time complexity | 10K nodes | 1M nodes | 100M nodes |
|---|---|---|---|---|
| Louvain | O(n log n) | <1s | ~10s | ~5 min |
| Leiden | O(n log n) | <1s | ~15s | ~8 min |
| Label Prop | O(m) | <1s | ~5s | ~2 min |
| Girvan-Newman | O(m²n) | ~30s | infeasible | infeasible |
| Infomap | O(m) | <1s | ~8s | ~3 min |
Distributed community detection
For graphs exceeding single-machine memory, use GraphX (Spark) or cuGraph (GPU):
import cugraph
import cudf
# Load edge list into GPU memory
edges_df = cudf.read_csv("edges.csv", names=["src", "dst", "weight"])
G_cu = cugraph.Graph()
G_cu.from_cudf_edgelist(edges_df, source="src", destination="dst", edge_attr="weight")
# Louvain on GPU
parts, modularity = cugraph.louvain(G_cu)
print(f"GPU Louvain: Q={modularity:.4f}, communities={parts['partition'].nunique()}")
cuGraph’s Louvain handles billion-edge graphs in minutes on a single GPU.
Statistical testing
Verify that detected communities are statistically significant:
def test_community_significance(G, communities, n_random=100):
"""Compare observed modularity against random graph null model."""
observed_Q = modularity(G, communities)
random_Qs = []
for _ in range(n_random):
G_rand = nx.configuration_model([d for _, d in G.degree()], seed=None)
G_rand = nx.Graph(G_rand) # remove multi-edges
G_rand.remove_edges_from(nx.selfloop_edges(G_rand))
rand_comms = louvain_communities(G_rand)
random_Qs.append(modularity(G_rand, rand_comms))
z_score = (observed_Q - np.mean(random_Qs)) / np.std(random_Qs)
return {
"observed_Q": observed_Q,
"mean_random_Q": np.mean(random_Qs),
"z_score": z_score,
"significant": z_score > 2.0,
}
A z-score above 2.0 indicates the community structure is unlikely to arise by chance.
One thing to remember: Use Leiden over Louvain for production workloads — it’s equally fast but guarantees well-connected communities. Always validate with modularity scores and, when possible, compare against a random null model to confirm the community structure is real.
See Also
- Python Graph Embeddings How Python turns tangled webs of connections into neat lists of numbers that computers can actually work with.
- Python Graph Neural Networks How Python teaches computers to learn from connections — not just data points — by combining neural networks with graph structures.
- Python Link Prediction How Python guesses which connections are missing from a network — predicting future friendships, recommendations, and undiscovered relationships.
- Python Networkx Graph Analysis How Python maps connections between things — friends, roads, websites — and finds hidden patterns in those connections.
- Python Arima Forecasting How ARIMA models use patterns in past numbers to predict the future, explained like a bedtime story.