Course Recommendation in Python — Deep Dive
Educational recommendation systems must satisfy constraints that entertainment recommenders ignore: prerequisite ordering, skill coverage, and learning efficiency. This guide builds a complete system in Python.
Matrix Factorization for Course Embeddings
Learn latent representations of students and courses from enrollment and rating data:
import numpy as np
from dataclasses import dataclass
@dataclass
class MFModel:
    """Learned parameters of a biased matrix-factorization model.

    A prediction for a (user, item) pair is the global mean plus the user
    bias, the item bias, and the dot product of the two latent vectors.
    """

    # Latent user vectors, one row per user: (num_users, k).
    user_factors: np.ndarray
    # Latent item (course) vectors, one row per item: (num_items, k).
    item_factors: np.ndarray
    # Per-user additive offset, indexed by user id.
    user_bias: np.ndarray
    # Per-item additive offset, indexed by item id.
    item_bias: np.ndarray
    # Mean of all training ratings; the baseline of every prediction.
    global_mean: float
def train_matrix_factorization(
    interactions: list[tuple[int, int, float]],
    num_users: int, num_items: int,
    k: int = 50, epochs: int = 20,
    lr: float = 0.01, reg: float = 0.02
) -> MFModel:
    """Train matrix factorization with biases using SGD.

    Args:
        interactions: ``(user_id, item_id, rating)`` triples. The list is
            not modified; shuffling happens on an internal copy.
        num_users: Number of rows to allocate for user parameters.
        num_items: Number of rows to allocate for item parameters.
        k: Dimensionality of the latent factors.
        epochs: Number of full passes over the interactions.
        lr: SGD learning rate.
        reg: L2 regularization strength applied to factors and biases.

    Returns:
        An ``MFModel`` holding the learned factors, biases and global mean.

    Raises:
        ValueError: If ``interactions`` is empty (there is no rating mean
            to centre on and no RMSE to compute).
    """
    if len(interactions) == 0:
        raise ValueError(
            "interactions must contain at least one (user_id, item_id, rating) triple"
        )

    rng = np.random.default_rng(42)  # fixed seed keeps training reproducible
    user_factors = rng.normal(0, 0.1, (num_users, k))
    item_factors = rng.normal(0, 0.1, (num_items, k))
    user_bias = np.zeros(num_users)
    item_bias = np.zeros(num_items)
    global_mean = float(np.mean([rating for _, _, rating in interactions]))

    # Shuffle a private copy so the caller's list keeps its original order.
    samples = list(interactions)

    for epoch in range(epochs):
        rng.shuffle(samples)
        total_loss = 0.0

        for user_id, item_id, rating in samples:
            pred = (global_mean + user_bias[user_id] + item_bias[item_id]
                    + user_factors[user_id] @ item_factors[item_id])
            error = rating - pred
            total_loss += error ** 2

            # Both factor gradients must be evaluated at the pre-update
            # values, so snapshot the user vector before overwriting it.
            user_vec = user_factors[user_id].copy()
            item_vec = item_factors[item_id]
            user_factors[user_id] += lr * (error * item_vec - reg * user_vec)
            item_factors[item_id] += lr * (error * user_vec - reg * item_vec)

            user_bias[user_id] += lr * (error - reg * user_bias[user_id])
            item_bias[item_id] += lr * (error - reg * item_bias[item_id])

        rmse = np.sqrt(total_loss / len(samples))
        if epoch % 5 == 0:
            print(f"Epoch {epoch}: RMSE = {rmse:.4f}")

    return MFModel(user_factors, item_factors, user_bias, item_bias, global_mean)
def predict_rating(model: MFModel, user_id: int, item_id: int) -> float:
    """Predict a user's rating for a course.

    The estimate is the model's global mean, shifted by the user and item
    biases, plus the dot product of the user's and the item's latent vectors.
    """
    baseline = model.global_mean + model.user_bias[user_id] + model.item_bias[item_id]
    affinity = model.user_factors[user_id] @ model.item_factors[item_id]
    return baseline + affinity
Content-Based Similarity with Course Metadata
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
class CourseContentModel:
    """Content-based course index built from sentence embeddings.

    Each course's title, description and tags are embedded as one text;
    similarity between two courses is the cosine of their embeddings.
    """

    def __init__(self):
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.course_embeddings = {}  # course id -> embedding vector
        self.courses = {}            # course id -> original metadata dict

    def index_courses(self, courses: list[dict]):
        """Build content-based index from course metadata.

        Each course dict must provide ``id``, ``title`` and ``description``;
        ``tags`` is optional. Calling this again adds to (or overwrites
        entries in) the existing index.
        """
        if not courses:
            # Nothing to embed; skip the encoder call entirely.
            return

        for course in courses:
            self.courses[course["id"]] = course

        # Combine title, description, and tags into a single text
        texts = [
            f"{c['title']}. {c['description']}. Topics: {', '.join(c.get('tags', []))}"
            for c in courses
        ]
        embeddings = self.embedder.encode(texts, show_progress_bar=True)

        for course, emb in zip(courses, embeddings):
            self.course_embeddings[course["id"]] = emb

    def find_similar(self, course_id: str, top_k: int = 10) -> list[dict]:
        """Find courses most similar to a given course.

        Returns up to ``top_k`` dicts ``{"course_id", "similarity"}`` in
        descending similarity order, excluding ``course_id`` itself. An
        unknown ``course_id`` or a non-positive ``top_k`` yields ``[]``.
        """
        target_emb = self.course_embeddings.get(course_id)
        if target_emb is None:
            return []

        # The target's norm is the same for every comparison; compute once.
        target_norm = np.linalg.norm(target_emb)

        scores = []
        for cid, emb in self.course_embeddings.items():
            if cid == course_id:
                continue
            denom = target_norm * np.linalg.norm(emb)
            # A zero-length embedding has no direction; score it 0.0 rather
            # than dividing by zero and feeding NaN into the sort.
            sim = float(np.dot(target_emb, emb) / denom) if denom > 0 else 0.0
            scores.append((cid, sim))

        scores.sort(key=lambda pair: pair[1], reverse=True)
        return [{"course_id": cid, "similarity": sim}
                for cid, sim in scores[:max(top_k, 0)]]
Prerequisite Graph and Topological Ordering
from collections import defaultdict, deque
class PrerequisiteGraph:
    """Directed graph of course prerequisites plus per-course skill sets."""

    def __init__(self):
        self.prerequisites = defaultdict(set)  # course -> set of prereq courses
        self.dependents = defaultdict(set)     # course -> courses that depend on it
        self.course_skills = {}                # course -> {"teaches": set, "requires": set}

    def add_course(self, course_id: str, teaches: list[str], requires: list[str]):
        """Register a course with its skill relationships."""
        self.course_skills[course_id] = {"teaches": set(teaches), "requires": set(requires)}

    def add_prerequisite(self, course_id: str, prereq_id: str):
        """Add a direct prerequisite relationship."""
        self.prerequisites[course_id].add(prereq_id)
        self.dependents[prereq_id].add(course_id)

    def infer_prerequisites(self):
        """Infer prerequisite relationships from skill overlaps.

        A course becomes a prerequisite of every *other* course that
        requires at least one skill it teaches.
        """
        # Index teachers by skill once instead of comparing every course pair.
        teachers_by_skill = defaultdict(set)
        for course_id, skills in self.course_skills.items():
            for skill in skills["teaches"]:
                teachers_by_skill[skill].add(course_id)

        for course_id, skills in self.course_skills.items():
            for skill in skills["requires"]:
                for teacher_id in teachers_by_skill.get(skill, ()):
                    if teacher_id != course_id:
                        self.add_prerequisite(course_id, teacher_id)

    def is_ready(self, course_id: str, completed: set[str]) -> bool:
        """Check if a student has completed all prerequisites for a course."""
        # .get() keeps this a pure read: indexing the defaultdict would
        # insert an empty entry for every course that is merely queried.
        return self.prerequisites.get(course_id, set()).issubset(completed)

    def available_courses(self, completed: set[str], all_courses: set[str]) -> list[str]:
        """List courses the student is ready to take, sorted by course id."""
        remaining = all_courses - completed
        # Sorted so the result does not depend on set iteration order.
        return sorted(c for c in remaining if self.is_ready(c, completed))

    def learning_path(self, start_skills: set[str], target_skills: set[str],
                      max_courses: int = 20) -> list[str]:
        """Order the courses that teach the learner's missing target skills.

        Selects every registered course that teaches at least one skill in
        ``target_skills - start_skills``, orders them so prerequisites come
        first, and returns at most ``max_courses`` of them. Prerequisites
        that do not themselves teach a missing skill are not added.
        """
        missing_skills = target_skills - start_skills
        if not missing_skills:
            return []

        # Find courses that teach missing skills
        relevant_courses = [
            cid for cid, skills in self.course_skills.items()
            if skills["teaches"] & missing_skills
        ]

        # Prerequisites sort first, so truncating keeps a valid prefix.
        ordered = self._topological_sort(relevant_courses)
        return ordered[:max(max_courses, 0)]

    def _topological_sort(self, course_ids: list[str]) -> list[str]:
        """Sort courses respecting prerequisite order (Kahn's algorithm).

        Only prerequisite edges between courses in ``course_ids`` are
        considered. Courses on a prerequisite cycle never reach in-degree
        zero and are therefore omitted from the result.
        """
        # De-duplicate while keeping first-seen order so no course is
        # emitted twice.
        ordered_ids = list(dict.fromkeys(course_ids))
        course_set = set(ordered_ids)

        in_degree = {
            c: sum(1 for prereq in self.prerequisites.get(c, ()) if prereq in course_set)
            for c in ordered_ids
        }

        queue = deque(c for c in ordered_ids if in_degree[c] == 0)
        result = []
        while queue:
            course = queue.popleft()
            result.append(course)
            # Sorted for a deterministic order among simultaneously
            # unlocked courses.
            for dependent in sorted(self.dependents.get(course, ())):
                if dependent in course_set:
                    in_degree[dependent] -= 1
                    if in_degree[dependent] == 0:
                        queue.append(dependent)

        return result
Knowledge Gap Analysis
class KnowledgeGapAnalyzer:
    """Compare a learner's skill levels with a goal profile."""

    # A shortfall must exceed this to be reported as a gap.
    GAP_THRESHOLD = 0.1
    # The first N skills listed for each taxonomy area count as foundational.
    FOUNDATIONAL_COUNT = 2
    # Priority multiplier applied to foundational skills.
    FOUNDATIONAL_BOOST = 1.5
    # Rough mastery gained from completing one course.
    MASTERY_PER_COURSE = 0.3

    def __init__(self, skill_taxonomy: dict[str, list[str]]):
        """
        skill_taxonomy: maps skill areas to specific skills
        e.g., {"data_science": ["pandas", "numpy", "sklearn", "statistics"]}
        """
        self.taxonomy = skill_taxonomy

    def analyze(self, learner_skills: dict[str, float],
                goal_profile: dict[str, float]) -> dict:
        """
        Compare learner's current skills against goal requirements.
        Skills are 0-1 mastery levels.

        Returns a dict with:
          - "gaps": skill -> {"current", "required", "gap", "priority"},
            ordered by descending priority;
          - "strengths": skill -> current level, for skills at or above the
            required level;
          - "readiness_score": fraction of goal skills that are strengths;
          - "estimated_courses_needed": rough course count to close the gaps.

        A skill short by ``GAP_THRESHOLD`` or less is reported as neither a
        gap nor a strength.
        """
        gaps = {}
        strengths = {}

        for skill, required_level in goal_profile.items():
            current_level = learner_skills.get(skill, 0.0)
            delta = required_level - current_level

            if delta > self.GAP_THRESHOLD:
                gaps[skill] = {
                    "current": round(current_level, 2),
                    "required": round(required_level, 2),
                    "gap": round(delta, 2),
                    "priority": self._priority_score(skill, delta),
                }
            elif current_level >= required_level:
                strengths[skill] = round(current_level, 2)

        # Sort gaps by priority
        sorted_gaps = dict(
            sorted(gaps.items(), key=lambda item: item[1]["priority"], reverse=True)
        )

        return {
            "gaps": sorted_gaps,
            "strengths": strengths,
            "readiness_score": round(len(strengths) / max(len(goal_profile), 1), 2),
            "estimated_courses_needed": self._estimate_courses(sorted_gaps),
        }

    def _priority_score(self, skill: str, gap: float) -> float:
        """Higher priority for foundational skills with large gaps."""
        is_foundational = any(
            skill in skills[:self.FOUNDATIONAL_COUNT]
            for skills in self.taxonomy.values()
        )
        base = gap * 10
        return base * self.FOUNDATIONAL_BOOST if is_foundational else base

    def _estimate_courses(self, gaps: dict) -> int:
        """Rough estimate of courses needed to fill gaps.

        Returns 0 when there are no gaps; otherwise at least 1.
        """
        if not gaps:
            # No shortfall to close, so no course is needed.
            return 0
        total_gap = sum(g["gap"] for g in gaps.values())
        return max(1, round(total_gap / self.MASTERY_PER_COURSE))
Complete Recommendation Engine
class CourseRecommender:
    """Multi-stage course recommender.

    Filters to prerequisite-satisfied courses, scores each one as a weighted
    blend of collaborative-filtering, content-similarity and goal-alignment
    signals, then re-ranks the top results for topic diversity.
    """

    # Blend weights for the composite score.
    CF_WEIGHT = 0.35
    CONTENT_WEIGHT = 0.25
    GOAL_WEIGHT = 0.40
    # Fallback used when a signal cannot be computed.
    NEUTRAL_SCORE = 0.5
    # Content score for a course not similar to any sampled completed course.
    DEFAULT_CONTENT_SCORE = 0.3
    # MMR trade-off between relevance and redundancy.
    MMR_RELEVANCE_WEIGHT = 0.7
    MMR_REDUNDANCY_WEIGHT = 0.3

    def __init__(self, mf_model: MFModel, content_model: CourseContentModel,
                 prereq_graph: PrerequisiteGraph, gap_analyzer: KnowledgeGapAnalyzer,
                 item_index: dict[str, int] | None = None):
        """
        item_index: optional mapping from course id to the row of that course
        in ``mf_model.item_factors``. Supply it whenever the model was trained
        with explicit item ids; without it the row is derived from
        ``hash(course_id)``, which Python salts per process for strings.
        """
        self.mf = mf_model
        self.content = content_model
        self.graph = prereq_graph
        self.gap = gap_analyzer
        self.item_index = item_index

    def recommend(self, user_id: int, completed_courses: set[str],
                  learner_skills: dict[str, float],
                  goal_profile: dict[str, float] | None = None,
                  top_k: int = 10) -> list[dict]:
        """Generate ranked course recommendations.

        Returns up to ``top_k`` dicts, each holding ``course_id``, the three
        component scores and ``final_score``. Returns ``[]`` when no course
        has its prerequisites satisfied or when ``top_k`` is not positive.
        """
        # Stage 1: Filter to prerequisite-satisfied courses
        all_courses = set(self.graph.course_skills.keys())
        available = self.graph.available_courses(completed_courses, all_courses)
        if not available:
            return []

        # Stage 2: Score each available course
        gap_analysis = (self.gap.analyze(learner_skills, goal_profile)
                        if goal_profile else None)
        # The similarity lookup depends only on the completed set, so build
        # it once instead of once per candidate.
        content_lookup = self._content_similarities(completed_courses)
        scored = [
            {"course_id": course_id,
             **self._score_course(user_id, course_id, completed_courses,
                                  learner_skills, gap_analysis,
                                  content_lookup=content_lookup)}
            for course_id in available
        ]

        # Stage 3: Rank and diversify
        scored.sort(key=lambda item: item["final_score"], reverse=True)
        return self._diversify(scored, top_k)

    def _item_row(self, course_id: str) -> int | None:
        """Return the item-factor row for a course, or None if unknown."""
        if self.item_index is not None:
            return self.item_index.get(course_id)
        num_items = self.mf.item_factors.shape[0]
        if num_items == 0:
            return None
        # Legacy fallback: string hashes are salted per process, so this row
        # is only stable within a single run.
        return hash(course_id) % num_items

    def _content_similarities(self, completed: set[str]) -> dict[str, float]:
        """Map each course to its best similarity with the sampled completions."""
        best = {}
        # NOTE: a set has no order, so this slice is an arbitrary five
        # completions rather than the most recent; pass an ordered
        # collection if recency matters.
        for completed_id in list(completed)[-5:]:
            for match in self.content.find_similar(completed_id, top_k=50):
                cid = match["course_id"]
                sim = match["similarity"]
                if cid not in best or sim > best[cid]:
                    best[cid] = sim
        return best

    def _score_course(self, user_id: int, course_id: str,
                      completed: set[str], skills: dict[str, float],
                      gap_analysis: dict | None,
                      content_lookup: dict[str, float] | None = None) -> dict:
        """Compute composite recommendation score.

        ``content_lookup`` is the precomputed result of
        ``_content_similarities``; when omitted it is computed here.
        """
        scores = {}

        # Collaborative filtering score
        row = self._item_row(course_id)
        if row is None:
            scores["cf_score"] = self.NEUTRAL_SCORE
        else:
            try:
                raw = predict_rating(self.mf, user_id, row)
            except IndexError:
                # User or item row outside the trained model.
                scores["cf_score"] = self.NEUTRAL_SCORE
            else:
                # Map to 0-1 (assumes ratings on a 1-5 scale) and clamp.
                scores["cf_score"] = float(max(0.0, min(1.0, (raw - 1) / 4)))

        # Content similarity to completed courses
        if content_lookup is None:
            content_lookup = self._content_similarities(completed)
        scores["content_score"] = content_lookup.get(course_id, self.DEFAULT_CONTENT_SCORE)

        # Goal alignment score: share of the learner's gap skills this course teaches
        if gap_analysis:
            course_skills = self.graph.course_skills.get(course_id, {})
            teaches = course_skills.get("teaches", set())
            gap_skills = set(gap_analysis["gaps"].keys())
            overlap = teaches & gap_skills
            scores["goal_score"] = len(overlap) / max(len(gap_skills), 1)
        else:
            scores["goal_score"] = self.NEUTRAL_SCORE

        # Composite with weights
        scores["final_score"] = (
            self.CF_WEIGHT * scores["cf_score"] +
            self.CONTENT_WEIGHT * scores["content_score"] +
            self.GOAL_WEIGHT * scores["goal_score"]
        )
        return scores

    def _diversify(self, scored: list[dict], top_k: int) -> list[dict]:
        """Ensure topic diversity in recommendations using MMR.

        ``scored`` must already be sorted by descending ``final_score``. The
        top item is always kept; each further pick maximizes relevance minus
        a penalty for skill overlap with the items already selected.
        """
        if top_k <= 0:
            return []
        if len(scored) <= top_k:
            return scored

        selected = [scored[0]]
        candidates = scored[1:]

        while len(selected) < top_k and candidates:
            best_idx = 0
            best_mmr = float("-inf")

            for i, candidate in enumerate(candidates):
                relevance = candidate["final_score"]
                # Penalize similarity to already-selected courses
                max_sim = max(
                    self._course_similarity(candidate["course_id"], s["course_id"])
                    for s in selected
                )
                mmr = (self.MMR_RELEVANCE_WEIGHT * relevance
                       - self.MMR_REDUNDANCY_WEIGHT * max_sim)
                if mmr > best_mmr:
                    best_mmr = mmr
                    best_idx = i

            selected.append(candidates.pop(best_idx))

        return selected

    def _course_similarity(self, course_a: str, course_b: str) -> float:
        """Jaccard overlap of the skills two courses teach (0.0 if either is unknown)."""
        skills_a = self.graph.course_skills.get(course_a, {}).get("teaches", set())
        skills_b = self.graph.course_skills.get(course_b, {}).get("teaches", set())
        if not skills_a or not skills_b:
            return 0.0
        return len(skills_a & skills_b) / len(skills_a | skills_b)
Evaluation Framework
def evaluate_recommendations(recommender, test_enrollments: list[dict],
                             k: int = 10) -> dict:
    """Evaluate recommendation quality on held-out enrollment data.

    Args:
        recommender: Object exposing
            ``recommend(user_id, history, skills, top_k=k)`` that returns
            dicts with a ``course_id`` key.
        test_enrollments: One dict per user with ``user_id``,
            ``past_courses``, ``future_courses`` and optionally ``skills``.
        k: Number of recommendations requested per user.

    Returns:
        ``{"hit_rate", "precision_at_k", "total_users"}``. Both rates are
        0.0 when ``test_enrollments`` is empty; precision is 0.0 when
        ``k`` is not positive.
    """
    hits = 0
    total = 0
    precisions = []

    for test_case in test_enrollments:
        held_out = set(test_case["future_courses"])
        history = set(test_case["past_courses"])
        skills = test_case.get("skills", {})

        recs = recommender.recommend(test_case["user_id"], history, skills, top_k=k)
        matched = {r["course_id"] for r in recs} & held_out

        # Hit rate: did we recommend any course they actually took?
        hits += int(bool(matched))

        # Precision@k (guarded so k <= 0 cannot divide by zero)
        precisions.append(len(matched) / k if k > 0 else 0.0)
        total += 1

    # Averaging an empty list would yield NaN; report 0.0 instead.
    mean_precision = sum(precisions) / total if total else 0.0

    return {
        "hit_rate": round(hits / max(total, 1), 3),
        "precision_at_k": round(mean_precision, 4),
        "total_users": total,
    }
The one thing to remember: Course recommendation in Python combines collaborative signals (what similar learners took) with knowledge-aware constraints (prerequisites, skill gaps, learning goals) into a multi-stage pipeline that ranks, diversifies, and serves personalized learning paths — not just popular courses.
See Also
- Python Adaptive Learning Systems — How Python builds learning apps that adjust to each student like a personal tutor who knows exactly what you need next.
- Python Airflow — Learn Airflow as a timetable manager that makes sure data tasks run in the right order every day.
- Python Altair — Learn Altair through the idea of drawing charts by describing rules, not by hand-placing every visual element.
- Python Automated Grading — How Python grades homework and exams automatically, from simple answer keys to understanding written essays.
- Python Batch Vs Stream Processing — Batch processing is like doing laundry once a week; stream processing is like a self-cleaning shirt that cleans itself constantly.