Course Recommendation in Python — Deep Dive

Build a course recommendation engine in Python with matrix factorization, prerequisite-aware ranking, knowledge gap analysis, and learning path optimization.

Educational recommendation systems must satisfy constraints that entertainment recommenders ignore: prerequisite ordering, skill coverage, and learning efficiency. This guide builds a complete system in Python.

Matrix Factorization for Course Embeddings

Learn latent representations of students and courses from enrollment and rating data:

import numpy as np
from dataclasses import dataclass

@dataclass
class MFModel:
    """Learned parameters of a biased matrix-factorization model.

    A rating is predicted as ``global_mean + user_bias[u] + item_bias[i]
    + user_factors[u] @ item_factors[i]``.
    """
    user_factors: np.ndarray   # (num_users, k) latent vector per user
    item_factors: np.ndarray   # (num_items, k) latent vector per item (course)
    user_bias: np.ndarray      # per-user rating offset, indexed by user id
    item_bias: np.ndarray      # per-item rating offset, indexed by item id
    global_mean: float         # baseline rating added to every prediction

def train_matrix_factorization(
    interactions: list[tuple[int, int, float]],
    num_users: int, num_items: int,
    k: int = 50, epochs: int = 20,
    lr: float = 0.01, reg: float = 0.02
) -> MFModel:
    """Train matrix factorization with biases using SGD.

    Args:
        interactions: ``(user_id, item_id, rating)`` triples. Ids are used
            directly as row indices into the factor matrices. The caller's
            sequence is never reordered.
        num_users: Number of rows in the user factor matrix.
        num_items: Number of rows in the item factor matrix.
        k: Dimensionality of the latent factors.
        epochs: Number of full passes over the interactions.
        lr: SGD learning rate.
        reg: L2 regularization strength applied to factors and biases.

    Returns:
        The trained ``MFModel``.

    Raises:
        ValueError: If ``interactions`` is empty (there is no mean rating
            and no RMSE to compute).
    """
    # Shuffle a private copy: reordering the caller's list in place would be
    # a surprising side effect of training.
    samples = list(interactions)
    if not samples:
        raise ValueError("interactions must contain at least one rating")

    rng = np.random.default_rng(42)  # fixed seed keeps training reproducible
    user_factors = rng.normal(0, 0.1, (num_users, k))
    item_factors = rng.normal(0, 0.1, (num_items, k))
    user_bias = np.zeros(num_users)
    item_bias = np.zeros(num_items)
    global_mean = float(np.mean([rating for _, _, rating in samples]))

    for epoch in range(epochs):
        rng.shuffle(samples)
        total_loss = 0.0

        for user_id, item_id, rating in samples:
            pred = (global_mean + user_bias[user_id] + item_bias[item_id]
                    + user_factors[user_id] @ item_factors[item_id])
            error = rating - pred
            total_loss += error ** 2

            # Both factor gradients must be evaluated at the pre-step values.
            # Snapshot the user vector so the item update does not see the
            # freshly updated one.
            old_user = user_factors[user_id].copy()
            item_vec = item_factors[item_id]
            user_factors[user_id] += lr * (error * item_vec - reg * old_user)
            item_factors[item_id] += lr * (error * old_user - reg * item_vec)
            user_bias[user_id] += lr * (error - reg * user_bias[user_id])
            item_bias[item_id] += lr * (error - reg * item_bias[item_id])

        rmse = np.sqrt(total_loss / len(samples))
        if epoch % 5 == 0:
            print(f"Epoch {epoch}: RMSE = {rmse:.4f}")

    return MFModel(user_factors, item_factors, user_bias, item_bias, global_mean)

def predict_rating(model: MFModel, user_id: int, item_id: int) -> float:
    """Return the model's estimated rating of item ``item_id`` by ``user_id``.

    The estimate is the global mean plus the user and item biases, plus the
    dot product of the user's and the item's latent vectors.
    """
    baseline = (model.global_mean
                + model.user_bias[user_id]
                + model.item_bias[item_id])
    affinity = model.user_factors[user_id] @ model.item_factors[item_id]
    return baseline + affinity

Content-Based Similarity with Course Metadata

from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class CourseContentModel:
    """Content-based course index backed by sentence embeddings.

    Courses are embedded from their title, description and tags; similarity
    between two courses is the cosine of their embedding vectors.
    """

    def __init__(self):
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
        self.course_embeddings = {}   # course id -> embedding vector
        self.courses = {}             # course id -> original metadata dict

    def index_courses(self, courses: list[dict]):
        """Build content-based index from course metadata.

        Each course dict must provide ``id``, ``title`` and ``description``;
        ``tags`` is optional. Re-indexing an existing id overwrites it.
        """
        if not courses:
            return  # nothing to encode

        for course in courses:
            self.courses[course["id"]] = course

        # Combine title, description, and tags into a single text
        texts = [
            f"{c['title']}. {c['description']}. Topics: {', '.join(c.get('tags', []))}"
            for c in courses
        ]
        embeddings = self.embedder.encode(texts, show_progress_bar=True)

        for course, emb in zip(courses, embeddings):
            self.course_embeddings[course["id"]] = emb

    def find_similar(self, course_id: str, top_k: int = 10) -> list[dict]:
        """Find courses most similar to a given course.

        Args:
            course_id: Id of the reference course.
            top_k: Maximum number of results; values below 1 yield ``[]``.

        Returns:
            Up to ``top_k`` dicts ``{"course_id", "similarity"}`` sorted by
            descending cosine similarity, excluding the reference course.
            Returns ``[]`` when ``course_id`` has not been indexed.
        """
        target_emb = self.course_embeddings.get(course_id)
        if target_emb is None:
            return []

        # The target norm is the same for every comparison; compute it once.
        target_norm = float(np.linalg.norm(target_emb))
        scores = []
        for cid, emb in self.course_embeddings.items():
            if cid == course_id:
                continue
            denom = target_norm * float(np.linalg.norm(emb))
            # A zero vector has no direction: score it 0.0 instead of letting
            # 0/0 produce NaN, which would make the sort order undefined.
            sim = float(np.dot(target_emb, emb) / denom) if denom > 0 else 0.0
            scores.append((cid, sim))

        scores.sort(key=lambda pair: -pair[1])
        return [{"course_id": cid, "similarity": sim}
                for cid, sim in scores[:max(top_k, 0)]]

Prerequisite Graph and Topological Ordering

from collections import defaultdict, deque

class PrerequisiteGraph:
    """Course prerequisite graph with per-course skill metadata.

    Edges can be added explicitly with ``add_prerequisite`` or derived from
    skill overlap with ``infer_prerequisites``.
    """

    def __init__(self):
        self.prerequisites = defaultdict(set)   # course -> set of prereq courses
        self.dependents = defaultdict(set)       # course -> courses that depend on it
        self.course_skills = {}                  # course -> {"teaches": set, "requires": set}

    def add_course(self, course_id: str, teaches: list[str], requires: list[str]):
        """Register a course with its skill relationships."""
        self.course_skills[course_id] = {"teaches": set(teaches), "requires": set(requires)}

    def add_prerequisite(self, course_id: str, prereq_id: str):
        """Add a direct prerequisite relationship."""
        self.prerequisites[course_id].add(prereq_id)
        self.dependents[prereq_id].add(course_id)

    def infer_prerequisites(self):
        """Infer prerequisite relationships from skill overlaps.

        Every course that teaches a skill another course requires becomes a
        prerequisite of that course. This can create cycles when two courses
        require each other's skills.
        """
        # Index teachers by skill so each requirement is a direct lookup
        # rather than a scan over every pair of courses.
        teachers = defaultdict(set)
        for course_id, skills in self.course_skills.items():
            for skill in skills["teaches"]:
                teachers[skill].add(course_id)

        for course_id, skills in self.course_skills.items():
            for skill in skills["requires"]:
                for teacher_id in teachers.get(skill, ()):
                    if teacher_id != course_id:
                        self.add_prerequisite(course_id, teacher_id)

    def is_ready(self, course_id: str, completed: set[str]) -> bool:
        """Check if a student has completed all prerequisites for a course."""
        # .get() avoids inserting an empty entry into the defaultdict on a
        # pure read.
        return self.prerequisites.get(course_id, frozenset()).issubset(completed)

    def available_courses(self, completed: set[str], all_courses: set[str]) -> list[str]:
        """List courses the student is ready to take, sorted by course id."""
        remaining = all_courses - completed
        # Sorted so the result does not depend on set iteration order.
        return sorted(c for c in remaining if self.is_ready(c, completed))

    def learning_path(self, start_skills: set[str], target_skills: set[str],
                      max_courses: int = 20) -> list[str]:
        """Order the courses that teach the missing target skills.

        Selects every registered course that teaches at least one skill in
        ``target_skills - start_skills`` and returns them in prerequisite
        order, truncated to ``max_courses``. Prerequisite courses that teach
        none of the missing skills are not added to the path.
        """
        missing_skills = target_skills - start_skills
        if not missing_skills or max_courses <= 0:
            return []

        # Find courses that teach missing skills
        relevant_courses = [
            cid for cid, skills in self.course_skills.items()
            if skills["teaches"] & missing_skills
        ]

        # A prefix of a topological order is still prerequisite-consistent,
        # so truncating after sorting is safe.
        return self._topological_sort(relevant_courses)[:max_courses]

    def _topological_sort(self, course_ids: list[str]) -> list[str]:
        """Sort courses respecting prerequisite order (Kahn's algorithm).

        Only edges between courses in ``course_ids`` are considered. Ties
        follow the input order. Courses caught in a prerequisite cycle cannot
        be ordered; they are appended at the end in input order rather than
        dropped.
        """
        ordered_ids = list(dict.fromkeys(course_ids))  # de-duplicate, keep order
        position = {c: i for i, c in enumerate(ordered_ids)}
        in_degree = {
            c: sum(1 for prereq in self.prerequisites.get(c, ()) if prereq in position)
            for c in ordered_ids
        }

        queue = deque(c for c in ordered_ids if in_degree[c] == 0)
        result = []
        while queue:
            course = queue.popleft()
            result.append(course)
            # Visit dependents in input order so the output is deterministic.
            unlocked = sorted(
                (d for d in self.dependents.get(course, ()) if d in position),
                key=position.__getitem__,
            )
            for dependent in unlocked:
                in_degree[dependent] -= 1
                if in_degree[dependent] == 0:
                    queue.append(dependent)

        if len(result) < len(ordered_ids):
            placed = set(result)
            result.extend(c for c in ordered_ids if c not in placed)

        return result

Knowledge Gap Analysis

class KnowledgeGapAnalyzer:
    """Compare a learner's skill levels against a goal profile."""

    GAP_TOLERANCE = 0.1            # shortfalls up to this size are not reported as gaps
    FOUNDATIONAL_COUNT = 2         # leading skills of each taxonomy area count as foundational
    FOUNDATIONAL_BOOST = 1.5       # priority multiplier for foundational skills
    MASTERY_GAIN_PER_COURSE = 0.3  # rough mastery gained per course

    def __init__(self, skill_taxonomy: dict[str, list[str]]):
        """
        skill_taxonomy: maps skill areas to specific skills
        e.g., {"data_science": ["pandas", "numpy", "sklearn", "statistics"]}
        """
        self.taxonomy = skill_taxonomy

    def analyze(self, learner_skills: dict[str, float],
                goal_profile: dict[str, float]) -> dict:
        """
        Compare learner's current skills against goal requirements.
        Skills are 0-1 mastery levels; skills missing from ``learner_skills``
        count as 0.0.

        Returns a dict with:
          - "gaps": skills short by more than GAP_TOLERANCE, highest priority first
          - "strengths": skills at or above the required level
          - "readiness_score": fraction of goal skills that are strengths
          - "estimated_courses_needed": rough course count to close the gaps

        A skill short by GAP_TOLERANCE or less is reported as neither a gap
        nor a strength.
        """
        gaps = {}
        strengths = {}

        for skill, required_level in goal_profile.items():
            current_level = learner_skills.get(skill, 0.0)
            delta = required_level - current_level

            if delta > self.GAP_TOLERANCE:
                gaps[skill] = {
                    "current": round(current_level, 2),
                    "required": round(required_level, 2),
                    "gap": round(delta, 2),
                    "priority": self._priority_score(skill, delta),
                }
            elif current_level >= required_level:
                strengths[skill] = round(current_level, 2)

        # Sort gaps by priority
        sorted_gaps = dict(sorted(gaps.items(), key=lambda x: -x[1]["priority"]))

        return {
            "gaps": sorted_gaps,
            "strengths": strengths,
            "readiness_score": round(len(strengths) / max(len(goal_profile), 1), 2),
            "estimated_courses_needed": self._estimate_courses(sorted_gaps),
        }

    def _priority_score(self, skill: str, gap: float) -> float:
        """Higher priority for foundational skills with large gaps."""
        is_foundational = any(
            skill in skills[:self.FOUNDATIONAL_COUNT]
            for skills in self.taxonomy.values()
        )
        base = gap * 10
        return base * self.FOUNDATIONAL_BOOST if is_foundational else base

    def _estimate_courses(self, gaps: dict) -> int:
        """Rough estimate of courses needed to fill gaps (0 when there are none)."""
        if not gaps:
            return 0  # nothing to close, so no course is needed
        total_gap = sum(g["gap"] for g in gaps.values())
        # Any reported gap needs at least one course, however small.
        return max(1, round(total_gap / self.MASTERY_GAIN_PER_COURSE))

Complete Recommendation Engine

class CourseRecommender:
    """Multi-stage course recommender.

    Pipeline: keep only prerequisite-satisfied courses, score each with a
    weighted blend of collaborative-filtering, content-similarity and
    goal-alignment signals, then diversify the ranking with MMR.
    """

    def __init__(self, mf_model: MFModel, content_model: CourseContentModel,
                 prereq_graph: PrerequisiteGraph, gap_analyzer: KnowledgeGapAnalyzer,
                 course_index: dict[str, int] | None = None):
        """
        course_index: optional map from course id to that course's row in
        ``mf_model.item_factors``. Courses without an entry get a neutral
        collaborative-filtering score.
        """
        self.mf = mf_model
        self.content = content_model
        self.graph = prereq_graph
        self.gap = gap_analyzer
        self.course_index = dict(course_index) if course_index else {}

    def recommend(self, user_id: int, completed_courses: set[str],
                  learner_skills: dict[str, float],
                  goal_profile: dict[str, float] | None = None,
                  top_k: int = 10) -> list[dict]:
        """Generate ranked course recommendations.

        Returns up to ``top_k`` dicts holding ``course_id``, the component
        scores and ``final_score``. Returns ``[]`` when ``top_k`` is below 1
        or no course has its prerequisites satisfied.
        """
        if top_k <= 0:
            return []

        # Stage 1: Filter to prerequisite-satisfied courses
        all_courses = set(self.graph.course_skills.keys())
        available = self.graph.available_courses(completed_courses, all_courses)

        if not available:
            return []

        # Stage 2: Score each available course
        gap_analysis = (self.gap.analyze(learner_skills, goal_profile)
                        if goal_profile else None)
        scored = [
            {"course_id": course_id,
             **self._score_course(user_id, course_id, completed_courses,
                                  learner_skills, gap_analysis)}
            for course_id in available
        ]

        # Stage 3: Rank and diversify
        scored.sort(key=lambda item: -item["final_score"])
        return self._diversify(scored, top_k)

    def _score_course(self, user_id: int, course_id: str,
                      completed: set[str], skills: dict[str, float],
                      gap_analysis: dict | None) -> dict:
        """Compute composite recommendation score.

        Returns a dict with ``cf_score``, ``content_score``, ``goal_score``
        and their weighted blend ``final_score``.
        """
        scores = {}

        # Collaborative filtering score. The item row comes from the explicit
        # course_index: hashing the id would pick an arbitrary row, and str
        # hashes differ between processes.
        item_idx = self.course_index.get(course_id)
        if (item_idx is not None
                and 0 <= item_idx < self.mf.item_factors.shape[0]
                and 0 <= user_id < self.mf.user_factors.shape[0]):
            cf_score = predict_rating(self.mf, user_id, item_idx)
            # Normalize to 0-1 (presumably ratings are on a 1-5 scale; confirm)
            scores["cf_score"] = float(max(0.0, min(1.0, (cf_score - 1) / 4)))
        else:
            scores["cf_score"] = 0.5  # neutral when user or course is unknown

        # Content similarity to (up to five) completed courses. A set carries
        # no recency information, so sort it for a deterministic sample; an
        # ordered sequence keeps its own order.
        if isinstance(completed, (set, frozenset)):
            history = sorted(completed)
        else:
            history = list(completed)
        content_sims = [
            match["similarity"]
            for completed_id in history[-5:]
            for match in self.content.find_similar(completed_id, top_k=50)
            if match["course_id"] == course_id
        ]
        scores["content_score"] = max(content_sims) if content_sims else 0.3

        # Goal alignment score: share of the learner's gap skills this course teaches
        if gap_analysis:
            course_skills = self.graph.course_skills.get(course_id, {})
            teaches = course_skills.get("teaches", set())
            gap_skills = set(gap_analysis["gaps"].keys())
            overlap = teaches & gap_skills
            scores["goal_score"] = len(overlap) / max(len(gap_skills), 1)
        else:
            scores["goal_score"] = 0.5

        # Composite with weights
        scores["final_score"] = (
            0.35 * scores["cf_score"] +
            0.25 * scores["content_score"] +
            0.40 * scores["goal_score"]
        )

        return scores

    def _diversify(self, scored: list[dict], top_k: int) -> list[dict]:
        """Ensure topic diversity in recommendations using MMR.

        ``scored`` must already be sorted by descending ``final_score``. The
        top item is always kept; each further pick maximizes
        ``0.7 * relevance - 0.3 * max similarity to the picks so far``.
        """
        if top_k <= 0:
            return []
        if len(scored) <= top_k:
            return scored

        selected = [scored[0]]
        candidates = scored[1:]

        while len(selected) < top_k and candidates:
            best_idx = 0
            best_mmr = float("-inf")

            for i, candidate in enumerate(candidates):
                relevance = candidate["final_score"]
                # Penalize similarity to already-selected courses
                max_sim = max(
                    self._course_similarity(candidate["course_id"], s["course_id"])
                    for s in selected
                )
                mmr = 0.7 * relevance - 0.3 * max_sim
                if mmr > best_mmr:
                    best_mmr = mmr
                    best_idx = i

            selected.append(candidates.pop(best_idx))

        return selected

    def _course_similarity(self, course_a: str, course_b: str) -> float:
        """Jaccard similarity of the skills two courses teach (0.0 if either teaches none)."""
        skills_a = self.graph.course_skills.get(course_a, {}).get("teaches", set())
        skills_b = self.graph.course_skills.get(course_b, {}).get("teaches", set())
        if not skills_a or not skills_b:
            return 0.0
        return len(skills_a & skills_b) / len(skills_a | skills_b)

Evaluation Framework

def evaluate_recommendations(recommender, test_enrollments: list[dict],
                              k: int = 10) -> dict:
    """Evaluate recommendation quality on held-out enrollment data.

    Args:
        recommender: Object exposing
            ``recommend(user_id, completed, skills, top_k=...)`` that returns
            dicts with a ``course_id`` key.
        test_enrollments: One dict per user with ``user_id``,
            ``past_courses``, ``future_courses`` and optionally ``skills``.
        k: Number of recommendations requested per user.

    Returns:
        ``hit_rate`` (share of users with at least one held-out course
        recommended), ``precision_at_k`` (mean of hits / k) and
        ``total_users``. Both metrics are 0.0 for an empty test set.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    hits = 0
    precisions = []

    for test_case in test_enrollments:
        held_out = set(test_case["future_courses"])
        history = set(test_case["past_courses"])
        skills = test_case.get("skills", {})

        recs = recommender.recommend(test_case["user_id"], history, skills, top_k=k)
        matched = {r["course_id"] for r in recs} & held_out

        # Hit rate: did we recommend any course they actually took?
        hits += bool(matched)
        # Precision@k
        precisions.append(len(matched) / k)

    total = len(precisions)
    return {
        "hit_rate": round(hits / total, 3) if total else 0.0,
        "precision_at_k": round(sum(precisions) / total, 4) if total else 0.0,
        "total_users": total,
    }

The one thing to remember: Course recommendation in Python combines collaborative signals (what similar learners took) with knowledge-aware constraints (prerequisites, skill gaps, learning goals) into a multi-stage pipeline that ranks, diversifies, and serves personalized learning paths — not just popular courses.

Tags: python, course-recommendation, education-technology, recommender-systems