Automated Grading in Python — Deep Dive

Build code autograders with sandboxed execution, rubric-based short-answer scoring with embeddings, and essay grading pipelines in Python.

Automated grading systems require different architectures depending on the submission type. This guide implements code autograding with sandboxing, rubric-based short-answer scoring, and neural essay assessment.

Code Autograder with Sandboxing

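Running untrusted student code requires strict isolation. Docker containers provide OS-level isolation (separate namespaces and resource limits on a shared host kernel); the example below additionally disables networking, caps memory, CPU, and process count, and mounts the submission read-only: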

import json
import subprocess
import tempfile
import uuid
from dataclasses import dataclass
from pathlib import Path

@dataclass
class TestCase:
    """One graded test for a code submission.

    The autograder passes ``input_data`` as the input of the subprocess that
    runs the submission and compares the captured stdout with
    ``expected_output`` after stripping surrounding whitespace on both sides.
    """
    # Identifier copied into the matching GradeResult.
    name: str
    # Text supplied as input to the process that runs the submission.
    input_data: str
    # Output the submission must print to pass (compared after strip()).
    expected_output: str
    # Points awarded when the test passes; a failing test earns 0.0.
    points: float
    # Time limit in seconds handed to subprocess.run() for this test.
    timeout_seconds: int = 10

@dataclass
class GradeResult:
    """Outcome of running a single test case against a submission."""
    # Name of the TestCase this result belongs to.
    test_name: str
    # True when the stripped stdout equalled the stripped expected output.
    passed: bool
    # Points awarded: the test's full points on a pass, otherwise 0.0.
    points_earned: float
    # Points the test was worth.
    points_possible: float
    # Stripped stdout of the run ("" when the run timed out or raised).
    actual_output: str
    # At most 500 characters of stderr, "Time limit exceeded", or the text of
    # an exception raised while running the test; "" when there is nothing.
    error: str = ""

class CodeAutograder:
    """Grade Python submissions by running each test in a throwaway Docker container.

    Every test case gets its own container with networking disabled, capped
    memory/CPU/process count, a read-only root filesystem, a small writable
    ``/tmp``, and the submission directory mounted read-only.
    """

    def __init__(self, docker_image: str = "python:3.12-slim"):
        """Store the Docker image used to execute submissions."""
        self.docker_image = docker_image

    def grade_submission(self, code: str, test_cases: list[TestCase],
                         language: str = "python") -> dict:
        """Grade a code submission against test cases in a sandboxed container.

        Args:
            code: Source text of the submission; written to ``solution.py``
                in a temporary directory that is mounted into the container.
            test_cases: Tests to run, in order.
            language: Forwarded to ``_run_test``. Only Python execution is
                implemented, so the value does not currently change how the
                submission is run.

        Returns:
            A dict with ``score`` (points earned), ``max_score`` (points
            available), ``percentage`` (rounded to one decimal, ``0.0`` when
            no points are available) and ``results`` (one ``GradeResult``
            per test case).
        """
        results = []
        total_earned = 0.0
        total_possible = 0.0

        with tempfile.TemporaryDirectory() as tmpdir:
            code_path = Path(tmpdir) / "solution.py"
            # Python reads source files as UTF-8 by default, so do not let
            # the host's locale encoding decide how the file is written.
            code_path.write_text(code, encoding="utf-8")

            for tc in test_cases:
                result = self._run_test(tmpdir, tc, language)
                results.append(result)
                total_earned += result.points_earned
                total_possible += result.points_possible

        # Divide by the real total: clamping the divisor to 1 would
        # under-report the percentage whenever the tests are worth less
        # than one point in total.
        if total_possible > 0:
            percentage = round(total_earned / total_possible * 100, 1)
        else:
            percentage = 0.0

        return {
            "score": total_earned,
            "max_score": total_possible,
            "percentage": percentage,
            "results": results,
        }

    def _run_test(self, code_dir: str, tc: TestCase,
                  language: str) -> GradeResult:
        """Run a single test case in Docker.

        Args:
            code_dir: Host directory containing ``solution.py``.
            tc: The test case to execute.
            language: Accepted for interface compatibility; not used.

        Returns:
            A ``GradeResult``. The test passes when the stripped stdout
            equals the stripped expected output. A timeout or any other
            exception yields a failing result with the reason in ``error``.
            The time limit applies to the whole ``docker run`` invocation.
        """
        # A known name lets us remove the container if the client times out.
        container_name = f"autograde-{uuid.uuid4().hex}"
        try:
            proc = subprocess.run(
                [
                    "docker", "run", "--rm",
                    "-i",                            # Forward stdin to the container
                    "--name", container_name,
                    "--network", "none",            # No network access
                    "--memory", "256m",              # Memory limit
                    "--cpus", "0.5",                 # CPU limit
                    "--pids-limit", "50",            # Process limit
                    "--read-only",                   # Read-only filesystem
                    "--tmpfs", "/tmp:size=64m",      # Writable tmp
                    "-v", f"{code_dir}:/submission:ro",
                    self.docker_image,
                    "python", "/submission/solution.py",
                ],
                input=tc.input_data,
                capture_output=True,
                text=True,
                timeout=tc.timeout_seconds,
            )

            actual = proc.stdout.strip()
            expected = tc.expected_output.strip()
            passed = actual == expected

            return GradeResult(
                test_name=tc.name,
                passed=passed,
                points_earned=tc.points if passed else 0.0,
                points_possible=tc.points,
                actual_output=actual,
                error=proc.stderr[:500] if proc.stderr else "",
            )

        except subprocess.TimeoutExpired:
            # subprocess.run() only kills the docker client on timeout; the
            # container itself is owned by the daemon and would keep running.
            self._remove_container(container_name)
            return GradeResult(
                test_name=tc.name, passed=False,
                points_earned=0.0, points_possible=tc.points,
                actual_output="", error="Time limit exceeded",
            )
        except Exception as e:
            return GradeResult(
                test_name=tc.name, passed=False,
                points_earned=0.0, points_possible=tc.points,
                actual_output="", error=str(e)[:500],
            )

    @staticmethod
    def _remove_container(name: str) -> None:
        """Force-remove a container left behind by a timed-out run (best effort)."""
        try:
            subprocess.run(
                ["docker", "rm", "-f", name],
                capture_output=True,
                timeout=10,
            )
        except (OSError, subprocess.SubprocessError):
            # Cleanup failure must not mask the timeout result being reported.
            pass

Static Analysis Grading

Beyond correctness, evaluate code quality:

import ast
import sys

class CodeQualityChecker:
    """Static, AST-based quality checks for a Python submission.

    ``check`` parses the code, runs each individual check, and turns the
    findings into a score out of 100. Human-readable findings are collected
    in ``self.issues``, which is reset at the start of every ``check`` call.
    """

    # AST node types mapped to the construct names accepted in
    # requirements["required_constructs"].
    _CONSTRUCT_NAMES = (
        (ast.For, "for_loop"),
        (ast.While, "while_loop"),
        (ast.ListComp, "list_comprehension"),
        ((ast.FunctionDef, ast.AsyncFunctionDef), "function"),
        (ast.ClassDef, "class"),
        (ast.Try, "try_except"),
    )

    def __init__(self):
        """Start with an empty list of issues."""
        self.issues = []

    def check(self, code: str, requirements: dict = None) -> dict:
        """Analyze code quality and style.

        Args:
            code: Python source text to analyze.
            requirements: Optional settings. Recognized keys are
                ``required_constructs`` (list of construct names) and
                ``max_complexity`` (int, default 15).

        Returns:
            ``{"score", "checks", "issues"}`` on success, or
            ``{"score": 0, "issues": [...]}`` when the code does not parse.
        """
        requirements = requirements or {}
        # Use a fresh list so findings from an earlier submission never leak
        # into this result, and earlier results are not mutated.
        self.issues = []
        try:
            tree = ast.parse(code)
        except SyntaxError as e:
            return {"score": 0, "issues": [f"Syntax error: {e}"]}

        checks = {
            "has_docstrings": self._check_docstrings(tree),
            "function_count": self._count_functions(tree),
            "max_complexity": self._estimate_complexity(tree),
            "uses_required_constructs": self._check_constructs(tree, requirements),
            "line_count": len(code.strip().split("\n")),
        }

        quality_score = self._calculate_quality_score(checks, requirements)
        return {"score": quality_score, "checks": checks, "issues": self.issues}

    def _check_docstrings(self, tree: ast.AST) -> bool:
        """Return True if every function has a docstring.

        Every function without one is recorded in ``self.issues``, not just
        the first, so the student sees the complete list.
        """
        all_documented = True
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            if ast.get_docstring(node) is None:
                self.issues.append(f"Function '{node.name}' lacks a docstring")
                all_documented = False
        return all_documented

    def _count_functions(self, tree: ast.AST) -> int:
        """Count sync and async function definitions, including nested ones."""
        return sum(1 for node in ast.walk(tree)
                   if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)))

    def _estimate_complexity(self, tree: ast.AST) -> int:
        """Estimate cyclomatic complexity (simplified).

        Starts at 1 and adds 1 for every if/while/for/except/with/assert
        node and every boolean operator expression in the whole tree.
        """
        complexity = 1
        for node in ast.walk(tree):
            if isinstance(node, (ast.If, ast.While, ast.For, ast.ExceptHandler,
                                ast.With, ast.Assert, ast.BoolOp)):
                complexity += 1
        return complexity

    def _check_constructs(self, tree: ast.AST, requirements: dict) -> bool:
        """Check if required language constructs are used.

        Returns True when every name in ``requirements["required_constructs"]``
        was found; otherwise records the missing names and returns False.
        """
        required = requirements.get("required_constructs", [])
        found = set()
        for node in ast.walk(tree):
            for node_types, construct in self._CONSTRUCT_NAMES:
                if isinstance(node, node_types):
                    found.add(construct)
                    break
        missing = set(required) - found
        if missing:
            self.issues.append(f"Missing required constructs: {missing}")
            return False
        return True

    def _calculate_quality_score(self, checks: dict, requirements: dict) -> float:
        """Convert check results into a score between 0 and 100.

        Deductions: 15 for missing docstrings, 20 for exceeding the allowed
        complexity, 30 for missing required constructs.
        """
        score = 100.0
        if not checks["has_docstrings"]:
            score -= 15
        if checks["max_complexity"] > requirements.get("max_complexity", 15):
            score -= 20
        if not checks["uses_required_constructs"]:
            score -= 30
        return max(0, score)

Rubric-Based Short Answer Grading

Score written responses against rubric criteria using semantic similarity:

from sentence_transformers import SentenceTransformer, util
import numpy as np

class RubricGrader:
    """Score short answers against rubric criteria using embedding similarity."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the sentence-embedding model used for all comparisons."""
        self.model = SentenceTransformer(model_name)

    def grade(self, response: str, rubric: list[dict]) -> dict:
        """
        Grade a short answer against a rubric.
        rubric: [{"criterion": str, "key_concepts": [str], "points": float}]

        For every key concept two cosine similarities are collected: one
        against the whole response and, when the response has sentences
        longer than 10 characters, the best match among those sentences.
        The mean of all collected similarities is mapped linearly to a
        fraction of the criterion's points (0.4 or lower earns nothing,
        0.8 or higher earns everything).

        Returns a dict with ``total_score`` (rounded to one decimal),
        ``max_score`` and ``criteria_results`` (one entry per criterion with
        ``criterion``, ``score``, ``max_score``, ``confidence`` and
        ``matched_concepts``).
        """
        response_emb = self.model.encode(response, convert_to_tensor=True)

        # The sentence split and its embeddings depend only on the response,
        # so compute them once instead of once per criterion.
        sentences = [s.strip() for s in response.split(".") if len(s.strip()) > 10]
        sent_embs = (
            self.model.encode(sentences, convert_to_tensor=True)
            if sentences else None
        )

        results = []
        total_earned = 0.0
        total_possible = 0.0

        for criterion in rubric:
            concepts = criterion["key_concepts"]
            # Encode each concept once and reuse it for both comparisons.
            concept_embs = [
                self.model.encode(concept, convert_to_tensor=True)
                for concept in concepts
            ]

            # Similarity of each concept to the response as a whole.
            whole_sims = [
                float(util.cos_sim(response_emb, emb)[0][0])
                for emb in concept_embs
            ]
            # Best sentence-level match for each concept.
            sentence_sims = []
            if sent_embs is not None:
                sentence_sims = [
                    float(util.cos_sim(emb, sent_embs)[0].max())
                    for emb in concept_embs
                ]

            concept_scores = whole_sims + sentence_sims
            avg_match = np.mean(concept_scores) if concept_scores else 0.0
            # Map similarity to points (0.4+ starts earning points)
            point_fraction = max(0, min(1, (avg_match - 0.4) / 0.4))
            earned = round(criterion["points"] * point_fraction, 2)

            results.append({
                "criterion": criterion["criterion"],
                "score": earned,
                "max_score": criterion["points"],
                "confidence": round(avg_match, 3),
                # A concept counts as matched on whole-response similarity only.
                "matched_concepts": [
                    c for c, s in zip(concepts, whole_sims) if s > 0.5
                ],
            })
            total_earned += earned
            total_possible += criterion["points"]

        return {
            "total_score": round(total_earned, 1),
            "max_score": total_possible,
            "criteria_results": results,
        }

Neural Essay Scoring

For longer essays, use a fine-tuned transformer:

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class EssayScorer(nn.Module):
    """Transformer encoder with one linear regression head per essay trait."""

    def __init__(self, model_name: str = "bert-base-uncased", num_traits: int = 6):
        """Build the encoder and ``num_traits`` scoring heads.

        Args:
            model_name: Pretrained encoder passed to ``AutoModel.from_pretrained``.
            num_traits: Number of scoring heads (one output per trait).
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Size the heads from the encoder's config instead of assuming 768,
        # so encoders with a different hidden size work too.
        hidden_size = self.encoder.config.hidden_size
        # Multi-trait scoring: content, organization, voice, conventions, etc.
        self.scoring_heads = nn.ModuleList([
            nn.Linear(hidden_size, 1) for _ in range(num_traits)
        ])
        default_names = [
            "content", "organization", "word_choice",
            "sentence_fluency", "conventions", "overall"
        ]
        # Keep exactly one name per head: trim the defaults for fewer traits
        # and add generic names for any heads beyond the six defaults.
        self.trait_names = default_names[:num_traits] + [
            f"trait_{i + 1}" for i in range(len(default_names), num_traits)
        ]

    def forward(self, input_ids, attention_mask):
        """Return raw (unscaled) trait scores of shape (batch, num_traits).

        ``input_ids`` and ``attention_mask`` are forwarded unchanged to the
        encoder; presumably both are (batch, seq_len) tensors from the
        matching tokenizer (confirm against the caller).
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # CLS token
        pooled = self.dropout(pooled)
        scores = [head(pooled).squeeze(-1) for head in self.scoring_heads]
        return torch.stack(scores, dim=1)  # (batch, num_traits)

class EssayGradingPipeline:
    """Tokenize an essay, run it through an EssayScorer, and report per-trait scores."""

    def __init__(self, model_path: str = None):
        """Create the tokenizer and scorer, optionally loading saved weights.

        Args:
            model_path: Path to a saved state dict. When falsy the scorer
                keeps the weights it was constructed with.
        """
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.model = EssayScorer()
        if model_path:
            # NOTE(review): depending on the torch version, torch.load may
            # unpickle arbitrary objects; only load trusted checkpoints.
            state = torch.load(model_path, map_location="cpu")
            self.model.load_state_dict(state)
        # Inference only: switch the scorer to evaluation mode.
        self.model.eval()

    def grade(self, essay: str, score_range: tuple = (1, 6)) -> dict:
        """Score an essay on each trait exposed by the model.

        Args:
            essay: Essay text; tokenized with truncation at 512 tokens.
            score_range: ``(lowest, highest)`` score the output is scaled to.

        Returns:
            A dict mapping each trait name to its score rounded to one decimal.
        """
        encoded = self.tokenizer(
            essay,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        with torch.no_grad():
            logits = self.model(encoded["input_ids"], encoded["attention_mask"])

        # Squash to (0, 1), then stretch and shift into the requested range.
        lowest, highest = score_range
        in_range = torch.sigmoid(logits) * (highest - lowest) + lowest
        # Only one essay was encoded, so take the first row.
        trait_scores = in_range[0].tolist()

        report = {}
        for trait, value in zip(self.model.trait_names, trait_scores):
            report[trait] = round(value, 1)
        return report

Training uses the ASAP dataset (8 essay prompts, ~13,000 essays with human scores). Fine-tune BERT for 5 epochs with MSE loss. Evaluate using Quadratic Weighted Kappa (QWK), which measures agreement with human scores. State-of-the-art models achieve QWK > 0.8 on most ASAP prompts, comparable to human-human agreement.

Feedback Generation with LLMs

from openai import OpenAI

def generate_feedback(essay: str, rubric: list[dict],
                      scores: dict) -> str:
    """Ask a chat model for short, constructive feedback on an essay.

    Args:
        essay: The student's essay text.
        rubric: Rubric entries; each needs ``criterion`` and ``points`` and
            may carry an optional ``description``.
        scores: Automated scores, embedded in the prompt as indented JSON.

    Returns:
        The message content of the first completion choice.
    """
    client = OpenAI()

    # One bullet per rubric entry; a missing description renders as empty.
    rubric_lines = [
        f"- {entry['criterion']} ({entry['points']} pts): {entry.get('description', '')}"
        for entry in rubric
    ]
    rubric_text = "\n".join(rubric_lines)

    prompt = f"""You are a supportive writing tutor. Provide specific, constructive feedback
on this student essay.

Rubric:
{rubric_text}

Automated scores: {json.dumps(scores, indent=2)}

Essay:
{essay}

Provide feedback in this format:
1. One specific strength (with a quote from the essay)
2. Two specific areas for improvement (with concrete suggestions)
3. One actionable next step

Keep feedback encouraging, specific, and under 200 words."""

    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.7,
        max_tokens=400,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Calibration and Reliability

Automated scores should be regularly calibrated against human graders. Reserve 10-15% of submissions for dual grading (both human and automated). If the automated system’s agreement with humans drops below acceptable QWK thresholds (typically 0.7), retrain or flag the system.

For high-stakes assessments, use automated grading as a second reader alongside one human grader. If the human and automated scores diverge beyond a threshold, route to a second human grader. This hybrid approach is used by ETS for portions of the GRE and TOEFL writing assessments.

The one thing to remember: Automated grading in Python ranges from deterministic test-case matching for code to probabilistic scoring for essays, and production systems combine multiple techniques with human-in-the-loop calibration to balance efficiency with fairness.

Tags: python, automated-grading, education-technology, assessment