Automated Grading in Python — Deep Dive
Automated grading systems require different architectures depending on the submission type. This guide implements code autograding with sandboxing, rubric-based short-answer scoring, and neural essay assessment.
Code Autograder with Sandboxing
Running untrusted student code requires strict isolation. Docker containers provide process-level sandboxing:
import subprocess
import tempfile
import json
from dataclasses import dataclass
from pathlib import Path
@dataclass
class TestCase:
name: str
input_data: str
expected_output: str
points: float
timeout_seconds: int = 10
@dataclass
class GradeResult:
test_name: str
passed: bool
points_earned: float
points_possible: float
actual_output: str
error: str = ""
class CodeAutograder:
def __init__(self, docker_image: str = "python:3.12-slim"):
self.docker_image = docker_image
def grade_submission(self, code: str, test_cases: list[TestCase],
language: str = "python") -> dict:
"""Grade a code submission against test cases in a sandboxed container."""
results = []
total_earned = 0.0
total_possible = 0.0
with tempfile.TemporaryDirectory() as tmpdir:
code_path = Path(tmpdir) / "solution.py"
code_path.write_text(code)
for tc in test_cases:
result = self._run_test(tmpdir, tc, language)
results.append(result)
total_earned += result.points_earned
total_possible += result.points_possible
return {
"score": total_earned,
"max_score": total_possible,
"percentage": round(total_earned / max(total_possible, 1) * 100, 1),
"results": results,
}
def _run_test(self, code_dir: str, tc: TestCase,
language: str) -> GradeResult:
"""Run a single test case in Docker."""
try:
proc = subprocess.run(
[
"docker", "run", "--rm",
"--network", "none", # No network access
"--memory", "256m", # Memory limit
"--cpus", "0.5", # CPU limit
"--pids-limit", "50", # Process limit
"--read-only", # Read-only filesystem
"--tmpfs", "/tmp:size=64m", # Writable tmp
"-v", f"{code_dir}:/submission:ro",
self.docker_image,
"python", "/submission/solution.py",
],
input=tc.input_data,
capture_output=True,
text=True,
timeout=tc.timeout_seconds,
)
actual = proc.stdout.strip()
expected = tc.expected_output.strip()
passed = actual == expected
return GradeResult(
test_name=tc.name,
passed=passed,
points_earned=tc.points if passed else 0.0,
points_possible=tc.points,
actual_output=actual,
error=proc.stderr[:500] if proc.stderr else "",
)
except subprocess.TimeoutExpired:
return GradeResult(
test_name=tc.name, passed=False,
points_earned=0.0, points_possible=tc.points,
actual_output="", error="Time limit exceeded",
)
except Exception as e:
return GradeResult(
test_name=tc.name, passed=False,
points_earned=0.0, points_possible=tc.points,
actual_output="", error=str(e)[:500],
)
Static Analysis Grading
Beyond correctness, evaluate code quality:
import ast
import sys
class CodeQualityChecker:
def __init__(self):
self.issues = []
def check(self, code: str, requirements: dict = None) -> dict:
"""Analyze code quality and style."""
requirements = requirements or {}
try:
tree = ast.parse(code)
except SyntaxError as e:
return {"score": 0, "issues": [f"Syntax error: {e}"]}
checks = {
"has_docstrings": self._check_docstrings(tree),
"function_count": self._count_functions(tree),
"max_complexity": self._estimate_complexity(tree),
"uses_required_constructs": self._check_constructs(tree, requirements),
"line_count": len(code.strip().split("\n")),
}
quality_score = self._calculate_quality_score(checks, requirements)
return {"score": quality_score, "checks": checks, "issues": self.issues}
def _check_docstrings(self, tree: ast.AST) -> bool:
"""Check if functions have docstrings."""
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
if not (node.body and isinstance(node.body[0], ast.Expr)
and isinstance(node.body[0].value, ast.Constant)
and isinstance(node.body[0].value.value, str)):
self.issues.append(f"Function '{node.name}' lacks a docstring")
return False
return True
def _count_functions(self, tree: ast.AST) -> int:
return sum(1 for node in ast.walk(tree)
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)))
def _estimate_complexity(self, tree: ast.AST) -> int:
"""Estimate cyclomatic complexity (simplified)."""
complexity = 1
for node in ast.walk(tree):
if isinstance(node, (ast.If, ast.While, ast.For, ast.ExceptHandler,
ast.With, ast.Assert, ast.BoolOp)):
complexity += 1
return complexity
def _check_constructs(self, tree: ast.AST, requirements: dict) -> bool:
"""Check if required language constructs are used."""
required = requirements.get("required_constructs", [])
found = set()
for node in ast.walk(tree):
if isinstance(node, ast.For):
found.add("for_loop")
elif isinstance(node, ast.While):
found.add("while_loop")
elif isinstance(node, ast.ListComp):
found.add("list_comprehension")
elif isinstance(node, ast.FunctionDef):
found.add("function")
elif isinstance(node, ast.ClassDef):
found.add("class")
elif isinstance(node, ast.Try):
found.add("try_except")
missing = set(required) - found
if missing:
self.issues.append(f"Missing required constructs: {missing}")
return False
return True
def _calculate_quality_score(self, checks: dict, requirements: dict) -> float:
score = 100.0
if not checks["has_docstrings"]:
score -= 15
if checks["max_complexity"] > requirements.get("max_complexity", 15):
score -= 20
if not checks["uses_required_constructs"]:
score -= 30
return max(0, score)
Rubric-Based Short Answer Grading
Score written responses against rubric criteria using semantic similarity:
from sentence_transformers import SentenceTransformer, util
import numpy as np
class RubricGrader:
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
self.model = SentenceTransformer(model_name)
def grade(self, response: str, rubric: list[dict]) -> dict:
"""
Grade a short answer against a rubric.
rubric: [{"criterion": str, "key_concepts": [str], "points": float}]
"""
response_emb = self.model.encode(response, convert_to_tensor=True)
results = []
total_earned = 0.0
total_possible = 0.0
for criterion in rubric:
concept_scores = []
for concept in criterion["key_concepts"]:
concept_emb = self.model.encode(concept, convert_to_tensor=True)
sim = float(util.cos_sim(response_emb, concept_emb)[0][0])
concept_scores.append(sim)
# Also check sentence-level matches
sentences = [s.strip() for s in response.split(".") if len(s.strip()) > 10]
if sentences:
sent_embs = self.model.encode(sentences, convert_to_tensor=True)
for concept in criterion["key_concepts"]:
c_emb = self.model.encode(concept, convert_to_tensor=True)
sims = util.cos_sim(c_emb, sent_embs)[0]
concept_scores.append(float(sims.max()))
avg_match = np.mean(concept_scores) if concept_scores else 0.0
# Map similarity to points (0.4+ starts earning points)
point_fraction = max(0, min(1, (avg_match - 0.4) / 0.4))
earned = round(criterion["points"] * point_fraction, 2)
results.append({
"criterion": criterion["criterion"],
"score": earned,
"max_score": criterion["points"],
"confidence": round(avg_match, 3),
"matched_concepts": [
c for c, s in zip(criterion["key_concepts"], concept_scores[:len(criterion["key_concepts"])])
if s > 0.5
],
})
total_earned += earned
total_possible += criterion["points"]
return {
"total_score": round(total_earned, 1),
"max_score": total_possible,
"criteria_results": results,
}
Neural Essay Scoring
For longer essays, use a fine-tuned transformer:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
class EssayScorer(nn.Module):
def __init__(self, model_name: str = "bert-base-uncased", num_traits: int = 6):
super().__init__()
self.encoder = AutoModel.from_pretrained(model_name)
self.dropout = nn.Dropout(0.1)
# Multi-trait scoring: content, organization, voice, conventions, etc.
self.scoring_heads = nn.ModuleList([
nn.Linear(768, 1) for _ in range(num_traits)
])
self.trait_names = [
"content", "organization", "word_choice",
"sentence_fluency", "conventions", "overall"
]
def forward(self, input_ids, attention_mask):
outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
pooled = outputs.last_hidden_state[:, 0, :] # CLS token
pooled = self.dropout(pooled)
scores = [head(pooled).squeeze(-1) for head in self.scoring_heads]
return torch.stack(scores, dim=1) # (batch, num_traits)
class EssayGradingPipeline:
def __init__(self, model_path: str = None):
self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
self.model = EssayScorer()
if model_path:
self.model.load_state_dict(torch.load(model_path, map_location="cpu"))
self.model.eval()
def grade(self, essay: str, score_range: tuple = (1, 6)) -> dict:
"""Grade an essay on multiple trait dimensions."""
inputs = self.tokenizer(
essay, return_tensors="pt",
max_length=512, truncation=True, padding=True
)
with torch.no_grad():
raw_scores = self.model(inputs["input_ids"], inputs["attention_mask"])
# Scale raw outputs to score range
min_score, max_score = score_range
scaled = torch.sigmoid(raw_scores) * (max_score - min_score) + min_score
scaled = scaled[0].tolist()
return {
trait: round(score, 1)
for trait, score in zip(self.model.trait_names, scaled)
}
Training uses the ASAP dataset (8 essay prompts, ~13,000 essays with human scores). Fine-tune BERT for 5 epochs with MSE loss. Evaluate using Quadratic Weighted Kappa (QWK), which measures agreement with human scores. State-of-the-art models achieve QWK > 0.8 on most ASAP prompts, comparable to human-human agreement.
Feedback Generation with LLMs
from openai import OpenAI
def generate_feedback(essay: str, rubric: list[dict],
scores: dict) -> str:
"""Generate constructive feedback using an LLM."""
client = OpenAI()
rubric_text = "\n".join(
f"- {r['criterion']} ({r['points']} pts): {r.get('description', '')}"
for r in rubric
)
prompt = f"""You are a supportive writing tutor. Provide specific, constructive feedback
on this student essay.
Rubric:
{rubric_text}
Automated scores: {json.dumps(scores, indent=2)}
Essay:
{essay}
Provide feedback in this format:
1. One specific strength (with a quote from the essay)
2. Two specific areas for improvement (with concrete suggestions)
3. One actionable next step
Keep feedback encouraging, specific, and under 200 words."""
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
temperature=0.7,
max_tokens=400,
)
return response.choices[0].message.content
Calibration and Reliability
Automated scores should be regularly calibrated against human graders. Reserve 10-15% of submissions for dual grading (both human and automated). If the automated system’s agreement with humans drops below acceptable QWK thresholds (typically 0.7), retrain or flag the system.
For high-stakes assessments, use automated grading as a second reader alongside one human grader. If the human and automated scores diverge beyond a threshold, route to a second human grader. This hybrid approach is used by ETS for portions of the GRE and TOEFL writing assessments.
The one thing to remember: Automated grading in Python ranges from deterministic test-case matching for code to probabilistic scoring for essays, and production systems combine multiple techniques with human-in-the-loop calibration to balance efficiency with fairness.
See Also
- Python Adaptive Learning Systems How Python builds learning apps that adjust to each student like a personal tutor who knows exactly what you need next.
- Python Airflow Learn Airflow as a timetable manager that makes sure data tasks run in the right order every day.
- Python Altair Learn Altair through the idea of drawing charts by describing rules, not by hand-placing every visual element.
- Python Batch Vs Stream Processing Batch processing is like doing laundry once a week; stream processing is like a self-cleaning shirt that cleans itself constantly.
- Python Bentoml Model Serving See BentoML as a packaging-and-delivery system that turns your Python model into a dependable service others can call.