Quiz Generation in Python — Deep Dive
Automatic question generation involves answer extraction, question formulation, distractor creation, and quality filtering. This guide builds each component with production-ready Python code.
Answer Extraction with spaCy
The first step is identifying answer-worthy spans in the source text. Good answers are named entities, key noun phrases, dates, quantities, and domain-specific terms.
import spacy
# Load the spaCy English pipeline once at import time; the functions below
# reuse this shared `nlp` object instead of loading the model per call.
nlp = spacy.load("en_core_web_lg")
def extract_answer_candidates(text: str) -> list[dict]:
    """Collect spans of *text* that could serve as quiz answers.

    Two kinds of span are gathered from the parsed document:

    * named entities whose label is in a fixed allow-list (priority 1);
    * noun chunks of at least two whitespace-separated words whose
      lower-cased text has not been collected already (priority 2).

    Each candidate is a dict with the keys ``text``, ``type``,
    ``sentence`` (text of the containing sentence), ``start`` (character
    offset of the span) and ``priority``.

    Returns:
        The candidates ordered by ascending priority; candidates with
        equal priority keep document order.
    """
    answer_labels = {"PERSON", "ORG", "GPE", "DATE", "CARDINAL", "EVENT", "WORK_OF_ART"}
    parsed = nlp(text)

    # Entities are treated as the most valuable answers.
    entity_hits = [
        {
            "text": span.text,
            "type": span.label_,
            "sentence": span.sent.text,
            "start": span.start_char,
            "priority": 1,
        }
        for span in parsed.ents
        if span.label_ in answer_labels
    ]

    # Noun chunks fill in after the entities; single-word chunks and chunks
    # whose text was already collected are skipped.
    already_used = {hit["text"].lower() for hit in entity_hits}
    phrase_hits = []
    for phrase in parsed.noun_chunks:
        lowered = phrase.text.lower()
        if len(phrase.text.split()) < 2 or lowered in already_used:
            continue
        phrase_hits.append(
            {
                "text": phrase.text,
                "type": "NOUN_PHRASE",
                "sentence": phrase.sent.text,
                "start": phrase.start_char,
                "priority": 2,
            }
        )
        already_used.add(lowered)

    return sorted(entity_hits + phrase_hits, key=lambda hit: hit["priority"])
Template-Based Question Generation
For rapid, reliable generation without model dependencies:
QUESTION_TEMPLATES = {
"PERSON": [
"Who {verb_phrase}?",
"Which person {verb_phrase}?",
],
"DATE": [
"When {verb_phrase}?",
"In what year {verb_phrase}?",
],
"GPE": [
"Where {verb_phrase}?",
"In which location {verb_phrase}?",
],
"CARDINAL": [
"How many {subject} {verb_phrase}?",
],
"DEFAULT": [
"What {verb_phrase}?",
"Which {noun} {verb_phrase}?",
],
}
def generate_template_question(answer: dict) -> dict | None:
"""Generate a question by removing the answer from its sentence."""
sentence = answer["sentence"]
answer_text = answer["text"]
if answer_text not in sentence:
return None
# Create fill-in-the-blank
blank_sentence = sentence.replace(answer_text, "_______", 1)
# Create wh-question using templates
templates = QUESTION_TEMPLATES.get(answer["type"], QUESTION_TEMPLATES["DEFAULT"])
verb_phrase = sentence.replace(answer_text, "").strip().rstrip(".")
# Clean up the verb phrase
verb_phrase = verb_phrase.strip(",").strip()
if verb_phrase and verb_phrase[0].isupper():
verb_phrase = verb_phrase[0].lower() + verb_phrase[1:]
question_text = templates[0].format(
verb_phrase=verb_phrase,
subject=answer_text.lower(),
noun=answer.get("type", "thing").lower()
)
return {
"question": question_text,
"answer": answer_text,
"blank": blank_sentence,
"type": "template",
"source_sentence": sentence,
}
Neural Question Generation with T5
For higher-quality, more natural questions, use a fine-tuned T5 model:
from transformers import T5ForConditionalGeneration, T5Tokenizer
class NeuralQuestionGenerator:
    """Generate questions with a T5 model that takes a highlighted answer."""

    def __init__(self, model_name: str = "valhalla/t5-base-qg-hl"):
        """Load the tokenizer and model weights for *model_name*."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def generate(self, context: str, answer: str, num_questions: int = 3) -> list[str]:
        """Generate up to *num_questions* distinct questions for *answer*.

        Args:
            context: Passage that contains the answer.
            answer: Exact answer span; its first occurrence in *context*
                is wrapped in ``<hl>`` markers for the model.
            num_questions: Maximum number of questions to return.

        Returns:
            Decoded, non-empty, de-duplicated questions in the order the
            model returned them. An empty list when *num_questions* is not
            positive, or when *answer* is empty or absent from *context*
            (there would be nothing to highlight, so the output could not
            be tied to the answer).
        """
        if num_questions <= 0 or not answer or answer not in context:
            return []

        # The highlight tokens are space-separated from the answer, which is
        # the input format this model family documents.
        highlighted = context.replace(answer, f"<hl> {answer} <hl>", 1)
        input_text = f"generate question: {highlighted}"
        input_ids = self.tokenizer.encode(
            input_text, return_tensors="pt", max_length=512, truncation=True
        )
        # Search with twice as many beams as requested sequences so the
        # returned candidates are not all near-identical.
        outputs = self.model.generate(
            input_ids,
            max_length=64,
            num_beams=num_questions * 2,
            num_return_sequences=num_questions,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )

        questions = []
        seen = set()
        for output in outputs:
            question = self.tokenizer.decode(output, skip_special_tokens=True)
            if question.strip() and question not in seen:
                seen.add(question)
                questions.append(question)
        return questions
Semantic Distractor Generation
Good distractors are semantically related to the correct answer but clearly wrong in context:
import numpy as np
from sentence_transformers import SentenceTransformer
class DistractorGenerator:
def __init__(self):
self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
self.entity_cache = {}
def from_same_type(self, answer: str, answer_type: str,
document_entities: list[dict], n: int = 3) -> list[str]:
"""Select distractors of the same entity type from the document."""
same_type = [
e["text"] for e in document_entities
if e["type"] == answer_type and e["text"].lower() != answer.lower()
]
if len(same_type) >= n:
return same_type[:n]
return same_type
def from_embeddings(self, answer: str, candidate_pool: list[str],
n: int = 3) -> list[str]:
"""Select semantically similar but different candidates."""
if not candidate_pool:
return []
answer_emb = self.embedder.encode([answer])
pool_embs = self.embedder.encode(candidate_pool)
similarities = np.dot(pool_embs, answer_emb.T).flatten()
# Want similar but not identical — target 0.4-0.8 similarity
scores = -np.abs(similarities - 0.6) # Peak at 0.6 similarity
top_indices = np.argsort(scores)[::-1]
distractors = []
for idx in top_indices:
candidate = candidate_pool[idx]
if candidate.lower() != answer.lower() and candidate not in distractors:
distractors.append(candidate)
if len(distractors) >= n:
break
return distractors
def generate(self, answer: str, answer_type: str,
document_entities: list[dict],
fallback_pool: list[str] = None) -> list[str]:
"""Generate distractors using best available method."""
distractors = self.from_same_type(answer, answer_type, document_entities)
if len(distractors) < 3 and fallback_pool:
more = self.from_embeddings(answer, fallback_pool, n=3 - len(distractors))
distractors.extend(more)
return distractors[:3]
Quality Filtering Pipeline
Not every generated question is usable. Apply automated filters before human review:
from sentence_transformers import util
class QuestionQualityFilter:
    """Automated checks that weed out unusable generated questions."""

    def __init__(self):
        """Load the sentence-embedding model used for duplicate detection."""
        self.embedder = SentenceTransformer("all-MiniLM-L6-v2")

    @staticmethod
    def _word_set(text: str) -> set[str]:
        """Return the lower-cased words of *text* with punctuation removed."""
        # Splitting on whitespace alone leaves "radium?" and "radium." as
        # different tokens, so every non-alphanumeric character is a separator.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(cleaned.split())

    def _sentence_word_sets(self, context: str) -> list[set[str]]:
        """Return one word set per sentence that spaCy finds in *context*."""
        return [self._word_set(sent.text) for sent in nlp(context).sents]

    @staticmethod
    def _below_overlap(question_words: set[str], sentence_sets: list[set[str]],
                       max_overlap: float) -> bool:
        """Return True if no sentence shares more than *max_overlap* of the question's words."""
        denominator = max(len(question_words), 1)
        return all(
            len(question_words & words) / denominator <= max_overlap
            for words in sentence_sets
        )

    def is_answerable(self, question: str, answer: str, context: str) -> bool:
        """Check that a non-empty answer appears in the context (case-insensitive).

        *question* is not inspected; it is part of the signature only.
        """
        needle = answer.strip().lower()
        # An empty answer is a substring of everything, so reject it outright.
        return bool(needle) and needle in context.lower()

    def is_not_trivial(self, question: str, context: str,
                       max_overlap: float = 0.8) -> bool:
        """Reject questions that are nearly a word-for-word copy of one sentence.

        A question is trivial when more than *max_overlap* of its distinct
        words occur in a single sentence of *context*.
        """
        return self._below_overlap(
            self._word_set(question), self._sentence_word_sets(context), max_overlap
        )

    def is_not_duplicate(self, question: str, existing: list[str],
                         threshold: float = 0.85) -> bool:
        """Reject questions whose cosine similarity to any existing one reaches *threshold*."""
        if not existing:
            return True
        q_emb = self.embedder.encode([question])
        e_embs = self.embedder.encode(existing)
        sims = util.cos_sim(q_emb, e_embs)[0]
        return float(sims.max()) < threshold

    def filter_batch(self, questions: list[dict], context: str) -> list[dict]:
        """Apply all quality filters and return the questions that pass.

        Each question dict must have ``"question"`` and ``"answer"`` keys.
        Questions are checked in order, and a question is compared for
        duplication only against those already accepted.
        """
        passed = []
        accepted_texts = []
        # The context is parsed at most once per batch, and only if some
        # question gets past the cheap answerability check.
        sentence_sets = None
        for q in questions:
            if not self.is_answerable(q["question"], q["answer"], context):
                continue
            if sentence_sets is None:
                sentence_sets = self._sentence_word_sets(context)
            if not self._below_overlap(self._word_set(q["question"]), sentence_sets, 0.8):
                continue
            if not self.is_not_duplicate(q["question"], accepted_texts):
                continue
            passed.append(q)
            accepted_texts.append(q["question"])
        return passed
LLM-Powered Generation with Structured Output
For the highest quality, use an LLM with structured prompting:
import json
from openai import OpenAI
def generate_quiz_llm(text: str, num_questions: int = 5,
                      difficulty: str = "intermediate") -> list[dict]:
    """Generate multiple-choice questions from *text* with an LLM.

    Args:
        text: Source passage the questions are drawn from.
        num_questions: Number of questions to request.
        difficulty: Free-form difficulty label inserted into the prompt.

    Returns:
        A list of question dicts as produced by the model (requested keys:
        ``question``, ``options``, ``correct``, ``explanation``,
        ``bloom_level``). Empty when nothing was requested, the text is
        blank, or the model returned no content.

    Raises:
        json.JSONDecodeError: If the response is not valid JSON.
        ValueError: If the JSON contains no list of questions.
    """
    # Avoid a paid API call that cannot produce anything useful.
    if num_questions <= 0 or not text.strip():
        return []

    client = OpenAI()
    # The request below forces a JSON *object*, so the prompt asks for an
    # object with a "questions" array rather than a bare array.
    prompt = f"""Generate {num_questions} multiple-choice questions from the following text.
Difficulty level: {difficulty}
Requirements:
- Each question tests an important concept, not a trivial detail
- Exactly 4 options (A-D) with one correct answer
- Distractors should be plausible but clearly wrong
- Questions should require understanding, not just keyword matching
Text:
{text}
Return a JSON object with a single key "questions" whose value is an array of objects containing:
- "question": the question text
- "options": {{"A": "...", "B": "...", "C": "...", "D": "..."}}
- "correct": the letter of the correct answer
- "explanation": why the correct answer is right (1 sentence)
- "bloom_level": taxonomy level (remember/understand/apply/analyze)
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.7,
    )
    content = response.choices[0].message.content
    if not content:
        return []

    result = json.loads(content)
    if isinstance(result, list):
        return result
    if isinstance(result, dict):
        questions = result.get("questions")
        if isinstance(questions, list):
            return questions
        # Tolerate a wrapper key other than "questions".
        for value in result.values():
            if isinstance(value, list):
                return value
        # A single question object returned without a wrapper.
        if "question" in result:
            return [result]
    raise ValueError("LLM response did not contain a list of questions")
Full Pipeline Integration
class QuizGenerator:
    """End-to-end pipeline: candidates -> questions -> filtering -> distractors."""

    def __init__(self, method: str = "hybrid"):
        """Set up the pipeline components.

        Args:
            method: ``"template"`` uses template questions only and skips
                loading the neural generator; any other value generates
                questions with the neural model.
        """
        self.method = method
        self.neural_qg = NeuralQuestionGenerator() if method != "template" else None
        self.distractor_gen = DistractorGenerator()
        self.quality_filter = QuestionQualityFilter()

    def generate_quiz(self, text: str, num_questions: int = 10) -> list[dict]:
        """Generate a complete quiz from source text.

        Returns:
            At most *num_questions* question dicts, each with a
            ``"distractors"`` list attached. Empty when *num_questions* is
            not positive.
        """
        # A negative count would otherwise slice from the end of the lists.
        if num_questions <= 0:
            return []

        # Step 1: Extract answer candidates
        candidates = extract_answer_candidates(text)

        # Step 2: Generate questions, overgenerating so filtering has slack
        raw_questions = []
        for candidate in candidates[: num_questions * 3]:
            if self.method == "template":
                q = generate_template_question(candidate)
                if q:
                    raw_questions.append(q)
            else:
                neural_qs = self.neural_qg.generate(
                    candidate["sentence"], candidate["text"], num_questions=2
                )
                for nq in neural_qs:
                    raw_questions.append({
                        "question": nq,
                        "answer": candidate["text"],
                        "type": "neural",
                        "source_sentence": candidate["sentence"],
                    })

        # Step 3: Quality filter. Filtering does not read distractors, so it
        # runs first and distractors are built only for questions that survive.
        selected = self.quality_filter.filter_batch(raw_questions, text)[:num_questions]

        # Step 4: Generate distractors. Map each answer text to the type of
        # its first candidate once, instead of scanning per question.
        type_by_text = {}
        for candidate in candidates:
            type_by_text.setdefault(candidate["text"], candidate["type"])
        for q in selected:
            q["distractors"] = self.distractor_gen.generate(
                q["answer"],
                type_by_text.get(q["answer"], "DEFAULT"),
                candidates,
            )
        return selected
Bloom’s Taxonomy Classification
Tag questions by cognitive level for balanced assessments:
# Cue words for each Bloom level; dict order decides ties, lowest level first.
BLOOM_KEYWORDS = {
    "remember": {"what", "who", "when", "where", "list", "name", "define", "identify"},
    "understand": {"explain", "describe", "summarize", "compare", "why", "how"},
    "apply": {"calculate", "solve", "use", "demonstrate", "implement"},
    "analyze": {"analyze", "examine", "differentiate", "distinguish", "relationship"},
    "evaluate": {"evaluate", "judge", "justify", "assess", "argue"},
    "create": {"design", "create", "propose", "develop", "construct"},
}


def classify_bloom_level(question: str) -> str:
    """Classify a question's Bloom's taxonomy level from its wording.

    The level whose keyword set shares the most distinct words with the
    question wins; on a tie the level listed first in ``BLOOM_KEYWORDS``
    is kept. Questions that match no keyword are classed as
    ``"remember"``.
    """
    # Treat every non-alphanumeric character as a separator so that cue words
    # followed by punctuation ("Why?", "Design,") still match.
    cleaned = "".join(ch if ch.isalnum() else " " for ch in question.lower())
    words = set(cleaned.split())

    best_level = "remember"
    best_overlap = 0
    for level, keywords in BLOOM_KEYWORDS.items():
        overlap = len(words & keywords)
        if overlap > best_overlap:
            best_overlap = overlap
            best_level = level
    return best_level
Performance and Scaling
Template generation processes 1,000 passages per second on a single CPU core. Neural generation with T5-base handles roughly 10 questions per second on a GPU. LLM generation through an API is rate-limited and costs approximately $0.01-0.05 per question at GPT-4o pricing.
For batch processing (generating quizzes for an entire textbook), run template and neural generation locally in parallel, then use LLM generation selectively for chapters where local methods produce insufficient quality. Store generated questions in a database with metadata (source passage, answer span, generation method, quality scores) to enable retrieval and reuse.
The one thing to remember: A robust quiz generation pipeline overproduces candidates using multiple methods (templates, neural models, LLMs), then filters aggressively for quality — because generating a passable question is easy, but generating a pedagogically valuable one requires both automated checks and human judgment.
See Also
- Python Adaptive Learning Systems — How Python builds learning apps that adjust to each student like a personal tutor who knows exactly what you need next.
- Python Airflow — Learn Airflow as a timetable manager that makes sure data tasks run in the right order every day.
- Python Altair — Learn Altair through the idea of drawing charts by describing rules, not by hand-placing every visual element.
- Python Automated Grading — How Python grades homework and exams automatically, from simple answer keys to understanding written essays.
- Python Batch Vs Stream Processing — Batch processing is like doing laundry once a week; stream processing is like a self-cleaning shirt that cleans itself constantly.