Search Ranking Algorithms in Python — Deep Dive
Search ranking at production quality requires understanding the tradeoffs between statistical methods, tree-based learning-to-rank, and neural re-rankers. This guide implements each approach in Python with evaluation methodology.
1) BM25 variants and tuning
BM25L — better handling of long documents
Standard BM25 can over-penalize long documents. BM25L adds a floor to term frequency normalization:
import math
def bm25l_score(query_terms, doc_tf, doc_length, avg_doc_length,
doc_freq, total_docs, k1=1.5, b=0.75, delta=0.5):
score = 0.0
for term in query_terms:
tf = doc_tf.get(term, 0)
df = doc_freq.get(term, 0)
idf = math.log((total_docs + 1) / (df + 0.5))
ctf = tf / (1 - b + b * doc_length / avg_doc_length)
tf_component = (k1 + 1) * (ctf + delta) / (k1 + ctf + delta)
score += idf * tf_component
return score
The delta parameter (typically 0.5-1.0) prevents the score from dropping too low for long documents that mention the term only a few times.
BM25+ — addressing the lower-bound issue
BM25+ ensures that adding more occurrences of a query term always increases the score (fixing a theoretical issue in standard BM25):
def bm25plus_score(query_terms, doc_tf, doc_length, avg_doc_length,
doc_freq, total_docs, k1=1.5, b=0.75, delta=1.0):
score = 0.0
for term in query_terms:
tf = doc_tf.get(term, 0)
df = doc_freq.get(term, 0)
idf = math.log((total_docs + 1) / (df + 0.5))
tf_norm = ((k1 + 1) * tf) / (k1 * (1 - b + b * doc_length / avg_doc_length) + tf)
score += idf * (tf_norm + delta)
return score
Parameter tuning with grid search
from rank_bm25 import BM25Okapi
import numpy as np
def evaluate_bm25(corpus, queries, relevance_judgments, k1, b):
bm25 = BM25Okapi(corpus, k1=k1, b=b)
ndcg_scores = []
for query, relevant_docs in zip(queries, relevance_judgments):
scores = bm25.get_scores(query.split())
ranked = np.argsort(-scores)[:10]
ndcg_scores.append(ndcg_at_k(ranked, relevant_docs, k=10))
return np.mean(ndcg_scores)
# Grid search
best_score, best_params = 0, {}
for k1 in [0.5, 1.0, 1.2, 1.5, 2.0]:
for b in [0.0, 0.25, 0.5, 0.75, 1.0]:
score = evaluate_bm25(corpus, queries, relevance, k1, b)
if score > best_score:
best_score = score
best_params = {'k1': k1, 'b': b}
2) Learning to rank with LightGBM
LambdaMART is the workhorse of production LTR. LightGBM provides an efficient implementation.
Feature engineering
def extract_ranking_features(query, document):
"""Extract features for a (query, document) pair."""
query_terms = query.lower().split()
doc_text = document['content'].lower()
title_text = document['title'].lower()
features = {
# Text match features
'bm25_body': compute_bm25(query_terms, doc_text),
'bm25_title': compute_bm25(query_terms, title_text),
'exact_match_title': int(query.lower() in title_text),
'term_coverage': len([t for t in query_terms if t in doc_text]) / len(query_terms),
'query_terms_in_title': sum(1 for t in query_terms if t in title_text) / len(query_terms),
# Document quality features
'doc_length': len(doc_text.split()),
'title_length': len(title_text.split()),
'freshness_days': (datetime.now() - document['published']).days,
'click_through_rate': document.get('ctr', 0.0),
'avg_time_on_page': document.get('avg_dwell_time', 0.0),
# Popularity features
'total_clicks': document.get('total_clicks', 0),
'unique_visitors': document.get('unique_visitors', 0),
'backlink_count': document.get('backlinks', 0),
}
return features
Training
import lightgbm as lgb
import numpy as np
# X_train: feature matrix, shape (n_pairs, n_features)
# y_train: relevance labels (0, 1, 2, 3)
# groups: number of documents per query
train_data = lgb.Dataset(
X_train, label=y_train, group=groups_train,
feature_name=feature_names
)
val_data = lgb.Dataset(
X_val, label=y_val, group=groups_val,
reference=train_data
)
params = {
'objective': 'lambdarank',
'metric': 'ndcg',
'eval_at': [5, 10],
'num_leaves': 63,
'learning_rate': 0.05,
'min_data_in_leaf': 50,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'lambdarank_truncation_level': 10,
'verbose': -1
}
model = lgb.train(
params,
train_data,
num_boost_round=500,
valid_sets=[val_data],
callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
)
# Feature importance
importance = dict(zip(feature_names, model.feature_importance(importance_type='gain')))
sorted_imp = sorted(importance.items(), key=lambda x: -x[1])
for name, imp in sorted_imp[:10]:
print(f"{name}: {imp:.0f}")
Serving
class LTRRanker:
def __init__(self, model_path):
self.model = lgb.Booster(model_file=model_path)
def rank(self, query, candidates):
features = np.array([
extract_ranking_features(query, doc) for doc in candidates
])
scores = self.model.predict(features)
ranked_indices = np.argsort(-scores)
return [(candidates[i], scores[i]) for i in ranked_indices]
3) Neural re-ranking with cross-encoders
Cross-encoders process query and document together through a transformer, producing more accurate relevance scores than bi-encoders — but at higher computational cost.
from sentence_transformers import CrossEncoder
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
def neural_rerank(query, candidates, top_k=10):
pairs = [(query, doc['content'][:500]) for doc in candidates]
scores = reranker.predict(pairs, batch_size=32)
ranked = sorted(zip(candidates, scores), key=lambda x: -x[1])
return ranked[:top_k]
Two-stage architecture
class ProductionRanker:
def __init__(self, bm25_index, ltr_model, cross_encoder):
self.bm25 = bm25_index
self.ltr = ltr_model
self.reranker = cross_encoder
def search(self, query, final_k=10):
# Stage 1: BM25 retrieves 1000 candidates (fast)
candidates = self.bm25.retrieve(query, k=1000)
# Stage 2: LTR re-ranks to top 100 (moderate cost)
ltr_features = self._extract_features(query, candidates)
ltr_scores = self.ltr.predict(ltr_features)
top_100 = [candidates[i] for i in np.argsort(-ltr_scores)[:100]]
# Stage 3: Cross-encoder re-ranks top 100 to final list (expensive but precise)
pairs = [(query, doc['content'][:500]) for doc in top_100]
ce_scores = self.reranker.predict(pairs, batch_size=32)
final = [top_100[i] for i in np.argsort(-ce_scores)[:final_k]]
return final
This three-stage cascade (BM25 → LTR → cross-encoder) balances recall, precision, and latency. Each stage handles fewer documents but applies more expensive scoring.
4) Online evaluation with interleaving
A/B testing ranking changes requires large sample sizes because per-query differences are noisy. Interleaving provides faster signal:
import random
def team_draft_interleave(ranking_a, ranking_b, k=10):
"""Interleave two rankings using Team Draft method."""
interleaved = []
team_a, team_b = set(), set()
idx_a, idx_b = 0, 0
for _ in range(k):
if len(team_a) <= len(team_b):
# Team A picks next
while idx_a < len(ranking_a) and ranking_a[idx_a] in (team_a | team_b):
idx_a += 1
if idx_a < len(ranking_a):
interleaved.append(ranking_a[idx_a])
team_a.add(ranking_a[idx_a])
else:
while idx_b < len(ranking_b) and ranking_b[idx_b] in (team_a | team_b):
idx_b += 1
if idx_b < len(ranking_b):
interleaved.append(ranking_b[idx_b])
team_b.add(ranking_b[idx_b])
return interleaved, team_a, team_b
def compute_interleaving_result(clicks, team_a, team_b):
"""Count clicks attributed to each ranking."""
a_wins = sum(1 for c in clicks if c in team_a)
b_wins = sum(1 for c in clicks if c in team_b)
return {'a_wins': a_wins, 'b_wins': b_wins, 'winner': 'A' if a_wins > b_wins else 'B'}
Interleaving detects ranking quality differences with 10x fewer queries than traditional A/B tests.
5) Offline evaluation pipeline
from collections import defaultdict
import numpy as np
class RankingEvaluator:
def __init__(self, queries, relevance_judgments):
self.queries = queries
self.judgments = relevance_judgments # {query_id: {doc_id: relevance}}
def evaluate(self, ranker, metrics=['ndcg@10', 'map@10', 'mrr']):
results = defaultdict(list)
for qid, query in self.queries.items():
ranked_docs = ranker.search(query, final_k=10)
doc_ids = [d['id'] for d in ranked_docs]
relevances = [self.judgments.get(qid, {}).get(did, 0) for did in doc_ids]
if 'ndcg@10' in metrics:
results['ndcg@10'].append(self._ndcg(relevances, 10))
if 'map@10' in metrics:
results['map@10'].append(self._avg_precision(relevances, 10))
if 'mrr' in metrics:
results['mrr'].append(self._reciprocal_rank(relevances))
return {m: np.mean(v) for m, v in results.items()}
def _ndcg(self, relevances, k):
dcg = sum(r / np.log2(i + 2) for i, r in enumerate(relevances[:k]))
ideal = sorted(relevances, reverse=True)[:k]
idcg = sum(r / np.log2(i + 2) for i, r in enumerate(ideal))
return dcg / idcg if idcg > 0 else 0.0
def _avg_precision(self, relevances, k):
relevant_count = 0
precision_sum = 0.0
for i, r in enumerate(relevances[:k]):
if r > 0:
relevant_count += 1
precision_sum += relevant_count / (i + 1)
return precision_sum / relevant_count if relevant_count > 0 else 0.0
def _reciprocal_rank(self, relevances):
for i, r in enumerate(relevances):
if r > 0:
return 1.0 / (i + 1)
return 0.0
Run evaluation on every ranking change before deploying. Track metrics over time to detect regressions.
One thing to remember: production search ranking is a pipeline problem — BM25 gets you 80% of the way, LTR features push you to 90%, and neural re-ranking closes the final gap, but only if you have the evaluation infrastructure to measure whether each layer actually improves results for your users.
See Also
- Activation Functions Why neural networks need these tiny mathematical functions — and how ReLU's simplicity accidentally made deep learning possible.
- Ai Agents Architecture How AI systems go from answering questions to actually doing things — the design patterns that turn language models into autonomous agents that browse, code, and plan.
- Ai Agents ChatGPT answers questions. AI agents actually do things — browse the web, write code, send emails, and keep going until the job is done. Here's the difference.
- Ai Ethics Why building AI fairly is harder than it sounds — bias, accountability, privacy, and who gets to decide what AI is allowed to do.
- Ai Hallucinations ChatGPT sometimes makes up facts with total confidence. Here's the weird reason why — and why it's not as simple as 'the AI lied.'