Search Ranking Algorithms in Python — Deep Dive

Implement production search ranking in Python: BM25 variants, LambdaMART with LightGBM, neural re-ranking with cross-encoders, and end-to-end evaluation pipelines.

Search ranking at production quality requires understanding the tradeoffs between statistical methods, tree-based learning-to-rank, and neural re-rankers. This guide implements each approach in Python with evaluation methodology.

1) BM25 variants and tuning

BM25L — better handling of long documents

Standard BM25 can over-penalize long documents. BM25L adds a floor to term frequency normalization:

import math

def bm25l_score(query_terms, doc_tf, doc_length, avg_doc_length,
                doc_freq, total_docs, k1=1.5, b=0.75, delta=0.5):
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        df = doc_freq.get(term, 0)

        idf = math.log((total_docs + 1) / (df + 0.5))
        ctf = tf / (1 - b + b * doc_length / avg_doc_length)
        tf_component = (k1 + 1) * (ctf + delta) / (k1 + ctf + delta)
        score += idf * tf_component

    return score

The delta parameter (typically 0.5-1.0) prevents the score from dropping too low for long documents that mention the term only a few times.

BM25+ — addressing the lower-bound issue

BM25+ ensures that adding more occurrences of a query term always increases the score (fixing a theoretical issue in standard BM25):

def bm25plus_score(query_terms, doc_tf, doc_length, avg_doc_length,
                   doc_freq, total_docs, k1=1.5, b=0.75, delta=1.0):
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        df = doc_freq.get(term, 0)

        idf = math.log((total_docs + 1) / (df + 0.5))
        tf_norm = ((k1 + 1) * tf) / (k1 * (1 - b + b * doc_length / avg_doc_length) + tf)
        score += idf * (tf_norm + delta)

    return score

Parameter tuning with grid search

from rank_bm25 import BM25Okapi
import numpy as np

def evaluate_bm25(corpus, queries, relevance_judgments, k1, b):
    bm25 = BM25Okapi(corpus, k1=k1, b=b)
    ndcg_scores = []

    for query, relevant_docs in zip(queries, relevance_judgments):
        scores = bm25.get_scores(query.split())
        ranked = np.argsort(-scores)[:10]
        ndcg_scores.append(ndcg_at_k(ranked, relevant_docs, k=10))

    return np.mean(ndcg_scores)

# Grid search
best_score, best_params = 0, {}
for k1 in [0.5, 1.0, 1.2, 1.5, 2.0]:
    for b in [0.0, 0.25, 0.5, 0.75, 1.0]:
        score = evaluate_bm25(corpus, queries, relevance, k1, b)
        if score > best_score:
            best_score = score
            best_params = {'k1': k1, 'b': b}

2) Learning to rank with LightGBM

LambdaMART is the workhorse of production LTR. LightGBM provides an efficient implementation.

Feature engineering

def extract_ranking_features(query, document):
    """Extract features for a (query, document) pair."""
    query_terms = query.lower().split()
    doc_text = document['content'].lower()
    title_text = document['title'].lower()

    features = {
        # Text match features
        'bm25_body': compute_bm25(query_terms, doc_text),
        'bm25_title': compute_bm25(query_terms, title_text),
        'exact_match_title': int(query.lower() in title_text),
        'term_coverage': len([t for t in query_terms if t in doc_text]) / len(query_terms),
        'query_terms_in_title': sum(1 for t in query_terms if t in title_text) / len(query_terms),

        # Document quality features
        'doc_length': len(doc_text.split()),
        'title_length': len(title_text.split()),
        'freshness_days': (datetime.now() - document['published']).days,
        'click_through_rate': document.get('ctr', 0.0),
        'avg_time_on_page': document.get('avg_dwell_time', 0.0),

        # Popularity features
        'total_clicks': document.get('total_clicks', 0),
        'unique_visitors': document.get('unique_visitors', 0),
        'backlink_count': document.get('backlinks', 0),
    }
    return features

Training

import lightgbm as lgb
import numpy as np

# X_train: feature matrix, shape (n_pairs, n_features)
# y_train: relevance labels (0, 1, 2, 3)
# groups: number of documents per query

train_data = lgb.Dataset(
    X_train, label=y_train, group=groups_train,
    feature_name=feature_names
)
val_data = lgb.Dataset(
    X_val, label=y_val, group=groups_val,
    reference=train_data
)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [5, 10],
    'num_leaves': 63,
    'learning_rate': 0.05,
    'min_data_in_leaf': 50,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambdarank_truncation_level': 10,
    'verbose': -1
}

model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
)

# Feature importance
importance = dict(zip(feature_names, model.feature_importance(importance_type='gain')))
sorted_imp = sorted(importance.items(), key=lambda x: -x[1])
for name, imp in sorted_imp[:10]:
    print(f"{name}: {imp:.0f}")

Serving

class LTRRanker:
    def __init__(self, model_path):
        self.model = lgb.Booster(model_file=model_path)

    def rank(self, query, candidates):
        features = np.array([
            extract_ranking_features(query, doc) for doc in candidates
        ])
        scores = self.model.predict(features)
        ranked_indices = np.argsort(-scores)
        return [(candidates[i], scores[i]) for i in ranked_indices]

3) Neural re-ranking with cross-encoders

Cross-encoders process query and document together through a transformer, producing more accurate relevance scores than bi-encoders — but at higher computational cost.

from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)

def neural_rerank(query, candidates, top_k=10):
    pairs = [(query, doc['content'][:500]) for doc in candidates]
    scores = reranker.predict(pairs, batch_size=32)

    ranked = sorted(zip(candidates, scores), key=lambda x: -x[1])
    return ranked[:top_k]

Two-stage architecture

class ProductionRanker:
    def __init__(self, bm25_index, ltr_model, cross_encoder):
        self.bm25 = bm25_index
        self.ltr = ltr_model
        self.reranker = cross_encoder

    def search(self, query, final_k=10):
        # Stage 1: BM25 retrieves 1000 candidates (fast)
        candidates = self.bm25.retrieve(query, k=1000)

        # Stage 2: LTR re-ranks to top 100 (moderate cost)
        ltr_features = self._extract_features(query, candidates)
        ltr_scores = self.ltr.predict(ltr_features)
        top_100 = [candidates[i] for i in np.argsort(-ltr_scores)[:100]]

        # Stage 3: Cross-encoder re-ranks top 100 to final list (expensive but precise)
        pairs = [(query, doc['content'][:500]) for doc in top_100]
        ce_scores = self.reranker.predict(pairs, batch_size=32)
        final = [top_100[i] for i in np.argsort(-ce_scores)[:final_k]]

        return final

This three-stage cascade (BM25 → LTR → cross-encoder) balances recall, precision, and latency. Each stage handles fewer documents but applies more expensive scoring.

4) Online evaluation with interleaving

A/B testing ranking changes requires large sample sizes because per-query differences are noisy. Interleaving provides faster signal:

import random

def team_draft_interleave(ranking_a, ranking_b, k=10):
    """Interleave two rankings using Team Draft method."""
    interleaved = []
    team_a, team_b = set(), set()
    idx_a, idx_b = 0, 0

    for _ in range(k):
        if len(team_a) <= len(team_b):
            # Team A picks next
            while idx_a < len(ranking_a) and ranking_a[idx_a] in (team_a | team_b):
                idx_a += 1
            if idx_a < len(ranking_a):
                interleaved.append(ranking_a[idx_a])
                team_a.add(ranking_a[idx_a])
        else:
            while idx_b < len(ranking_b) and ranking_b[idx_b] in (team_a | team_b):
                idx_b += 1
            if idx_b < len(ranking_b):
                interleaved.append(ranking_b[idx_b])
                team_b.add(ranking_b[idx_b])

    return interleaved, team_a, team_b

def compute_interleaving_result(clicks, team_a, team_b):
    """Count clicks attributed to each ranking."""
    a_wins = sum(1 for c in clicks if c in team_a)
    b_wins = sum(1 for c in clicks if c in team_b)
    return {'a_wins': a_wins, 'b_wins': b_wins, 'winner': 'A' if a_wins > b_wins else 'B'}

Interleaving detects ranking quality differences with 10x fewer queries than traditional A/B tests.

5) Offline evaluation pipeline

from collections import defaultdict
import numpy as np

class RankingEvaluator:
    def __init__(self, queries, relevance_judgments):
        self.queries = queries
        self.judgments = relevance_judgments  # {query_id: {doc_id: relevance}}

    def evaluate(self, ranker, metrics=['ndcg@10', 'map@10', 'mrr']):
        results = defaultdict(list)

        for qid, query in self.queries.items():
            ranked_docs = ranker.search(query, final_k=10)
            doc_ids = [d['id'] for d in ranked_docs]
            relevances = [self.judgments.get(qid, {}).get(did, 0) for did in doc_ids]

            if 'ndcg@10' in metrics:
                results['ndcg@10'].append(self._ndcg(relevances, 10))
            if 'map@10' in metrics:
                results['map@10'].append(self._avg_precision(relevances, 10))
            if 'mrr' in metrics:
                results['mrr'].append(self._reciprocal_rank(relevances))

        return {m: np.mean(v) for m, v in results.items()}

    def _ndcg(self, relevances, k):
        dcg = sum(r / np.log2(i + 2) for i, r in enumerate(relevances[:k]))
        ideal = sorted(relevances, reverse=True)[:k]
        idcg = sum(r / np.log2(i + 2) for i, r in enumerate(ideal))
        return dcg / idcg if idcg > 0 else 0.0

    def _avg_precision(self, relevances, k):
        relevant_count = 0
        precision_sum = 0.0
        for i, r in enumerate(relevances[:k]):
            if r > 0:
                relevant_count += 1
                precision_sum += relevant_count / (i + 1)
        return precision_sum / relevant_count if relevant_count > 0 else 0.0

    def _reciprocal_rank(self, relevances):
        for i, r in enumerate(relevances):
            if r > 0:
                return 1.0 / (i + 1)
        return 0.0

Run evaluation on every ranking change before deploying. Track metrics over time to detect regressions.

One thing to remember: production search ranking is a pipeline problem — BM25 gets you 80% of the way, LTR features push you to 90%, and neural re-ranking closes the final gap, but only if you have the evaluation infrastructure to measure whether each layer actually improves results for your users.

pythonsearch-rankinglearning-to-rankcross-encoder