Python C Extension Performance — Deep Dive

Writing C extensions for Python is the most powerful performance optimization available, but it comes with real complexity. This guide covers the mechanics, pitfalls, and modern best practices for shipping C-accelerated Python code.

The CPython C API

Basic extension structure

A minimal C extension module:

#define PY_SSIZE_T_CLEAN
#include <Python.h>

/*
 * fast_sum(list) -> float
 *
 * Sums the items of a Python list as C doubles.
 *
 * args:    a tuple holding exactly one object, which must be a list.
 * returns: a new reference to a Python float, or NULL with an exception set
 *          (TypeError if the argument is not a list, or whatever
 *          PyFloat_AsDouble raised for an item it could not convert).
 */
static PyObject* fast_sum(PyObject* self, PyObject* args) {
    PyObject* list_obj;
    if (!PyArg_ParseTuple(args, "O", &list_obj))
        return NULL;

    if (!PyList_Check(list_obj)) {
        PyErr_SetString(PyExc_TypeError, "expected a list");
        return NULL;
    }

    Py_ssize_t n = PyList_Size(list_obj);
    double total = 0.0;

    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject* item = PyList_GetItem(list_obj, i);  // borrowed ref
        // PyFloat_AsDouble may run an item's __float__, which can shrink the
        // list; PyList_GetItem then returns NULL with IndexError set.
        if (item == NULL)
            return NULL;

        double value = PyFloat_AsDouble(item);
        // -1.0 is also a legitimate value, so it only signals failure when an
        // exception is pending.
        if (value == -1.0 && PyErr_Occurred())
            return NULL;
        total += value;
    }

    // PyFloat_FromDouble is the C API constructor for a Python float.
    return PyFloat_FromDouble(total);
}

static PyMethodDef module_methods[] = {
    {"fast_sum", fast_sum, METH_VARARGS, "Sum a list of floats quickly"},
    {NULL, NULL, 0, NULL}
};

static struct PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT, "fastmath", NULL, -1, module_methods
};

PyMODINIT_FUNC PyInit_fastmath(void) {
    return PyModule_Create(&module_def);
}

Reference counting rules

The most common source of bugs in C extensions:

// RULE 1: Functions that return "new references" — you own it, you must DECREF
// (In real code, check result for NULL first: a failed call returns NULL with
// an exception set, and Py_DECREF must not be given NULL.)
PyObject* result = PyObject_CallFunction(func, "i", 42);
// ... use result ...
Py_DECREF(result);  // you must do this

// RULE 2: Functions that return "borrowed references" — don't DECREF
PyObject* item = PyList_GetItem(list, 0);  // borrowed: the list still owns it
// ... use item but DON'T Py_DECREF(item) ...

// RULE 3: When storing a borrowed reference, INCREF it
// (Separate example — `item` is declared again only for illustration.)
PyObject* item = PyList_GetItem(list, 0);  // borrowed
Py_INCREF(item);  // now you own a reference
// ... later ...
Py_DECREF(item);  // release your reference

// COMMON BUG: forgetting to DECREF in error paths
// Deliberately broken example: the early return below skips the Py_DECREF at
// the bottom, so the list allocated on entry is never released.
static PyObject* leaky_function(PyObject* self, PyObject* args) {
    PyObject* temp = PyList_New(10);  // new reference; this example also never checks it for NULL
    if (some_condition) {
        // BUG: temp is leaked because we return without DECREF
        PyErr_SetString(PyExc_ValueError, "bad input");
        return NULL;
    }
    // ... use temp ...
    Py_DECREF(temp);  // only reached on the success path
    Py_RETURN_NONE;
}

// FIXED: use goto cleanup pattern
// Every exit funnels through the `cleanup` label, so the temporary list is
// released exactly once whether the function succeeds or fails.
// Returns whatever process() produced, or NULL with an exception set.
static PyObject* safe_function(PyObject* self, PyObject* args) {
    PyObject* result = NULL;  // stays NULL on every failure path
    PyObject* temp = PyList_New(10);

    if (temp == NULL) {
        goto cleanup;
    }

    if (some_condition) {
        PyErr_SetString(PyExc_ValueError, "bad input");
        goto cleanup;
    }

    result = process(temp);

cleanup:
    // Py_XDECREF tolerates NULL, which covers the failed-allocation path.
    Py_XDECREF(temp);
    return result;
}

GIL management for parallelism

Releasing the GIL

/*
 * parallel_compute(buffer) -> float
 *
 * Returns the sum of squares of a C-contiguous buffer of native doubles
 * (struct format "d"), releasing the GIL while the loop runs.
 *
 * args:    a tuple holding one object that supports the buffer protocol.
 * returns: a new reference to a Python float, or NULL with an exception set
 *          (TypeError if the buffer does not hold doubles; otherwise whatever
 *          PyArg_ParseTuple / PyObject_GetBuffer raised).
 */
static PyObject* parallel_compute(PyObject* self, PyObject* args) {
    // Parse args while holding GIL
    PyObject* array_obj;
    if (!PyArg_ParseTuple(args, "O", &array_obj))
        return NULL;

    // Get buffer (requires GIL)
    Py_buffer view;
    if (PyObject_GetBuffer(array_obj, &view, PyBUF_C_CONTIGUOUS | PyBUF_FORMAT) < 0)
        return NULL;

    // The buffer protocol hands over raw bytes of any element type. Without
    // this check an int32 or float32 array would be silently reinterpreted
    // as doubles.
    if (view.itemsize != (Py_ssize_t)sizeof(double) ||
        view.format == NULL ||
        view.format[0] != 'd' || view.format[1] != '\0') {
        PyBuffer_Release(&view);
        PyErr_SetString(PyExc_TypeError,
                        "expected a C-contiguous buffer of doubles (format 'd')");
        return NULL;
    }

    const double* data = (const double*)view.buf;
    // Signed division: mixing Py_ssize_t with sizeof would convert to unsigned.
    const Py_ssize_t n = view.len / view.itemsize;
    double result = 0.0;

    // Release GIL for pure C computation. Only raw memory is touched here;
    // the exported view keeps that memory alive until PyBuffer_Release.
    Py_BEGIN_ALLOW_THREADS
    for (Py_ssize_t i = 0; i < n; i++) {
        result += data[i] * data[i];
    }
    Py_END_ALLOW_THREADS

    PyBuffer_Release(&view);
    return PyFloat_FromDouble(result);
}

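Between Py_BEGIN_ALLOW_THREADS and Py_END_ALLOW_THREADS, you must not call any Python C API function or touch any Python object — without the GIL, other threads may modify or free them. The loop above is safe only because it reads raw memory through the buffer view, which stays exported until PyBuffer_Release is called.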

Thread safety with free-threading (Python 3.13+)

The experimental free-threaded build removes the GIL entirely. C extensions must be updated:

// Mark extension as supporting free-threading
static struct PyModuleDef module_def = {
    .m_base = PyModuleDef_HEAD_INIT,
    .m_name = "fastmath",
    .m_size = -1,
    .m_methods = module_methods,
    // New: declare GIL policy
    .m_slots = (PyModuleDef_Slot[]){
        {Py_mod_gil, Py_MOD_GIL_NOT_USED},
        {0, NULL}
    },
};

The buffer protocol for zero-copy data access

Instead of converting Python lists to C arrays (which copies data), use the buffer protocol:

/*
 * process_buffer(data) -> int
 *
 * Adds up every byte of a bytes-like object without copying it.
 * Returns a new reference to a Python int, or NULL with an exception set if
 * argument parsing fails.
 */
static PyObject* process_buffer(PyObject* self, PyObject* args) {
    Py_buffer view;
    unsigned long checksum = 0;

    // "y*" fills `view` from any bytes-like object; we own the view on
    // success and must release it.
    if (!PyArg_ParseTuple(args, "y*", &view)) {
        return NULL;
    }

    // Walk the exporter's memory directly — no copy is made.
    const unsigned char* cursor = (const unsigned char*)view.buf;
    Py_ssize_t remaining = view.len;

    Py_BEGIN_ALLOW_THREADS
    while (remaining > 0) {
        checksum += *cursor;
        cursor++;
        remaining--;
    }
    Py_END_ALLOW_THREADS

    PyBuffer_Release(&view);
    return PyLong_FromUnsignedLong(checksum);
}

This works with bytes, bytearray, contiguous NumPy arrays, and any other object that exports a contiguous buffer through the buffer protocol (the "y*" format rejects non-contiguous buffers).

Cython: the practical middle ground

Typed memoryviews for NumPy integration

# fast_ops.pyx
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt

def pairwise_distances(double[:, :] points):
    """Return the symmetric matrix of Euclidean distances between rows.

    ``points`` is a 2-D float64 memoryview with one point per row. The result
    is an ``(n, n)`` NumPy array whose diagonal is zero.

    50-100× faster than pure Python equivalent.
    """
    cdef Py_ssize_t count = points.shape[0]
    cdef Py_ssize_t width = points.shape[1]
    cdef double[:, :] out = np.zeros((count, count), dtype=np.float64)
    cdef double sq_sum, delta, distance
    cdef Py_ssize_t a, b, k

    # Only C-typed values are touched below, so the GIL can be released.
    with nogil:
        for a in range(count):
            # Visit each unordered pair once and mirror it across the diagonal.
            for b in range(a + 1, count):
                sq_sum = 0.0
                for k in range(width):
                    delta = points[a, k] - points[b, k]
                    sq_sum += delta * delta
                distance = sqrt(sq_sum)
                out[a, b] = distance
                out[b, a] = distance

    return np.asarray(out)

Cython compilation with pyproject.toml

[build-system]
# Packages installed into the build environment before the backend runs.
requires = ["setuptools>=68", "cython>=3.0", "numpy>=1.24"]
build-backend = "setuptools.build_meta"

# NOTE(review): confirm that your build tooling actually reads a [tool.cython]
# table; if it does not, set language_level through
# cythonize(..., compiler_directives={"language_level": "3"}) in setup.py.
[tool.cython]
language_level = "3"
# setup.py
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

# One Extension per compiled module. NumPy's headers are needed because
# fast_ops.pyx does `cimport numpy`.
extensions = [
    Extension(
        "fast_ops",
        ["fast_ops.pyx"],
        include_dirs=[np.get_include()],
    )
]

setup(
    ext_modules=cythonize(
        extensions,
        # Set the language level where cythonize is guaranteed to see it,
        # rather than relying on a pyproject.toml table being picked up.
        compiler_directives={"language_level": "3"},
    )
)

pybind11: C++ integration

NumPy array access without copying

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <cmath>

namespace py = pybind11;

// Returns a new 1-D array holding input / ||input||_2.
//
// `input` is taken as a C-contiguous array of doubles and read in place (no
// copy when it already has that layout). The result always has buf.size
// elements in a single dimension. A zero vector has norm 0, so its elements
// come back as NaN — no exception is raised.
py::array_t<double> normalize(py::array_t<double, py::array::c_style> input) {
    auto buf = input.request();
    const double* ptr = static_cast<const double*>(buf.ptr);
    const size_t n = static_cast<size_t>(buf.size);

    // Allocate the output while the GIL is still held.
    auto result = py::array_t<double>(buf.size);
    auto res_buf = result.request();
    double* res_ptr = static_cast<double*>(res_buf.ptr);

    {
        // Release the GIL for both loops: they touch raw memory only. The
        // braces make the release end before any Python object is returned
        // or destroyed.
        py::gil_scoped_release release;

        // Compute sum of squares
        double sum_sq = 0.0;
        for (size_t i = 0; i < n; i++) {
            sum_sq += ptr[i] * ptr[i];
        }
        const double norm = std::sqrt(sum_sq);

        for (size_t i = 0; i < n; i++) {
            res_ptr[i] = ptr[i] / norm;
        }
    }  // GIL re-acquired here

    return result;
}

// Defines the Python module `linalg`; `m` is the module object being filled in.
PYBIND11_MODULE(linalg, m) {
    // Expose normalize() with a keyword-capable argument named "input".
    m.def("normalize", &normalize, "Normalize a vector", py::arg("input"));
}

Build and distribution

Cross-platform wheels with cibuildwheel

# .github/workflows/build.yml
jobs:
  build_wheels:
    strategy:
      matrix:
        # One runner per target platform; each builds that platform's wheels.
        os: [ubuntu-latest, windows-latest, macos-latest]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v4
      # NOTE(review): confirm this ref exists — the cibuildwheel docs pin a
      # full release tag (v2.x.y) rather than a floating major tag.
      - uses: pypa/cibuildwheel@v2
        env:
          CIBW_BUILD: "cp310-* cp311-* cp312-*"
          # Each wheel is tested in a fresh environment, so the test runner
          # has to be requested explicitly or the test command cannot start.
          CIBW_TEST_REQUIRES: pytest
          CIBW_TEST_COMMAND: "pytest {project}/tests"

cibuildwheel builds wheels for Linux (manylinux), macOS (universal2), and Windows across multiple Python versions.

Providing a pure-Python fallback

# __init__.py
try:
    from ._c_extension import fast_sum  # try C version
except ImportError:
    def fast_sum(data):  # pure Python fallback
        """Sum ``data`` and return a float, matching the C implementation.

        The C version always returns a float (0.0 for an empty list), so the
        sum starts from 0.0 instead of the built-in default of integer 0.
        """
        return sum(data, 0.0)

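This ensures your package can still be imported and used everywhere, even where the compiled extension is unavailable — provided the build is configured to treat the extension as optional, so that a failed compilation does not abort installation.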

Debugging C extensions

Using valgrind

# The suppression file ships with the CPython sources as
# Misc/valgrind-python.supp; set CPYTHON_SRC to your checkout of that tree.
# PYTHONMALLOC=malloc makes Python allocate through the system malloc, so
# valgrind sees each allocation individually.
PYTHONMALLOC=malloc valgrind --tool=memcheck \
    --suppressions="${CPYTHON_SRC}/Misc/valgrind-python.supp" \
    python -c "import my_extension; my_extension.test()"

Using Python’s debug build

# Build Python with debug symbols and internal assertions enabled
./configure --with-pydebug
make -j$(nproc)

# Run in development mode and print the total reference count at exit
# (-X showrefcount only has an effect on a debug build). Build my_extension
# with this interpreter before importing it.
./python -X dev -X showrefcount -c "import my_extension"

The debug build adds assertions for reference counting errors, buffer overflows, and other common C extension bugs.

Performance measurement

import timeit

# Time the built-in sum() against the C extension on the same 100,000 floats.
bench_setup = "from fastmath import fast_sum; data = [float(i) for i in range(100000)]"
runs = 1000

# Each statement is executed `runs` times; timeit returns the total seconds.
py_time, c_time = (
    timeit.timeit(statement, setup=bench_setup, number=runs)
    for statement in ("sum(data)", "fast_sum(data)")
)

print(f"Python: {py_time:.3f}s")
print(f"C ext:  {c_time:.3f}s")
print(f"Speedup: {py_time/c_time:.1f}×")

Typical speedups: 5-10× for simple loops, 50-200× for numerical computation with GIL release, 1-2× for code that’s mostly Python object manipulation (C API overhead eats the benefit).

The one thing to remember: C extensions provide the ultimate Python performance escape hatch — use Cython for gradual optimization of Python code, pybind11 for wrapping existing C++ libraries, and always release the GIL during pure computation to unlock multi-threaded parallelism.

Tags: python, performance, c-extensions

See Also