Test Data Factories — Deep Dive

Factory architecture for large codebases

In serious projects, factories need organization. A common layout:

tests/
├── factories/
│   ├── __init__.py          # Re-exports all factories
│   ├── base.py              # Base factory classes
│   ├── users.py             # User-related factories
│   ├── orders.py            # Order-related factories
│   ├── products.py          # Product-related factories
│   └── payments.py          # Payment-related factories
├── conftest.py              # Registers factories as fixtures
└── ...

The base module establishes conventions:

# tests/factories/base.py
import factory
from myapp.database import Session


class BaseFactory(factory.alchemy.SQLAlchemyModelFactory):
    """Abstract parent class for the project's SQLAlchemy-backed factories."""

    class Meta:
        # Abstract: this factory declares no model and exists only to be subclassed.
        abstract = True
        # Session object imported from myapp.database; shared by subclasses.
        sqlalchemy_session = Session
        # factory_boy option: commit the session after an object is created.
        sqlalchemy_session_persistence = "commit"

All concrete factories inherit from this, ensuring consistent database session handling.

SQLAlchemy integration patterns

factory_boy’s SQLAlchemy integration handles the ORM lifecycle:

# tests/factories/users.py
import factory
from myapp.models import User, Address, Subscription
from .base import BaseFactory


class AddressFactory(BaseFactory):
    """Factory for Address objects with Faker-generated location fields."""

    class Meta:
        model = Address

    # Street, city, state abbreviation and ZIP code come from Faker providers.
    street = factory.Faker("street_address")
    city = factory.Faker("city")
    state = factory.Faker("state_abbr")
    zip_code = factory.Faker("zipcode")
    # Country is a fixed value rather than generated.
    country = "US"


class SubscriptionFactory(BaseFactory):
    """Factory for Subscription objects; defaults describe a free plan.

    Traits:
        premium: switch to the "premium" plan with auto-renew enabled and an
            expiry date generated between 30 and 365 days from now.
    """

    class Meta:
        model = Subscription

    # Defaults: free plan, no expiry, no automatic renewal.
    plan = "free"
    started_at = factory.Faker("date_time_this_year")
    expires_at = None
    auto_renew = False

    class Params:
        # Enabled with SubscriptionFactory(premium=True); overrides the
        # defaults declared above.
        premium = factory.Trait(
            plan="premium",
            auto_renew=True,
            expires_at=factory.Faker(
                "date_time_between",
                start_date="+30d",
                end_date="+365d",
            ),
        )


class UserFactory(BaseFactory):
    """Factory for User objects, each with an address and a subscription.

    Traits:
        premium: build the subscription with SubscriptionFactory's
            ``premium`` trait enabled.
        with_orders: attach three orders whose ``customer`` is this user.
    """

    class Meta:
        model = User

    name = factory.Faker("name")

    @factory.lazy_attribute_sequence
    def email(self, n):
        """Return a unique ``@test.com`` address derived from ``name``."""
        # Faker names may include titles or suffixes such as "Dr." or "Jr.";
        # strip their dots so the local part never contains ".." or ends
        # with ".".
        stripped = (word.strip(".") for word in self.name.lower().split())
        local_parts = [word for word in stripped if word]
        # Random names can repeat, so the factory sequence number is appended
        # to keep every generated address distinct.
        local_parts.append(str(n))
        return f"{'.'.join(local_parts)}@test.com"

    # Constant string: no password hashing is performed per generated user.
    password_hash = "$2b$12$fixed_hash_for_testpass123"
    is_active = True
    created_at = factory.Faker("date_time_this_year")

    # Related objects are generated together with the user.
    address = factory.SubFactory(AddressFactory)
    subscription = factory.SubFactory(SubscriptionFactory)

    class Params:
        premium = factory.Trait(
            subscription=factory.SubFactory(
                SubscriptionFactory, premium=True
            ),
        )
        # The orders factory is referenced by dotted path instead of being
        # imported here; orders.py refers back to this module the same way.
        with_orders = factory.Trait(
            orders=factory.RelatedFactoryList(
                "tests.factories.orders.OrderFactory",
                factory_related_name="customer",
                size=3,
            ),
        )

The premium trait and with_orders trait compose: UserFactory(premium=True, with_orders=True) creates a premium user with three orders in one call.

Complex relationship graphs

Real applications have deep object graphs. Factories manage this through layered SubFactories:

# tests/factories/orders.py
from .base import BaseFactory
from .products import ProductFactory
from myapp.models import Order, OrderItem, Payment
import factory


class OrderItemFactory(BaseFactory):
    """Factory for OrderItem objects priced from their product."""

    class Meta:
        model = OrderItem

    product = factory.SubFactory(ProductFactory)
    quantity = factory.Faker("random_int", min=1, max=5)

    @factory.lazy_attribute
    def unit_price(self):
        """Copy the price of the generated (or supplied) product."""
        return self.product.price

    @factory.lazy_attribute
    def total(self):
        """Line total: unit price multiplied by quantity."""
        return self.unit_price * self.quantity


class PaymentFactory(BaseFactory):
    """Factory for Payment objects; defaults describe a completed card payment.

    Traits:
        failed: status "failed" with error code "card_declined" and no
            processing timestamp.
    """

    class Meta:
        model = Payment

    method = "credit_card"
    status = "completed"
    # Reads ``order.total``. No ``order`` declaration exists on this factory,
    # so the order has to be supplied by the caller (OrderFactory does this
    # through its RelatedFactory). NOTE(review): calling PaymentFactory()
    # without ``order=`` presumably fails on this attribute -- confirm.
    amount = factory.LazyAttribute(lambda o: o.order.total)
    processed_at = factory.Faker("date_time_this_month")
    transaction_id = factory.Faker("uuid4")

    class Params:
        # Enabled with PaymentFactory(failed=True, ...).
        failed = factory.Trait(
            status="failed",
            error_code="card_declined",
            processed_at=None,
        )


class OrderFactory(BaseFactory):
    """Factory for Order objects together with their line items and payment.

    Generating an order also generates a customer, two order items and one
    payment. The items and the payment are post-generation declarations that
    point back at the order through their ``order`` attribute.

    Traits:
        unpaid: status "pending" and a payment generated with PaymentFactory's
            ``failed`` trait.
    """

    class Meta:
        model = Order

    # Referenced by dotted path because users.py refers back to this module.
    customer = factory.SubFactory(
        "tests.factories.users.UserFactory"
    )
    status = "completed"
    created_at = factory.Faker("date_time_this_month")

    # Fixed placeholder total. ``items`` is a post-generation declaration, so
    # the line items do not exist yet when the order's own attributes are
    # evaluated and cannot be summed here. The earlier lazy attribute guarded
    # on an ``_items`` attribute that is never present at that stage, so it
    # always produced this same constant. Pass ``total=...`` to override it.
    total = 99.99

    items = factory.RelatedFactoryList(
        OrderItemFactory,
        factory_related_name="order",
        size=2,
    )
    # PaymentFactory derives its amount from ``order.total`` declared above.
    payment = factory.RelatedFactory(
        PaymentFactory,
        factory_related_name="order",
    )

    class Params:
        unpaid = factory.Trait(
            status="pending",
            payment=factory.RelatedFactory(
                PaymentFactory,
                factory_related_name="order",
                failed=True,
            ),
        )

Creating OrderFactory() generates a full object graph: Customer (with its Address and Subscription) → Order → 2 OrderItems → 2 Products → Payment.

Custom strategies for edge cases

Build specialized factories for testing specific scenarios:

class EdgeCaseUserFactory(UserFactory):
    """Users with data that commonly causes bugs.

    Each trait below is switched on by passing ``<trait>=True`` and overrides
    only the fields it lists.
    """

    class Params:
        # Accented characters and a hyphen in the name.
        unicode_name = factory.Trait(
            name="José María García-López",
            email="jose.garcia@ejemplo.com",
        )
        # 200-character name and a 200-character email local part.
        extremely_long = factory.Trait(
            name="A" * 200,
            email="a" * 200 + "@example.com",
        )
        # Apostrophe, hyphen and HTML markup in the name; "+" tag and
        # multi-level domain in the email.
        special_chars = factory.Trait(
            name="O'Brien-Smith <script>alert('xss')</script>",
            email="user+tag@sub.domain.example.com",
        )
        # Optional data left empty; ``address=None`` replaces the SubFactory
        # declared on UserFactory.
        empty_optional = factory.Trait(
            phone=None,
            address=None,
            bio="",
        )
        # Creation timestamp between one and five years in the future.
        future_dates = factory.Trait(
            created_at=factory.Faker(
                "date_time_between",
                start_date="+1y",
                end_date="+5y",
            ),
        )

Use these in parameterized tests:

@pytest.mark.parametrize(
    "trait",
    [
        "unicode_name",
        "extremely_long",
        "special_chars",
        "empty_optional",
        "future_dates",
    ],
)
def test_user_profile_handles_edge_cases(db_session, trait):
    """The user profile endpoint answers 200 for every edge-case trait."""
    enabled_trait = {trait: True}
    edge_user = EdgeCaseUserFactory(**enabled_trait)
    profile_url = f"/api/users/{edge_user.id}"
    assert client.get(profile_url).status_code == 200

Performance optimization

Factories that hit the database on every call slow down large test suites. Strategies to speed things up:

Build instead of create

Factory.build() creates in-memory objects without database inserts:

# build() instantiates the object without saving it, which avoids the
# database round trip that create() makes -- well suited to unit tests.
user = UserFactory.build()
assert user.email.endswith("@test.com")

Bulk creation

For tests that need large datasets:

def test_pagination_with_many_users(db_session):
    """Page 5 at 20 per page is full and the reported total is 500."""
    UserFactory.create_batch(500)
    db_session.commit()

    payload = client.get("/api/users?page=5&per_page=20").json()
    assert len(payload["items"]) == 20
    assert payload["total"] == 500

Shared reference objects

Avoid creating duplicate parent objects:

def test_orders_for_same_customer(db_session):
    """Ten orders created with one explicit customer all reference it."""
    shared_customer = UserFactory.create()
    # Passing customer= stops OrderFactory from generating a new user per order.
    batch = OrderFactory.create_batch(10, customer=shared_customer)
    for placed_order in batch:
        assert placed_order.customer_id == shared_customer.id

Factory fixtures with scope

# conftest.py
@pytest.fixture(scope="module")
def reference_product(db_session):
    """Create one "Standard Widget" product (price 29.99) per test module.

    NOTE(review): this fixture is module-scoped yet requests ``db_session``.
    pytest only allows that when ``db_session`` has module scope or broader --
    confirm against the ``db_session`` fixture definition.
    """
    widget_fields = {"name": "Standard Widget", "price": 29.99}
    return ProductFactory.create(**widget_fields)


@pytest.fixture
def order(db_session, reference_product):
    """Order whose line items all use the shared reference product."""
    # RelatedFactoryList forwards ``items__<field>`` overrides to every item
    # it generates; it has no per-index form such as ``items__0__product``,
    # which factory_boy would treat as a deep override for an unknown field.
    return OrderFactory.create(
        items__product=reference_product,
    )

Faker providers for domain-specific data

Create custom Faker providers for your domain:

from faker.providers import BaseProvider


class EcommerceProvider(BaseProvider):
    """Faker provider that generates e-commerce flavoured values."""

    PRODUCT_CATEGORIES = [
        "Electronics",
        "Clothing",
        "Books",
        "Home & Garden",
        "Sports",
        "Toys",
    ]
    COUPON_PREFIXES = ["SAVE", "DEAL", "PROMO", "WELCOME"]

    def product_category(self) -> str:
        """Return one randomly chosen entry of PRODUCT_CATEGORIES."""
        categories = self.PRODUCT_CATEGORIES
        return self.random_element(categories)

    def coupon_code(self) -> str:
        """Return a COUPON_PREFIXES entry followed by a number from 10 to 99."""
        chosen_prefix = self.random_element(self.COUPON_PREFIXES)
        two_digits = self.random_int(min=10, max=99)
        return chosen_prefix + str(two_digits)

    def tracking_number(self) -> str:
        """Return a carrier prefix ("1Z", "94" or "JD") followed by 18 digits."""
        pieces = [self.random_element(["1Z", "94", "JD"])]
        for _ in range(18):
            pieces.append(str(self.random_int(0, 9)))
        return "".join(pieces)


# Register the provider on a standalone Faker instance for direct use
import factory
from faker import Faker
fake = Faker()
fake.add_provider(EcommerceProvider)

# factory.Faker declarations use factory_boy's own Faker instances, not the
# ``fake`` object above, so the provider has to be registered there as well;
# otherwise factory.Faker("product_category") cannot resolve the method.
factory.Faker.add_provider(EcommerceProvider)

# Use in factories
class ProductFactory(BaseFactory):
    # Abbreviated for illustration: this snippet declares no Meta/model.
    # "product_category" names the EcommerceProvider method to call.
    category = factory.Faker("product_category")

class ShipmentFactory(BaseFactory):
    # Abbreviated for illustration: this snippet declares no Meta/model.
    # "tracking_number" names the EcommerceProvider method to call.
    tracking_number = factory.Faker("tracking_number")

Testing the factories themselves

Factories are code — they can have bugs too:

class TestFactoryIntegrity:
    """Sanity checks on the objects the factories themselves produce."""

    def test_user_factory_creates_valid_user(self, db_session):
        """A default user is persisted, has an email and is active."""
        created = UserFactory.create()
        assert created.id is not None
        assert "@" in created.email
        assert created.is_active is True

    def test_user_factory_creates_unique_emails(self, db_session):
        """A batch of 100 users yields 100 distinct email addresses."""
        batch = UserFactory.create_batch(100)
        distinct_emails = {member.email for member in batch}
        assert len(distinct_emails) == 100

    def test_order_factory_creates_full_graph(self, db_session):
        """A default order has a customer, two items and a completed payment."""
        created = OrderFactory.create()
        assert created.customer is not None
        assert len(created.items) == 2
        assert created.payment is not None
        assert created.payment.status == "completed"

    def test_premium_trait_sets_correct_fields(self, db_session):
        """The premium trait yields an auto-renewing premium subscription."""
        premium_user = UserFactory.create(premium=True)
        subscription = premium_user.subscription
        assert subscription.plan == "premium"
        assert subscription.auto_renew is True

Run factory integrity tests as part of CI to catch issues early — a broken factory can cascade failures across hundreds of tests.

The one thing to remember: Well-architected test data factories centralize object creation logic, manage complex relationship graphs through SubFactories and traits, and use build mode for speed and create mode for integration tests — becoming the foundation that keeps your entire test suite maintainable.

python · testing · quality

See Also