Getting Started with Pydapter

Pydapter is a powerful adapter library that lets you easily convert between Pydantic models and various data formats. With the new field system (v0.3.0), creating robust models is easier than ever!

Installation

First, let's install pydapter and its dependencies:

# Create a virtual environment (optional but recommended)
python -m venv pydapter-demo
source pydapter-demo/bin/activate  # On Windows: pydapter-demo\Scripts\activate

# Install pydapter and dependencies
uv pip install pydapter
uv pip install pandas  # For DataFrameAdapter and SeriesAdapter
uv pip install xlsxwriter  # For ExcelAdapter
uv pip install openpyxl  # Also needed for Excel support

# Install optional modules
uv pip install "pydapter[protocols]"      # For standardized model interfaces
uv pip install "pydapter[migrations-sql]" # For database schema migrations
uv pip install "pydapter[memvid]"         # For video-based AI memory storage
uv pip install "pydapter[memvid-pulsar]"  # For enterprise streaming video memory

# Or install all adapters at once
uv pip install "pydapter[all]"

Creating Models with the Field System

Option 1: Using Field Families (New!)

from pydapter.fields import DomainModelBuilder, FieldTemplate
from pydapter.protocols import (
    create_protocol_model_class,
    IDENTIFIABLE,
    TEMPORAL
)

# Build a model with field families
User = (
    DomainModelBuilder("User")
    .with_entity_fields()  # Adds id, created_at, updated_at
    .add_field("name", FieldTemplate(base_type=str))
    .add_field("email", FieldTemplate(base_type=str))
    .add_field("active", FieldTemplate(base_type=bool, default=True))
    .add_field("tags", FieldTemplate(base_type=list[str], default_factory=list))
    .build()
)

# Or create a protocol-compliant model with behaviors
User = create_protocol_model_class(
    "User",
    IDENTIFIABLE,  # Adds id field
    TEMPORAL,      # Adds created_at, updated_at + update_timestamp() method
    name=FieldTemplate(base_type=str),
    email=FieldTemplate(base_type=str),
    active=FieldTemplate(base_type=bool, default=True),
    tags=FieldTemplate(base_type=list[str], default_factory=list)
)
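
Either way you get a regular Pydantic model class. As a quick sketch (assuming the id and timestamp templates provide defaults, as the Protocols section below demonstrates):

user = User(name="Alice", email="alice@example.com")
print(user.id)          # Auto-generated UUID (assumed default)
print(user.created_at)  # Auto-set timestamp (assumed default)

# The TEMPORAL protocol variant also adds update_timestamp()
user.update_timestamp()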

Option 2: Traditional Pydantic Models

from pydantic import BaseModel

# Define a traditional Pydantic model
class User(BaseModel):
    id: int
    name: str
    email: str
    active: bool = True
    tags: list[str] = []

Using Adapters

Once you have your models, you can use pydapter's adapters to convert data:

from pydapter.adapters.json_ import JsonAdapter

# Create some test data
users = [
    User(id=1, name="Alice", email="alice@example.com", tags=["admin", "staff"]),
    User(id=2, name="Bob", email="bob@example.com", active=False),
    User(id=3, name="Charlie", email="charlie@example.com", tags=["staff"]),
]

# If using protocol models with behaviors
if hasattr(users[0], 'update_timestamp'):
    users[0].update_timestamp()  # Updates the updated_at field

# Convert models to JSON
json_data = JsonAdapter.to_obj(users, many=True)
print("JSON Output:")
print(json_data)

# Convert JSON back to models
loaded_users = JsonAdapter.from_obj(User, json_data, many=True)
print("\nLoaded users:")
for user in loaded_users:
    print(f"{user.name} ({user.email}): Active={user.active}, Tags={user.tags}")

Using the Adaptable Mixin for Better Ergonomics

Pydapter provides an Adaptable mixin that makes the API more ergonomic:

from pydantic import BaseModel
from typing import List
from pydapter.core import Adaptable
from pydapter.adapters.json_ import JsonAdapter

# Define a model with the Adaptable mixin
class Product(BaseModel, Adaptable):
    id: int
    name: str
    price: float
    in_stock: bool = True

# Register the JSON adapter
Product.register_adapter(JsonAdapter)

# Create a product
product = Product(id=101, name="Laptop", price=999.99)

# Convert to JSON using the mixin method
json_data = product.adapt_to(obj_key="json")
print("JSON Output:")
print(json_data)

# Convert back to a model
loaded_product = Product.adapt_from(json_data, obj_key="json")
print(f"\nLoaded product: {loaded_product.name} (${loaded_product.price})")

Working with CSV

Here's how to use the CSV adapter:

from pydantic import BaseModel
from pydapter.core import Adaptable
from pydapter.adapters.csv_ import CsvAdapter

# Define a Pydantic model
class Employee(Adaptable, BaseModel):
    id: int
    name: str
    department: str
    salary: float
    hire_date: str

# Create some sample data
employees = [
    Employee(
        id=1, name="Alice", department="Engineering",
        salary=85000, hire_date="2020-01-15"
    ),
    Employee(
        id=2, name="Bob", department="Marketing",
        salary=75000, hire_date="2021-03-20"
    ),
    Employee(
        id=3, name="Charlie", department="Finance",
        salary=95000, hire_date="2019-11-01"
    ),
]

csv_data = CsvAdapter.to_obj(employees, many=True)
print("CSV Output:")
print(csv_data)

# Convert CSV back to models
loaded_employees = CsvAdapter.from_obj(Employee, csv_data, many=True)
print("\nLoaded employees:")
for employee in loaded_employees:
    print(f"{employee.name} - {employee.department} (${employee.salary})")

# You can also save to a file and read from a file
from pathlib import Path

# Save to file
Path("employees.csv").write_text(csv_data)

# Read from file
file_employees = CsvAdapter.from_obj(Employee, Path("employees.csv"), many=True)

Working with TOML

Here's how to use the TOML adapter:

from pathlib import Path
from pydantic import BaseModel
from pydapter.adapters.toml_ import TomlAdapter

# Define a Pydantic model
class AppConfig(BaseModel):
    app_name: str
    version: str
    debug: bool = False
    database: dict[str, str] = {}
    allowed_hosts: list[str] = []

# Create a config

config = AppConfig(
    app_name="MyApp",
    version="1.0.0",
    debug=True,
    database={"host": "localhost", "port": "5432", "name": "myapp"},
    allowed_hosts=["localhost", "example.com"]
)

# Convert to TOML
toml_data = TomlAdapter.to_obj(config)
print("TOML Output:")
print(toml_data)

# Convert TOML back to model
loaded_config = TomlAdapter.from_obj(AppConfig, toml_data)
print("\nLoaded config:")
print(f"App: {loaded_config.app_name} v{loaded_config.version}")
print(f"Debug mode: {loaded_config.debug}")
print(f"Database: {loaded_config.database}")
print(f"Allowed hosts: {loaded_config.allowed_hosts}")

# Save to file
Path("config.toml").write_text(toml_data)

# Read from file
file_config = TomlAdapter.from_obj(AppConfig, Path("config.toml"))
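
For reference, toml_data for the config above should look roughly like this (key order may vary); note how the nested dict becomes a [database] table:

app_name = "MyApp"
version = "1.0.0"
debug = true
allowed_hosts = ["localhost", "example.com"]

[database]
host = "localhost"
port = "5432"
name = "myapp"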

Working with Pandas DataFrame

Here's how to use the DataFrame adapter:

import pandas as pd
from pydantic import BaseModel
from pydapter.extras.pandas_ import DataFrameAdapter

# Define a Pydantic model
class SalesRecord(BaseModel):
    id: int
    product: str
    quantity: int
    price: float
    date: str

# Create a sample DataFrame

df = pd.DataFrame([
    {
        "id": 1, "product": "Laptop", "quantity": 2,
        "price": 999.99, "date": "2023-01-15"
    },
    {
        "id": 2, "product": "Monitor", "quantity": 3,
        "price": 249.99, "date": "2023-01-20"
    },
    {
        "id": 3, "product": "Mouse", "quantity": 5,
        "price": 29.99, "date": "2023-01-25"
    }
])

# Convert DataFrame to models
sales_records = DataFrameAdapter.from_obj(SalesRecord, df, many=True)
print("DataFrame to Models:")
for record in sales_records:
    print(f"{record.id}: {record.quantity} x {record.product} at ${record.price}")

# Convert models back to DataFrame
new_df = DataFrameAdapter.to_obj(sales_records, many=True)
print("\nModels to DataFrame:")
print(new_df)
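
Because from_obj accepts any DataFrame, you can lean on pandas for filtering or transformation before validating rows into models:

# Filter with pandas first, then validate only the matching rows
bulk_orders = df[df["quantity"] >= 3]
bulk_records = DataFrameAdapter.from_obj(SalesRecord, bulk_orders, many=True)
print(f"{len(bulk_records)} orders with quantity >= 3")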

Working with Excel Files

Here's how to use the Excel adapter:

from pathlib import Path
from pydantic import BaseModel
from pydapter.extras.excel_ import ExcelAdapter

# Define a Pydantic model
class Student(BaseModel):
    id: int
    name: str
    grade: str
    score: float

# Create some sample data

students = [
    Student(id=1, name="Alice", grade="A", score=92.5),
    Student(id=2, name="Bob", grade="B", score=85.0),
    Student(id=3, name="Charlie", grade="A-", score=90.0),
]

# Convert to Excel and save to file

excel_data = ExcelAdapter.to_obj(students, many=True, sheet_name="Students")
with open("students.xlsx", "wb") as f:
    f.write(excel_data)

print("Excel file saved as 'students.xlsx'")

# Read from Excel file

loaded_students = ExcelAdapter.from_obj(
    Student, Path("students.xlsx"), many=True
)
print("\nLoaded students:")
for student in loaded_students:
    print(f"{student.name}: {student.grade} ({student.score})")

Working with Video Memory (Memvid)

Pydapter now supports video-based AI memory through the Memvid adapter. This allows you to encode text into video files for efficient storage and semantic search:

from pydantic import BaseModel
from pydapter.extras.memvid_ import MemvidAdapter

# First install memvid: pip install memvid

# Define a document model
class Document(BaseModel):
    id: str
    text: str
    category: str
    source: str

# Create some documents
documents = [
    Document(
        id="1",
        text="Artificial intelligence is transforming how we work with data",
        category="tech",
        source="blog"
    ),
    Document(
        id="2",
        text="Machine learning algorithms can detect patterns in large datasets",
        category="tech",
        source="paper"
    ),
    Document(
        id="3",
        text="Natural language processing enables computers to understand text",
        category="ai",
        source="tutorial"
    ),
]

# Build video memory from documents
build_result = MemvidAdapter.to_obj(
    documents,
    video_file="knowledge_base.mp4",
    index_file="knowledge_index.json",
    text_field="text",  # Field containing text to encode
    chunk_size=512,     # Size of text chunks
    overlap=32         # Overlap between chunks
)

print(f"Encoded {build_result['encoded_count']} documents into video memory")

# Search the video memory
search_config = {
    "video_file": "knowledge_base.mp4",
    "index_file": "knowledge_index.json",
    "query": "machine learning algorithms",
    "top_k": 2  # Return top 2 results
}

results = MemvidAdapter.from_obj(Document, search_config, many=True)
print(f"\nFound {len(results)} relevant documents:")
for doc in results:
    print(f"- {doc.text}")

Advanced: Using Pulsar for Streaming Video Memory

For enterprise use cases, you can use the Pulsar-enhanced Memvid adapter for distributed video memory operations:

import asyncio
from pydapter.extras.async_memvid_pulsar import AsyncPulsarMemvidAdapter

# First install dependencies: pip install memvid pulsar-client

# Reuses the `documents` list from the Memvid example above
async def demo_streaming_memory():
    # Stream documents for video memory creation
    stream_result = await AsyncPulsarMemvidAdapter.to_obj(
        documents,
        pulsar_url="pulsar://localhost:6650",
        topic="memory-operations",
        memory_id="knowledge-base-v1",
        video_file="memories/knowledge.mp4",
        index_file="memories/knowledge.json",
        async_processing=False  # Process immediately for demo
    )

    print(f"Streaming result: {stream_result['success']}")

    # Search with direct query
    search_result = await AsyncPulsarMemvidAdapter.from_obj(
        Document,
        {
            "pulsar_url": "pulsar://localhost:6650",
            "query": "artificial intelligence",
            "video_file": "memories/knowledge.mp4",
            "index_file": "memories/knowledge.json",
            "memory_id": "knowledge-base-v1"
        },
        many=True
    )

    print(f"Found {len(search_result)} documents via streaming search")

# Run the async demo
# asyncio.run(demo_streaming_memory())

Error Handling

Let's demonstrate proper error handling:

from pydantic import BaseModel, Field
from pydapter.adapters.json_ import JsonAdapter
from pydapter.exceptions import ParseError, ValidationError as AdapterValidationError

# Define a model with validation constraints
class Product(BaseModel):
    id: int = Field(gt=0)  # Must be greater than 0
    name: str = Field(min_length=3)  # Must be at least 3 characters
    price: float = Field(gt=0.0)  # Must be greater than 0

# Handle parsing errors

try:
    # Invalid JSON: single quotes and the unquoted `price` key
    invalid_json = "{ 'id': 1, 'name': 'Laptop', price: 999.99 }"
    product = JsonAdapter.from_obj(Product, invalid_json)
except ParseError as e:
    print(f"Parsing error: {e}")

# Handle validation errors

try:
    # Valid JSON, but every field violates a constraint
    invalid_data = '{"id": 0, "name": "A", "price": -10.0}'
    product = JsonAdapter.from_obj(Product, invalid_data)
except AdapterValidationError as e:
    print(f"Validation error: {e}")
    if hasattr(e, 'errors') and callable(e.errors):
        for error in e.errors():
            print(f"  - {error['loc']}: {error['msg']}")

Using Protocols

Pydapter provides a set of standardized interfaces through the protocols module. These protocols allow you to add common capabilities to your models:

from pydapter.protocols import Identifiable, Temporal

# Define a model with standardized interfaces

class User(Identifiable, Temporal):
    name: str
    email: str

# Create a user

user = User(name="Alice", email="alice@example.com")

# Access standardized properties
print(f"User ID: {user.id}")  # Automatically generated UUID
print(f"Created at: {user.created_at}")  # Automatically set timestamp

# Update the timestamp

user.name = "Alicia"
user.update_timestamp()
print(f"Updated at: {user.updated_at}")

For more details, see the Protocols documentation and the Using Protocols tutorial.

Using Migrations

Pydapter provides tools for managing database schema changes through the migrations module:

from pydapter.migrations import AlembicAdapter
import mymodels  # Module containing your SQLAlchemy models

# Initialize migrations

AlembicAdapter.init_migrations(
    directory="migrations",
    connection_string="postgresql://user:pass@localhost/mydb",
    models_module=mymodels
)

# Create a migration

revision = AlembicAdapter.create_migration(
    message="Create users table",
    autogenerate=True,
    directory="migrations",
    connection_string="postgresql://user:pass@localhost/mydb"
)

# Apply migrations

AlembicAdapter.upgrade(
    revision="head",
    directory="migrations",
    connection_string="postgresql://user:pass@localhost/mydb"
)

For more details, see the Migrations documentation and the Using Migrations tutorial.

Video-based AI Memory with Memvid

Pydapter includes support for video-based AI memory through the Memvid adapter, which allows you to encode text data into video format for semantic search and retrieval.

Basic Memvid Adapter

The MemvidAdapter converts text data to video-based memory and enables semantic search across encoded content.

from pydantic import BaseModel
from pydapter.core import Adaptable
from pydapter.extras.memvid_ import MemvidAdapter

class Document(Adaptable, BaseModel):
    id: str
    text: str
    category: str = "general"

# Register the adapter
Document.register_adapter(MemvidAdapter)

# Create documents
docs = [
    Document(
        id="1",
        text="Machine learning is transforming how we process data",
        category="tech"
    ),
    Document(
        id="2",
        text="Natural language processing enables computers to understand text",
        category="ai"
    ),
    Document(
        id="3",
        text="Computer vision allows machines to interpret visual information",
        category="cv"
    )
]

# Build video memory from documents
result = MemvidAdapter.to_obj(
    docs,
    video_file="knowledge_base.mp4",
    index_file="knowledge_index.json",
    chunk_size=512,
    overlap=50,
    codec="h265"
)
print(f"Encoded {result['encoded_count']} documents")
print(f"Created {result['chunks']} chunks across {result['frames']} frames")

# Search the video memory
search_results = MemvidAdapter.from_obj(
    Document,
    {
        "video_file": "knowledge_base.mp4",
        "index_file": "knowledge_index.json",
        "query": "machine learning data processing",
        "top_k": 3
    },
    many=True
)

for doc in search_results:
    print(f"Found: {doc.text[:50]}...")

Enterprise Async Pulsar Memvid Adapter

For enterprise applications, the AsyncPulsarMemvidAdapter provides streaming video memory creation and search through Apache Pulsar.

import asyncio
from pydantic import BaseModel
from pydapter.async_core import AsyncAdaptable
from pydapter.extras.async_memvid_pulsar import AsyncPulsarMemvidAdapter

class Document(AsyncAdaptable, BaseModel):
    id: str
    text: str
    category: str = "general"
    source: str = "system"

# Register the async adapter
Document.register_async_adapter(AsyncPulsarMemvidAdapter)

async def build_distributed_memory():
    """Build video memory using distributed Pulsar streaming."""

    # Create sample documents
    docs = [
        Document(
            id="doc1",
            text="Advanced neural networks enable complex pattern recognition",
            category="deep_learning",
            source="research"
        ),
        Document(
            id="doc2",
            text="Transformer architectures revolutionized natural language",
            category="nlp",
            source="papers"
        )
    ]

    # Stream to Pulsar for async video memory creation
    result = await AsyncPulsarMemvidAdapter.to_obj(
        docs,
        pulsar_url="pulsar://localhost:6650",
        topic="memory-updates",
        memory_id="enterprise-kb",
        video_file="/data/memories/enterprise_kb.mp4",
        index_file="/data/memories/enterprise_kb.json",
        async_processing=True  # Process asynchronously
    )

    print(f"Streaming result: {result}")

    # Search using streaming
    search_results = await AsyncPulsarMemvidAdapter.from_obj(
        Document,
        {
            "pulsar_url": "pulsar://localhost:6650",
            "search_topic": "search-queries",
            "video_file": "/data/memories/enterprise_kb.mp4",
            "index_file": "/data/memories/enterprise_kb.json",
            "query": "neural networks pattern recognition",
            "top_k": 5
        },
        many=True
    )

    return search_results

# Run the async example
# results = asyncio.run(build_distributed_memory())

Installation

To use Memvid adapters, install the required dependencies:

# For basic Memvid support
pip install memvid

# For enterprise Pulsar streaming (optional)
pip install pulsar-client
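
If your code should degrade gracefully when the optional dependency is missing, you can guard the import (a common Python pattern, not a pydapter-specific API):

try:
    from pydapter.extras.memvid_ import MemvidAdapter
except ImportError:  # memvid extras not installed
    MemvidAdapter = None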

Key Features

  • Video-based encoding: Convert text to searchable video format
  • Semantic search: Find similar content using embeddings
  • Chunking support: Automatic text chunking with overlap
  • Multiple codecs: Support for H.264, H.265, and other formats
  • Async streaming: Enterprise-grade processing with Apache Pulsar
  • Distributed processing: Scale across multiple workers
  • Error handling: Robust error recovery and validation

Use Cases

  • Knowledge bases: Build searchable video memories from documents
  • Content libraries: Encode and search large text collections
  • Research databases: Semantic search across academic papers
  • Enterprise search: Distributed text processing and retrieval
  • AI training data: Prepare text data for machine learning models