Skip to main content

Documentation Index

Fetch the complete documentation index at: https://mintlify.com/duckdb/duckdb/llms.txt

Use this file to discover all available pages before exploring further.

Python API Examples

This guide provides comprehensive examples of using DuckDB’s Python API, including basic SQL operations, integration with pandas and NumPy, and the relational API.

Installation

Install DuckDB using pip:
pip install duckdb
For development or specific versions:
pip install duckdb==0.10.0

Basic SQL API

Creating a Connection

import duckdb

# Connect to an in-memory database
conn = duckdb.connect()

# Or connect to a persistent database
# conn = duckdb.connect('my_database.db')

Creating Tables and Inserting Data

import duckdb

conn = duckdb.connect()

# Create a table
conn.execute("CREATE TABLE test_table (i INTEGER, j STRING)")

# Insert single row
conn.execute("INSERT INTO test_table VALUES (1, 'one')")

# Use placeholders for parameters
conn.execute("INSERT INTO test_table VALUES (?, ?)", [2, 'two'])

# Insert multiple rows with executemany()
conn.executemany("INSERT INTO test_table VALUES (?, ?)", 
                 [[3, 'three'], [4, 'four']])

Querying Data

# Fetch as pandas DataFrame
df = conn.execute("SELECT * FROM test_table").fetchdf()
print(df)

# Fetch as NumPy arrays (better for handling NULLs)
arrays = conn.execute("SELECT * FROM test_table").fetchnumpy()
print(arrays)

# Fetch one row at a time
result = conn.execute("SELECT * FROM test_table")
row = result.fetchone()
print(row)

# Fetch all rows
rows = conn.execute("SELECT * FROM test_table").fetchall()
print(rows)

Integration with Pandas

Querying Pandas DataFrames

import duckdb
import pandas as pd

conn = duckdb.connect()

# Create a pandas DataFrame
test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4], 
    "j": ["one", "two", "three", "four"]
})

# Register DataFrame as a virtual table
conn.register("test_df", test_df)

# Query the DataFrame using SQL
result = conn.execute("SELECT j FROM test_df WHERE i > 1").fetchdf()
print(result)

Direct DataFrame Queries

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

# Query DataFrame directly without explicit registration
result = duckdb.query_df(test_df, 'my_df', 'SELECT * FROM my_df WHERE i > 2')
print(result.df())

Relational API

The relational API provides a programmatic way to build queries using lazy evaluation.

Creating Relations

import duckdb
import pandas as pd

conn = duckdb.connect()
test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

# Create relation from DataFrame
rel = conn.from_df(test_df)
print(rel)

# Alternative: use built-in default connection
rel = duckdb.df(test_df)
print(rel)

# Create relation from existing table
conn.execute("CREATE TABLE test_table (i INTEGER, j STRING)")
conn.executemany("INSERT INTO test_table VALUES (?, ?)",
                [[1, 'one'], [2, 'two'], [3, 'three'], [4, 'four']])
rel = conn.table("test_table")
print(rel)

Creating Relations from CSV

import duckdb
import pandas as pd
import tempfile
import os

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

# Create temporary CSV file
temp_file_name = os.path.join(tempfile.mkdtemp(), 'data.csv')
test_df.to_csv(temp_file_name, index=False)

# Create relation from CSV
rel = duckdb.from_csv_auto(temp_file_name)
print(rel)

Relation Metadata

import duckdb

conn = duckdb.connect()
conn.execute("CREATE TABLE test_table (i INTEGER, j STRING)")
rel = conn.table("test_table")

# Get relation alias
print(rel.alias)  # 'test_table'

# Change alias
rel2 = rel.set_alias('new_alias')
print(rel2.alias)  # 'new_alias'

# Inspect relation type
print(rel.type)

# Get column names
print(rel.columns)  # ['i', 'j']

# Get column types
print(rel.types)  # [INTEGER, VARCHAR]

Filtering and Projection

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = duckdb.df(test_df)

# Filter rows
print(rel.filter('i > 1'))

# Project specific columns
print(rel.project('i, j'))

# Transform columns
print(rel.project('i + 1'))

Sorting and Limiting

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = duckdb.df(test_df)

# Order results
print(rel.order('j'))

# Limit rows
print(rel.limit(2))

# Skip rows and limit
print(rel.limit(2, offset=1))

Method Chaining

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = duckdb.df(test_df)

# Chain multiple operations
result = rel.filter('i > 1').project('i + 1, j').order('j').limit(2)
print(result)

Aggregation

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = duckdb.df(test_df)

# Simple aggregation
print(rel.aggregate("sum(i)"))

# Aggregation with implicit grouping
print(rel.aggregate("j, sum(i)"))

# Explicit grouping
print(rel.aggregate("sum(i)", "j"))

# Distinct values
print(rel.distinct())

Multi-Relation Operations

import duckdb
import pandas as pd

conn = duckdb.connect()
test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = conn.from_df(test_df)
rel2 = conn.from_df(test_df)

# Union
print(rel.union(rel))

# Join on column
print(rel.join(rel2, 'i'))

# Join with explicit condition
print(rel.set_alias('a').join(rel.set_alias('b'), 'a.i=b.i'))

Shorthand Methods on DataFrames

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

# Direct operations on DataFrames
print(duckdb.filter(test_df, 'i > 1'))
print(duckdb.project(test_df, 'i + 1'))
print(duckdb.order(test_df, 'j'))
print(duckdb.limit(test_df, 2))
print(duckdb.aggregate(test_df, "sum(i)"))
print(duckdb.distinct(test_df))

# Chaining (first call includes DataFrame parameter)
print(duckdb.filter(test_df, 'i > 1').project('i + 1, j').order('j').limit(2))

Converting Relations to Other Formats

import duckdb
import pandas as pd

conn = duckdb.connect()
test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = conn.from_df(test_df)

# Execute and get query result
res = rel.execute()
print(res.fetchone())
print(res.fetchall())

# Convert to pandas DataFrame
print(rel.to_df())

# Shorthand for to_df()
print(rel.df())

Creating Tables from Relations

import duckdb
import pandas as pd

conn = duckdb.connect()
test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = conn.from_df(test_df)

# Create table from relation
rel.create("test_table2")

# Insert into existing table
conn.execute("CREATE TABLE test_table3 (i INTEGER, j STRING)")
rel.insert_into("test_table3")

# Insert single values
conn.values([5, 'five']).insert_into("test_table3")

# Insert using relation method
rel_3 = conn.table("test_table3")
rel_3.insert([6, 'six'])

# Create view from relation
rel.create_view('test_view')

Querying Relations with SQL

import duckdb
import pandas as pd

test_df = pd.DataFrame.from_dict({
    "i": [1, 2, 3, 4],
    "j": ["one", "two", "three", "four"]
})

rel = duckdb.df(test_df)

# Run SQL on relation (first parameter is the view name)
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel')
print(res.fetchone())
print(res.fetchdf())

# Shorthand for fetchdf()
print(res.df())

# Query DataFrames directly
res = duckdb.query_df(test_df, 'my_name_for_test_df', 
                      'SELECT * FROM my_name_for_test_df')
print(res.df())

Complete Working Example

Here’s a comprehensive example combining multiple features:
import duckdb
import pandas as pd

# Create connection
conn = duckdb.connect()

# Create and populate table
conn.execute("CREATE TABLE sales(product VARCHAR, quantity INTEGER, price DOUBLE)")
conn.executemany(
    "INSERT INTO sales VALUES (?, ?, ?)",
    [
        ['Widget', 10, 19.99],
        ['Gadget', 5, 49.99],
        ['Widget', 15, 19.99],
        ['Doohickey', 3, 99.99],
        ['Gadget', 8, 49.99]
    ]
)

# Query with aggregation
result = conn.execute("""
    SELECT product, 
           SUM(quantity) as total_quantity,
           AVG(price) as avg_price,
           SUM(quantity * price) as total_revenue
    FROM sales
    GROUP BY product
    ORDER BY total_revenue DESC
""").fetchdf()

print(result)

# Use relational API for the same query
rel = conn.table('sales')
rel_result = rel.aggregate(
    'product, SUM(quantity) as total_quantity, AVG(price) as avg_price, SUM(quantity * price) as total_revenue',
    'product'
).order('total_revenue DESC')

print(rel_result.df())

Working with Cursors (PEP 249)

DuckDB supports the Python DB-API specification:
import duckdb

conn = duckdb.connect()
cursor = conn.cursor()

# Note: cursor is optional and fully redundant
# You can use execute() directly on the connection
cursor.execute("CREATE TABLE test (i INTEGER)")
cursor.execute("INSERT INTO test VALUES (?)", [42])
result = cursor.execute("SELECT * FROM test").fetchall()
print(result)

Error Handling

import duckdb

conn = duckdb.connect()

try:
    conn.execute("SELECT * FROM nonexistent_table")
except duckdb.Error as e:
    print(f"Database error: {e}")

Additional Resources