Resources and Runtime
This document covers the LuisaCompute runtime system, including resource management, command execution, and synchronization.
Overview
The LuisaCompute runtime provides a unified abstraction layer over various GPU APIs (CUDA, DirectX, Metal). It handles:
Resource Management: Buffers, images, volumes, and acceleration structures
Command Scheduling: Asynchronous execution with automatic dependency tracking
Memory Transfers: Efficient data movement between host and device
Synchronization: Stream and event-based coordination
Context
The Context is the entry point to the LuisaCompute runtime. It manages backend plugins and global configuration.
Creating a Context
#include <luisa/luisa-compute.h>
// From command-line arguments
int main(int argc, char* argv[]) {
Context context{argv[0]};
// ...
}
// From explicit path
Context context{"/path/to/executable"};
// With custom data directory (for caches)
Context context{"/path/to/executable", "/path/to/data"};
Querying Available Backends
Context context{argv[0]};
// List installed backends
auto backends = context.installed_backends();
for (auto& name : backends) {
std::cout << "Backend: " << name << std::endl;
}
// Query device names for a backend
auto cuda_devices = context.backend_device_names("cuda");
for (auto& name : cuda_devices) {
std::cout << "CUDA device: " << name << std::endl;
}
Device
A Device represents a physical or virtual GPU/CPU for computation.
Creating Devices
// Create specific backend
Device cuda = context.create_device("cuda");
Device dx = context.create_device("dx"); // Windows
Device metal = context.create_device("metal"); // macOS
Device cpu = context.create_device("cpu"); // Any platform
// Create with validation enabled
Device device = context.create_device("cuda", nullptr, true);
// Or via environment variable
// LUISA_ENABLE_VALIDATION=1 ./program
// Use first available backend
Device device = context.create_default_device();
Device Properties
// Query device information
std::string backend = device.backend_name(); // "cuda", "dx", etc.
void* native_handle = device.native_handle(); // CUcontext, ID3D12Device*, etc.
uint warp_size = device.compute_warp_size(); // SIMD width (e.g., 32 for CUDA)
size_t granularity = device.memory_granularity(); // Allocation alignment
// Check if device is valid
if (device) {
// Device is properly initialized
}
Backend Extensions
Devices may support backend-specific extensions:
// CUDA-specific extensions
if (auto cuda_ext = device.extension<CUDAExternalExt>()) {
// Use CUDA interop features
}
// DirectX-specific extensions
if (auto dx_ext = device.extension<DXConfigExt>()) {
// Use DirectX-specific features
}
Buffers
Buffers are linear memory regions for structured data storage.
Creating Buffers
// Simple buffer of 1000 floats
Buffer<float> float_buf = device.create_buffer<float>(1000);
// Buffer of vectors
Buffer<float4> vec4_buf = device.create_buffer<float4>(1024);
// Buffer of custom structures (must be reflected with LUISA_STRUCT)
struct Particle {
float3 position;
float3 velocity;
float mass;
};
LUISA_STRUCT(Particle, position, velocity, mass) {};
Buffer<Particle> particles = device.create_buffer<Particle>(10000);
Buffer Operations
// Upload from host
std::vector<float> host_data(1000, 1.0f);
stream << float_buf.copy_from(host_data.data());
// Download to host
std::vector<float> result(1000);
stream << float_buf.copy_to(result.data());
// Copy between buffers
Buffer<float> other_buf = device.create_buffer<float>(1000);
stream << other_buf.copy_from(float_buf);
// Fill with value (in a kernel)
Kernel1D fill = [&](BufferFloat buf, Float value) noexcept {
Var i = dispatch_id().x;
buf.write(i, value);
};
auto fill_shader = device.compile(fill);
stream << fill_shader(float_buf, 3.14f).dispatch(1000);
Sub-Buffers
// Create a view into a portion of a buffer
Buffer<float> large_buf = device.create_buffer<float>(10000);
// View elements [1000, 5999]
BufferView<float> sub = large_buf.view(1000, 5000);
// Use sub-buffer in kernels
stream << kernel(sub).dispatch(5000);
// Copy to/from sub-buffers
stream << sub.copy_from(host_data.data());
Byte Buffers
For untyped/raw memory access:
// Create byte buffer (1000 bytes)
ByteBuffer raw = device.create_byte_buffer(1000);
// Use in kernels with manual pointer arithmetic
Kernel1D process = [&](ByteBuffer raw) noexcept {
Var idx = dispatch_id().x;
// Access via buffer pointer...
};
Images
Images are 2D textures with hardware-accelerated sampling and caching.
Creating Images
// Basic RGBA8 image (1024x1024)
// Template type determines how pixels are read/written
// Storage format determines internal memory layout
Image<float> image = device.create_image<float>(
PixelStorage::BYTE4, // 4 channels, 8 bits each
1024, 1024 // Width, height
);
// Floating point format
Image<float> hdr_image = device.create_image<float>(
PixelStorage::FLOAT4, // 4 channels, 32-bit float each
1920, 1080
);
// With mipmaps
Image<float> mip_image = device.create_image<float>(
PixelStorage::BYTE4,
1024, 1024,
10 // 10 mipmap levels (1024 -> 2); use 11 for a full chain down to 1
);
// Simultaneous access (for concurrent read/write)
Image<float> shared = device.create_image<float>(
PixelStorage::BYTE4,
1024, 1024, 1, // 1 mipmap level
true // Simultaneous access
);
Pixel Storage Formats
| Storage Format | Channels | Bits per Channel | C++ Read/Write Type |
|---|---|---|---|
| `BYTE1` | 1 | 8 | `float` (normalized) |
| `BYTE2` | 2 | 8 | `float2` (normalized) |
| `BYTE4` | 4 | 8 | `float4` (normalized) |
| `HALF1` | 1 | 16 | `float` |
| `HALF2` | 2 | 16 | `float2` |
| `HALF4` | 4 | 16 | `float4` |
| `FLOAT1` | 1 | 32 | `float` |
| `FLOAT2` | 2 | 32 | `float2` |
| `FLOAT4` | 4 | 32 | `float4` |

Note: integer storage formats (e.g. `INT1`/`INT2`/`INT4` and their 8/16-bit
counterparts) are also available and are read/written as `int`/`uint` vector
types; consult the `PixelStorage` enum for the full list.
Image Operations
// Upload from host
std::vector<std::byte> pixels(width * height * 4);
// ... fill pixels ...
stream << image.copy_from(pixels.data());
// Download to host
stream << image.copy_to(pixels.data());
// Copy between images
Image<float> dest = device.create_image<float>(PixelStorage::BYTE4, 1024, 1024);
stream << dest.copy_from(image);
// Generate mipmaps
stream << image.generate_mipmaps();
Image Views
// View a specific mipmap level
ImageView<float> level0 = image.view(0); // Full resolution
ImageView<float> level1 = image.view(1); // Half resolution
// View in kernel
Kernel2D process = [&](ImageFloat image) noexcept {
Var coord = dispatch_id().xy();
Float4 color = image.read(coord);
image.write(coord, color * 0.5f);
};
stream << process(image.view(0)).dispatch(1024, 1024);
Volumes
Volumes are 3D textures, useful for volumetric data and 3D lookups.
// Create 256^3 volume
Volume<float> volume = device.create_volume<float>(
PixelStorage::BYTE4,
256, 256, 256
);
// Upload 3D data
std::vector<std::byte> voxel_data(256 * 256 * 256 * 4);
stream << volume.copy_from(voxel_data.data());
// Use in kernel
Kernel3D process = [&](VolumeFloat vol) noexcept {
Var coord = dispatch_id().xyz();
Float4 value = vol.read(coord);
vol.write(coord, value * 2.0f);
};
stream << process(volume).dispatch(256, 256, 256);
Bindless Arrays
Bindless arrays allow dynamic indexing of resources in shaders, bypassing binding limits.
Creating Bindless Arrays
// Create with 65536 slots (default)
BindlessArray bindless = device.create_bindless_array();
// Or specify slot count
BindlessArray bindless = device.create_bindless_array(1024);
// Specify allowed resource types
BindlessArray textures_only = device.create_bindless_array(
1024,
BindlessSlotType::TEXTURE_ONLY // Only images/volumes
);
Using Bindless Arrays
// Prepare resources
std::vector<Image<float>> textures;
for (int i = 0; i < 100; i++) {
textures.push_back(device.create_image<float>(PixelStorage::BYTE4, 512, 512));
}
// Bind resources to slots
luisa::vector<BindlessArrayUpdate> updates;
for (int i = 0; i < textures.size(); i++) {
updates.emplace_back(BindlessArrayUpdate{
BindlessArrayUpdate::Operation::Emplace,
i, // Slot index
textures[i].handle(), // Resource handle
Sampler::linear_linear_mirror() // Sampler
});
}
// Apply updates
stream << bindless.update(updates);
// Use in kernel
Kernel2D sample_textures = [&](BindlessArray textures, Int tex_id) noexcept {
Var coord = dispatch_id().xy();
Float2 uv = make_float2(coord) / 512.0f;
// Sample from dynamically selected texture
Float4 color = textures.sample(tex_id, uv);
// ...
};
Acceleration Structures (Ray Tracing)
For hardware-accelerated ray-scene intersection.
Meshes
// Create mesh from vertex and index buffers
Buffer<float3> vertices = device.create_buffer<float3>(num_vertices);
Buffer<Triangle> triangles = device.create_buffer<Triangle>(num_triangles);
// ... fill buffers ...
Mesh mesh = device.create_mesh(vertices, triangles);
// Build acceleration structure
stream << mesh.build();
Acceleration Structure (Accel)
// Create top-level acceleration structure
Accel accel = device.create_accel();
// Add meshes to the accel
luisa::vector<Accel::Modification> mods;
mods.push_back(Accel::Modification{
Accel::Modification::operation_set_mesh,
mesh.handle(), // Bottom-level mesh
float4x4::identity(), // Transform matrix
0, // Instance ID (visible in shader)
0xff, // Visibility mask
0, // Instance index
Accel::Modification::flag_mesh
});
// Build/update accel
stream << accel.build(mods);
// Use in ray tracing kernel
Kernel2D trace = [&](ImageFloat image, Accel accel) noexcept {
Var coord = dispatch_id().xy();
Var<Ray> ray = ...;
auto hit = accel.intersect(ray, {});
$if (!hit->miss()) {
// Process hit
};
};
Streams
Streams are command queues for asynchronous execution.
Creating Streams
// Compute stream (default)
Stream compute = device.create_stream(StreamTag::COMPUTE);
// Graphics stream (for rasterization)
Stream graphics = device.create_stream(StreamTag::GRAPHICS);
// Copy/streaming stream
Stream transfer = device.create_stream(StreamTag::COPY);
Submitting Commands
// Simple command sequence
stream << buffer.copy_from(host_data)
<< shader.dispatch(1024, 1024)
<< buffer.copy_to(result_data)
<< synchronize();
// Complex command buffer
{
auto cmd_list = stream.command_buffer();
cmd_list << buffer_a.copy_from(data_a)
<< buffer_b.copy_from(data_b)
<< compute_shader(buffer_a, buffer_b)
.dispatch(1024, 1024)
<< buffer_c.copy_from(buffer_a)
<< commit();
}
// Host callbacks
stream << shader.dispatch(1024, 1024)
<< [&]() {
// This runs on host after shader completes
std::cout << "Shader finished!" << std::endl;
}
<< synchronize();
Stream Synchronization
// Synchronize specific stream
stream.synchronize();
stream << synchronize();
// Wait for specific event
Event event = device.create_event();
stream << shader.dispatch(1024, 1024)
<< event.signal()
<< synchronize();
event.synchronize(); // Block until signaled
Events
Events provide synchronization between streams or with the host.
Basic Usage
Event event = device.create_event();
// Stream A signals
Stream stream_a = device.create_stream();
stream_a << command_a
<< event.signal()
<< synchronize();
// Stream B waits
Stream stream_b = device.create_stream();
stream_b << event.wait()
<< command_b // Executes after command_a completes
<< synchronize();
Timeline Events
For more complex synchronization patterns:
TimelineEvent timeline = device.create_timeline_event();
// Signal with value
stream_a << timeline.signal(100);
// Wait for specific value
stream_b << timeline.wait(100); // Wait until value >= 100
Memory Model and RAII
All resources follow RAII (Resource Acquisition Is Initialization):
{
// Resource created
Buffer<float> buffer = device.create_buffer<float>(1000);
Image<float> image = device.create_image<float>(PixelStorage::BYTE4, 1024, 1024);
// Use resources...
} // Resources automatically released here
Move Semantics
Resources are move-only (cannot be copied):
Buffer<float> buf1 = device.create_buffer<float>(1000);
Buffer<float> buf2 = std::move(buf1); // Valid
// Buffer<float> buf3 = buf2; // Error: copy not allowed
Resource Lifetimes
Important: Ensure resources outlive their commands:
void problematic(Stream& stream) {
Buffer<float> local = device.create_buffer<float>(1000);
stream << kernel(local).dispatch(1000);
// local destroyed here, but command may not have executed!
}
// Better
void correct(Stream& stream, Buffer<float>& buffer) {
stream << kernel(buffer).dispatch(1000);
} // Caller keeps buffer alive
Command Scheduling
LuisaCompute automatically analyzes command dependencies and optimizes execution order:
// These commands have no dependencies - may execute in parallel
stream << buf_a.copy_from(host_a) // Write to buf_a
<< buf_b.copy_from(host_b) // Write to buf_b (independent)
<< kernel(buf_a).dispatch(n) // Read buf_a
<< kernel(buf_b).dispatch(n) // Read buf_b (can run concurrently)
<< synchronize();
// Dependencies are automatically tracked
stream << buf.copy_from(data) // Write to buf
<< kernel(buf).dispatch(n) // Read buf - waits for write
<< buf.copy_to(result) // Read buf - waits for kernel
<< synchronize();
Best Practices
1. Batch Commands
// Good: Fewer, larger command batches
stream << cmd1 << cmd2 << cmd3 << cmd4 << synchronize();
// Avoid: Many small submissions
stream << cmd1 << synchronize();
stream << cmd2 << synchronize();
2. Reuse Resources
// Good: Allocate once, reuse
Buffer<float> buffer = device.create_buffer<float>(max_size);
for (int frame = 0; frame < 1000; frame++) {
stream << update_buffer(buffer)
<< process(buffer)
<< synchronize();
}
// Avoid: Reallocating each frame
for (int frame = 0; frame < 1000; frame++) {
Buffer<float> buffer = device.create_buffer<float>(size);
// ...
}
3. Use Appropriate Stream Types
// Compute work on compute stream
Stream compute = device.create_stream(StreamTag::COMPUTE);
// Transfer work on copy stream (may overlap with compute)
Stream transfer = device.create_stream(StreamTag::COPY);
transfer << upload_data(buffer);
compute << process(buffer);
// Upload and compute can overlap!
4. Minimize Host-GPU Transfers
// Good: Keep data on GPU
Buffer<float> persistent = device.create_buffer<float>(n);
for (int i = 0; i < 100; i++) {
stream << kernel(persistent).dispatch(n);
}
stream << persistent.copy_to(host);
// Avoid: Round-trips
for (int i = 0; i < 100; i++) {
stream << buffer.copy_from(host_data) // Upload
<< kernel(buffer).dispatch(n)
<< buffer.copy_to(host_data) // Download
<< synchronize();
}