Velar Documentation
Write a Python function, call .deploy(), and it runs on a real GPU. No Docker, no Kubernetes, no YAML.
Installation
Install the Velar SDK from PyPI. Requires Python 3.9+.
pip install velar-sdk
Quickstart
Three steps: authenticate, define a function, deploy.
1. Authenticate
Run the login command — it opens your browser and saves your API key to ~/.velar/token.
velar login
2. Write a function
Decorate any Python function with @app.function() and specify the GPU you want. Use @app.local_entrypoint() to define what runs locally after deploy.
import velar
app = velar.App("hello-gpu")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("transformers", "accelerate")
@app.function(gpu="A100", image=image)
def hello(name: str) -> str:
return f"Hello {name}, running on a real A100!"
@app.local_entrypoint()
def main():
app.deploy(wait=True)
result = hello.remote("world")
print(result)  # Hello world, running on a real A100!
3. Run
velar run hello:app
Velar builds the image remotely — no Docker required on your machine. First build typically takes 1–3 minutes. Subsequent deploys with no code changes skip the build entirely (cache hit). Serverless deployments are cancelled automatically when the entrypoint finishes.
GPU Functions
@app.function() defines a serverless GPU function. The pod spins up on demand and terminates when the job finishes. You pay only for the seconds it runs.
@app.function(
gpu="A100", # L4 | RTX4090 | L40S | A100 | H100 | H100-SXM | H200
image=image,
timeout=600, # seconds before auto-terminate (default: 600)
gpu_count=1, # number of GPUs (default: 1)
)
def my_function(x: int) -> int:
return x * 2
@app.local_entrypoint()
def main():
app.deploy(wait=True)
result = my_function.remote(21) # → 42
print(result)
Supported GPUs: L4 ($0.66/hr), RTX 4090 ($1.00/hr), L40S ($1.46/hr), A100 ($2.36/hr), H100 PCIe ($4.06/hr), H100 SXM ($4.57/hr), H200 ($6.10/hr). All billed per second.
Endpoints
@app.endpoint() deploys a persistent pod with a stable URL. The pod stays alive between requests — no cold start. Use this for inference APIs that are called repeatedly. Optionally register a stable slug (requires Pro or Business plan).
import velar
app = velar.App("inference-api")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("transformers", "accelerate")
@app.endpoint(
gpu="L4",
image=image,
slug="my-model", # stable URL: invoke.velar.run/{user_id}/my-model
keep_warm=True, # pod stays running between requests
)
def predict(payload: dict) -> dict:
import os
from transformers import pipeline
pipe = pipeline("text-generation", model="gpt2")
return {"text": pipe(payload["prompt"], max_length=100)[0]["generated_text"]}
@app.local_entrypoint()
def main():
app.deploy(wait=True)
result = predict.remote({"prompt": "The future of AI is"})
print(result)
Images & Dependencies
Start from any public Docker image and layer on your dependencies. Velar generates the Dockerfile and builds it for linux/amd64 automatically.
# Start from a CUDA base image and layer on your dependencies
image = (
    velar.Image.from_registry(
        "pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
    )
    # Install Python packages
    .pip_install("transformers", "accelerate", "bitsandbytes", "peft")
    # Run arbitrary shell commands
    .run_commands("apt-get update && apt-get install -y ffmpeg")
    # Set environment variables baked into the image
    .env(HF_HOME="/data/hf_cache", TOKENIZERS_PARALLELISM="false")
)
Velar uses content-addressed image tags (SHA-256 of your Dockerfile + handler). If neither changes between deploys, the build step is skipped entirely.
Secrets & Env Vars
Pass secrets as a dict — they are stored encrypted and injected as environment variables into the container at runtime.
import os
import velar
app = velar.App("my-app")
@app.function(
gpu="A100",
image=image,
secrets={
"HF_TOKEN": os.environ["HF_TOKEN"],
"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"],
},
)
def load_model():
import os
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
token=os.environ["HF_TOKEN"],
)
return "loaded"
Always load secrets from environment variables — os.environ["KEY"] — instead of hardcoding values. This keeps credentials out of your source code and git history.
Volumes
Mount a persistent disk volume at /data. Useful for caching model weights between runs.
@app.function(
gpu="A100",
image=image,
volume_size_gb=50, # mounts a 50 GB volume at /data
secrets={"HF_TOKEN": os.environ["HF_TOKEN"]},  # load from env — never hardcode tokens
)
def download_and_cache_model():
import os
from transformers import AutoModelForCausalLM
cache_dir = "/data/models/llama2-7b"
if not os.path.exists(cache_dir):
AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-2-7b-hf",
cache_dir=cache_dir,
token=os.environ["HF_TOKEN"],
)
return "ready"
Multi-GPU
Request multiple GPUs with gpu_count. Cost scales linearly — 4× A100s = 4× the per-second rate.
@app.function(
gpu="H100",
gpu_count=4, # 4× H100, e.g. for 70B model sharding
image=image,
timeout=3600,
)
def train_large_model(config: dict):
import torch
print(f"GPUs available: {torch.cuda.device_count()}") # → 4
# ... your training loop
return {"status": "done"}
deploy()
app.deploy() builds and deploys all functions and endpoints defined in the app. Returns a dict of Deployment objects keyed by function name.
# Non-blocking — returns immediately
deployments = app.deploy(wait=False)
# Blocking — waits until the pod is running (shows a live spinner)
deployments = app.deploy(wait=True)
# Access individual deployments
d = deployments["my_function"]
print(d.status())  # "running" | "pending" | "completed" | "failed"
Deployment API
Each Deployment object exposes these methods.
d = deployments["my_function"]
# Check current status
d.status() # → "running"
# Returns stdout lines captured during the last .remote() call
lines = d.logs() # → ["line 1", "line 2", ...]
# Block until the deployment reaches running state
d.wait(timeout=300)
# Terminate the pod and reconcile billing
d.cancel()
End-to-end examples
Copy-paste ready. Each example is a complete, runnable script.
LLM Inference (Llama 3)
Deploy Llama 3 8B with transformers and call it remotely. Requires a Hugging Face token with access to the model.
import os
import velar
app = velar.App("llm-inference")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("transformers", "accelerate", "bitsandbytes")
@app.function(
gpu="A100",
image=image,
timeout=1800, # 30 min — first run downloads ~16GB from HuggingFace
secrets={"HF_TOKEN": os.environ["HF_TOKEN"]},
)
def generate(prompt: str, max_tokens: int = 200) -> str:
import os, torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ["HF_TOKEN"])
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
token=os.environ["HF_TOKEN"],
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
with torch.no_grad():
output = model.generate(**inputs, max_new_tokens=max_tokens)
return tokenizer.decode(output[0], skip_special_tokens=True)
@app.local_entrypoint()
def main():
app.deploy(wait=True)
answer = generate.remote("Explain transformers in one paragraph.")
print(answer)
Image Generation (Stable Diffusion)
Generate images with SDXL and return them as base64-encoded PNG.
import velar
app = velar.App("stable-diffusion")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("diffusers", "transformers", "accelerate")
@app.function(gpu="L40S", image=image, timeout=600)
def generate_image(prompt: str, steps: int = 30) -> str:
import io, base64, torch
from diffusers import StableDiffusionXLPipeline
pipe = StableDiffusionXLPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
torch_dtype=torch.float16,
).to("cuda")
img = pipe(prompt, num_inference_steps=steps).images[0]
buffer = io.BytesIO()
img.save(buffer, format="PNG")
return base64.b64encode(buffer.getvalue()).decode()
@app.local_entrypoint()
def main():
import base64
app.deploy(wait=True)
b64 = generate_image.remote("a futuristic city at sunset, cinematic")
with open("output.png", "wb") as f:
f.write(base64.b64decode(b64))
print("Saved output.png")
Fine-tuning with LoRA
Fine-tune a model with PEFT/LoRA and save the adapter to a mounted volume.
import os
import velar
app = velar.App("lora-finetune")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("transformers", "peft", "trl", "datasets", "accelerate")
@app.function(
gpu="A100",
image=image,
timeout=7200, # 2h max
volume_size_gb=50, # save adapter at /data/adapter
secrets={"HF_TOKEN": os.environ["HF_TOKEN"]},
)
def finetune(dataset_name: str, output_dir: str = "/data/adapter") -> str:
import os, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from datasets import load_dataset
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ["HF_TOKEN"])
model = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype=torch.bfloat16, device_map="auto",
token=os.environ["HF_TOKEN"],
)
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"])
model = get_peft_model(model, lora_config)
dataset = load_dataset(dataset_name, split="train[:1000]")
trainer = SFTTrainer(
model=model,
train_dataset=dataset,
args=TrainingArguments(output_dir=output_dir, num_train_epochs=1,
    per_device_train_batch_size=4, bf16=True),  # bf16 matches the bfloat16 model weights above
)
trainer.train()
model.save_pretrained(output_dir)
return f"Adapter saved to {output_dir}"
@app.local_entrypoint()
def main():
app.deploy(wait=True)
result = finetune.remote("yahma/alpaca-cleaned")
print(result)
Batch Processing
Use .map() to process a list of inputs in parallel — each item runs on a separate GPU invocation concurrently.
import velar
app = velar.App("batch-embeddings")
image = velar.Image.from_registry(
"pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime"
).pip_install("sentence-transformers")
@app.function(gpu="L4", image=image)
def embed(text: str) -> list[float]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
return model.encode(text).tolist()
@app.local_entrypoint()
def main():
app.deploy(wait=True)
texts = [
"The quick brown fox jumps over the lazy dog.",
"Serverless GPU inference is fast and cheap.",
"Transformers changed natural language processing forever.",
]
# All 3 run in parallel on separate GPU pods
results = list(embed.map(texts))
print(f"Generated {len(results)} embeddings, dim={len(results[0])}")