This guide walks you through submitting and monitoring a GPU workload using the Python SDK.
New to containerization? Try client.run() to auto-containerize and submit your training project in one line, no Docker or Kubernetes expertise required.
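
As a rough sketch of that one-liner (the exact client.run() signature may differ; the script path and GPU arguments here are illustrative):

from chamber_sdk import ChamberClient

client = ChamberClient.from_config()
# Hypothetical arguments; check the client.run() reference for the exact signature.
job = client.run("./train.py", gpu_type="H100", requested_gpus=1)
print(f"Submitted: {job.id}")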

Prerequisites

- The chamber_sdk Python package installed in your environment.
- A Chamber API token, supplied either through a config file (used by ChamberClient.from_config()) or the CHAMBER_TOKEN environment variable.

Submit a Workload

1. Import the SDK

from chamber_sdk import ChamberClient, JobClass, JobStatus

2. Initialize the Client

client = ChamberClient.from_config()
# Or: client = ChamberClient()  # Uses CHAMBER_TOKEN env var

3. Submit a Workload

job = client.submit_job(
    name="my-training-job",
    initiative_id="your-team-id",
    gpu_type="H100",
    requested_gpus=4,
    job_class=JobClass.RESERVED,
    priority=50,
    tags={
        "model": "llama-7b",
        "dataset": "custom-v1"
    }
)

print(f"Workload ID: {job.id}")
print(f"Status: {job.status}")

4. Monitor the Workload

# Poll for updates
job = client.get_workload(job.id)
print(f"Current status: {job.status}")

# Or wait for completion
result = client.wait_for_completion(
    job.id,
    poll_interval=30,  # Check every 30 seconds
    timeout=7200       # Timeout after 2 hours
)

if result.status == JobStatus.COMPLETED:
    print("Workload completed successfully!")
else:
    print(f"Workload ended with status: {result.status}")

Complete Example

from chamber_sdk import ChamberClient, JobClass, JobStatus

def run_training():
    # Initialize
    client = ChamberClient.from_config()

    # Check capacity first
    capacity = client.get_capacity()
    print(f"Available GPU hours: {capacity.budget.available}")

    # Submit workload
    job = client.submit_job(
        name="transformer-training",
        initiative_id="team-ml",
        gpu_type="H100",
        requested_gpus=8,
        job_class=JobClass.RESERVED,
        priority=75,
        tags={
            "experiment": "transformer-v2",
            "hyperparams": "lr=1e-4,batch=32"
        }
    )
    print(f"Submitted: {job.id}")

    # Wait for completion
    result = client.wait_for_completion(job.id, timeout=3600)

    # Get metrics
    if result.status == JobStatus.COMPLETED:
        metrics = client.get_workload_metrics(job.id)
        print(f"Avg GPU utilization: {metrics.gpu_utilization.avg:.1f}%")
        print(f"Peak memory: {metrics.memory_utilization.max:.1f}%")

    return result

if __name__ == "__main__":
    run_training()

Using Templates

If your organization has workload templates configured, you can use them to simplify workload submission:

# List available templates
templates = client.list_templates(scope="ORGANIZATION")
for t in templates:
    print(f"{t.name}: {t.gpu_type} x {t.requested_gpus}")

# Submit using a template
job = client.submit_job(
    name="templated-job",
    initiative_id="team-ml",
    gpu_type="H100",
    template_id="template-abc123",
    tags={"experiment": "v1"}
)

Cancelling a Workload

# Cancel a running or pending workload
cancelled = client.cancel_workload(job.id)
print(f"Workload cancelled: {cancelled.status}")  # CANCELLED

Workload Classes

| Class | Description | Use Case |
|---|---|---|
| RESERVED | Guaranteed capacity, non-preemptible | Production training, time-sensitive workloads |
| ELASTIC | Uses idle capacity, can be preempted | Experiments, fault-tolerant workloads |
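
For example, an experiment that checkpoints regularly tolerates preemption well, so it can run as ELASTIC on idle capacity at low priority. This sketch uses only the submit_job parameters documented on this page:

# Preemptible experiment on spare capacity; checkpoint so preemption is safe.
job = client.submit_job(
    name="lr-sweep-trial-3",
    initiative_id="team-ml",
    gpu_type="A100",
    requested_gpus=2,
    job_class=JobClass.ELASTIC,
    priority=20,  # Low priority: fine to yield to RESERVED work
    tags={"experiment": "lr-sweep"},
)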

Common Parameters

| Parameter | Type | Description |
|---|---|---|
| name | str | Human-readable workload name (1-255 chars) |
| initiative_id | str | Team ID |
| gpu_type | str | GPU model (e.g., "H100", "A100") |
| requested_gpus | int/float | Number of GPUs (can be fractional) |
| job_class | JobClass | RESERVED or ELASTIC (default: RESERVED) |
| priority | int | 0-100, higher = more important |
| tags | dict | Key-value pairs for organizing workloads |
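
Because requested_gpus accepts a float, lightweight jobs can request a slice of a GPU. A sketch under the assumption that your cluster is configured for fractional allocation:

# Request half a GPU for a small evaluation job (fractional support
# depends on your cluster configuration).
job = client.submit_job(
    name="eval-small",
    initiative_id="team-ml",
    gpu_type="A100",
    requested_gpus=0.5,
    job_class=JobClass.ELASTIC,
    priority=10,
    tags={"purpose": "evaluation"},
)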

Next Steps