oumi.launcher#
Launcher module for the Oumi (Open Universal Machine Intelligence) library.
This module provides functionality for launching and managing jobs across various cloud platforms.
Example
>>> from oumi.launcher import Launcher, JobConfig, JobResources
>>> launcher = Launcher()
>>> job_resources = JobResources(cloud="local")
>>> job_config = JobConfig(
... name="my_job", resources=job_resources, run="python train.py"
... )
>>> cluster, job_status = launcher.up(job_config, cluster_name="my_cluster")
Note
This module integrates with various cloud platforms. Ensure that the necessary credentials and configurations are set up for the cloud platform you intend to use.
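Building on the example above, a minimal sketch of checking job status and tearing the cluster down afterwards; it only uses the APIs documented below, and the filter values are illustrative:
>>> # Query status across clouds, filtered to the cluster created above.
>>> statuses = launcher.status(cluster="my_cluster")
>>> # Once the job has finished, turn the local cluster down via the module-level helper.
>>> from oumi.launcher import down
>>> down(cloud_name="local", cluster_name="my_cluster")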
- class oumi.launcher.JobConfig(name: str | None = None, user: str | None = None, working_dir: str = '???', num_nodes: int = 1, resources: JobResources = <factory>, envs: dict[str, str] = <factory>, file_mounts: dict[str, str] = <factory>, storage_mounts: dict[str, StorageMount] = <factory>, setup: str | None = None, run: str = '???')[source]#
Bases:
BaseConfig
Configuration for launching jobs on a cluster.
- envs: dict[str, str]#
The environment variables to set on the node.
- file_mounts: dict[str, str]#
File mounts to attach to the node.
For mounting (copying) local directories, the key is the file path on the remote and the value is the local path. The keys of file_mounts cannot be shared with storage_mounts.
- name: str | None = None#
Job name (optional). Only used for display purposes.
- num_nodes: int = 1#
The number of nodes to use for the job. Defaults to 1.
- resources: JobResources#
The resources required for each node in the job.
- run: str = '???'#
The script to run on every node. Required. Runs after setup.
- setup: str | None = None#
The setup script to run on every node. Optional.
setup will always be executed before run. In sky-based clouds, setup is executed only once upon cluster creation, not once per job.
e.g., pip install -r requirements.txt
- storage_mounts: dict[str, StorageMount]#
Storage system mounts to attach to the node.
For mounting remote storage solutions, the key is the file path on the remote and the value is a StorageMount. The keys of storage_mounts cannot be shared with file_mounts.
- user: str | None = None#
The user that the job will run as (optional). Required only for Polaris.
- working_dir: str = '???'#
The local directory containing the scripts required to execute this job.
This directory will be copied to the remote node before the job is executed.
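For illustration, a sketch of a more fully specified JobConfig using the fields above; the working directory, environment variable, mount paths, and package requirements are placeholders, not values prescribed by the library:
>>> from oumi.launcher import JobConfig, JobResources
>>> job_config = JobConfig(
...     name="finetune",
...     working_dir=".",                          # copied to the remote node before the job runs
...     num_nodes=1,
...     resources=JobResources(cloud="local"),
...     envs={"WANDB_PROJECT": "my-project"},     # environment variables set on the node
...     file_mounts={"~/.netrc": "~/.netrc"},     # key: remote path, value: local path
...     setup="pip install -r requirements.txt",  # always executed before `run`
...     run="python train.py",                    # required; the script run on every node
... )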
- class oumi.launcher.JobResources(cloud: str = '???', region: str | None = None, zone: str | None = None, accelerators: str | None = None, cpus: str | None = None, memory: str | None = None, instance_type: str | None = None, use_spot: bool = False, disk_size: int | None = None, disk_tier: str | None = 'medium')[source]#
Bases:
object
Resources required for a single node in a job.
- accelerators: str | None = None#
Accelerator type (optional). Supported values vary by environment.
For GCP you may specify the accelerator name and count, e.g. “V100:4”.
- cloud: str = '???'#
The cloud used to run the job (required).
- Options:
aws: Amazon Web Services
azure: Microsoft Azure
gcp: Google Cloud Platform
lambda: Lambda Cloud
local: The local machine launching the job
polaris: The Polaris cluster at Argonne National Laboratory
runpod: RunPod
- cpus: str | None = None#
Number of vCPUs to use per node (optional).
Sky-based clouds support strings with modifiers, e.g. “2+” to indicate at least 2 vCPUs.
- disk_size: int | None = None#
Disk size in GiB to allocate for the OS disk (mounted at /) (optional).
Ignored by Polaris.
- disk_tier: str | None = 'medium'#
Disk tier to use for OS (optional).
For sky-based clouds, this can be one of ‘low’, ‘medium’, ‘high’, or ‘best’ (default: ‘medium’).
- instance_type: str | None = None#
Instance type to use (optional).
Supported values vary by environment. The instance type is automatically inferred if accelerators is specified.
- memory: str | None = None#
Memory to allocate per node in GiB (optional).
Sky-based clouds support strings with modifiers, e.g. “256+” to indicate at least 256 GiB.
- region: str | None = None#
The region to use (optional). Supported values vary by environment.
- use_spot: bool = False#
Whether the cluster should use spot instances (optional).
If unspecified, defaults to False (on-demand instances).
- zone: str | None = None#
The zone to use (optional). Supported values vary by environment.
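As a sketch, a GPU node specification for a sky-based cloud built from the fields above; the region, accelerator, and sizes are illustrative values, not recommendations:
>>> from oumi.launcher import JobResources
>>> resources = JobResources(
...     cloud="gcp",
...     region="us-central1",   # optional; supported values vary by environment
...     accelerators="V100:4",  # accelerator name and count
...     cpus="8+",              # at least 8 vCPUs per node
...     memory="64+",           # at least 64 GiB per node
...     use_spot=True,          # spot instances instead of on-demand
...     disk_size=200,          # OS disk size in GiB
... )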
- class oumi.launcher.Launcher[source]#
Bases:
object
A class for managing the lifecycle of jobs on different clouds.
- cancel(job_id: str, cloud_name: str, cluster_name: str) JobStatus [source]#
Cancels the specified job.
- get_cloud(job_or_cloud: JobConfig | str) BaseCloud [source]#
Gets the cloud instance for the specified job.
- run(job: JobConfig, cluster_name: str) JobStatus [source]#
Runs the specified job on the specified cluster.
- Parameters:
job – The job configuration.
cluster_name – The name of the cluster to run the job on.
- Returns:
The status of the job.
- Return type:
Optional[JobStatus]
- status(cloud: str | None = None, cluster: str | None = None, id: str | None = None) dict[str, list[JobStatus]] [source]#
Gets the status of all jobs across all clusters.
- Parameters:
cloud – If specified, filters all jobs to only those on the specified cloud.
cluster – If specified, filters all jobs to only those on the specified cluster.
id – If specified, filters all jobs to only those with the specified ID.
- Returns:
The status of all jobs, indexed by cloud name.
- Return type:
dict[str, list[JobStatus]]
- up(job: JobConfig, cluster_name: str | None, **kwargs) tuple[BaseCluster, JobStatus] [source]#
Creates a new cluster and starts the specified job on it.
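A sketch of a typical Launcher lifecycle using the methods above; the JobStatus.id attribute used for cancellation is an assumption about JobStatus, which is documented elsewhere:
>>> from oumi.launcher import JobConfig, JobResources, Launcher
>>> launcher = Launcher()
>>> job_config = JobConfig(
...     name="my_job", resources=JobResources(cloud="local"), run="python train.py"
... )
>>> cluster, job_status = launcher.up(job_config, cluster_name="my_cluster")
>>> statuses = launcher.status(cluster="my_cluster")  # dict of cloud name -> list of JobStatus
>>> # Cancel the job started above (assumes JobStatus exposes an `id` field).
>>> launcher.cancel(job_status.id, cloud_name="local", cluster_name="my_cluster")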
- class oumi.launcher.StorageMount(source: str = '???', store: str = '???')[source]#
Bases:
object
A storage system mount to attach to a node.
- source: str = '???'#
The remote storage path to mount (Required).
e.g. ‘gs://bucket/path’ for GCS, ‘s3://bucket/path’ for S3, or ‘r2://path’ for R2.
- store: str = '???'#
The remote storage solution (Required).
Must be one of ‘s3’, ‘gcs’ or ‘r2’.
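A sketch of attaching remote storage to a job via storage_mounts; the bucket path and mount point are placeholders:
>>> from oumi.launcher import JobConfig, JobResources, StorageMount
>>> job_config = JobConfig(
...     name="gcs_job",
...     resources=JobResources(cloud="gcp"),
...     storage_mounts={
...         # Key: path on the node; value: the storage system mounted there.
...         "/data": StorageMount(source="gs://my-bucket/datasets", store="gcs"),
...     },
...     run="python train.py --data-dir /data",
... )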
- oumi.launcher.cancel(job_id: str, cloud_name: str, cluster_name: str) JobStatus #
Cancels the specified job.
- oumi.launcher.down(cloud_name: str, cluster_name: str) None #
Turns down the specified cluster.
- oumi.launcher.get_cloud(job_or_cloud: JobConfig | str) BaseCloud #
Gets the cloud instance for the specified job.
- oumi.launcher.run(job: JobConfig, cluster_name: str) JobStatus #
Runs the specified job on the specified cluster.
- Parameters:
job – The job configuration.
cluster_name – The name of the cluster to run the job on.
- Returns:
The status of the job.
- Return type:
Optional[JobStatus]
- oumi.launcher.status(cloud: str | None = None, cluster: str | None = None, id: str | None = None) dict[str, list[JobStatus]] #
Gets the status of all jobs across all clusters.
- Parameters:
cloud – If specified, filters all jobs to only those on the specified cloud.
cluster – If specified, filters all jobs to only those on the specified cluster.
id – If specified, filters all jobs to only those with the specified ID.
- Returns:
The status of all jobs, indexed by cloud name.
- Return type:
dict[str, list[JobStatus]]
- oumi.launcher.stop(cloud_name: str, cluster_name: str) None #
Stops the specified cluster.
- oumi.launcher.up(job: JobConfig, cluster_name: str | None, **kwargs) tuple[BaseCluster, JobStatus] #
Creates a new cluster and starts the specified job on it.
- oumi.launcher.which_clouds() list[str] #
Gets the names of all clouds in the registry.
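For reference, a sketch using the module-level helpers above to discover registered clouds, run a job on an existing cluster, and stop that cluster; the cluster name and script are placeholders:
>>> from oumi.launcher import JobConfig, JobResources, run, stop, which_clouds
>>> print(which_clouds())   # names of all registered clouds
>>> job = JobConfig(
...     name="eval", resources=JobResources(cloud="local"), run="python eval.py"
... )
>>> # Run on a cluster previously created with `up`, then stop that cluster.
>>> job_status = run(job, cluster_name="my_cluster")
>>> stop(cloud_name="local", cluster_name="my_cluster")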