oumi.launcher.clients#

Submodules#

oumi.launcher.clients.local_client module#

class oumi.launcher.clients.local_client.LocalClient[source]#

Bases: object

A client for running jobs locally in a subprocess.

cancel(job_id) JobStatus | None[source]#

Cancels the specified job.

Parameters:

job_id – The ID of the job to cancel.

Returns:

The job status if found, None otherwise.

get_job(job_id: str) JobStatus | None[source]#

Gets the specified job’s status.

Parameters:

job_id – The ID of the job to get.

Returns:

The job status if found, None otherwise.

list_jobs() list[JobStatus][source]#

Returns a list of job statuses.

submit_job(job: JobConfig) JobStatus[source]#

Submits the specified job to be run locally.

oumi.launcher.clients.polaris_client module#

class oumi.launcher.clients.polaris_client.PolarisClient(user: str)[source]#

Bases: object

A client for communicating with Polaris at ALCF.

class SupportedQueues(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]#

Bases: Enum

Enum representing the supported queues on Polaris.

For more details, see: https://docs.alcf.anl.gov/polaris/running-jobs/#queues

DEBUG = 'debug'#
DEBUG_SCALING = 'debug-scaling'#
DEMAND = 'demand'#
PREEMPTABLE = 'preemptable'#
PROD = 'prod'#
cancel(job_id, queue: SupportedQueues) JobStatus | None[source]#

Cancels the specified job.

Parameters:
  • job_id – The ID of the job to cancel.

  • queue – The name of the queue to search.

Returns:

The job status if found, None otherwise.

static get_active_users() list[str][source]#

Gets the list of users with an open SSH tunnel to Polaris.

Returns:

A list of users.

get_job(job_id: str, queue: SupportedQueues) JobStatus | None[source]#

Gets the specified job’s status.

Parameters:
  • job_id – The ID of the job to get.

  • queue – The name of the queue to search.

Returns:

The job status if found, None otherwise.

list_jobs(queue: SupportedQueues) list[JobStatus][source]#

Returns a list of job statuses for the given queue.

Returns:

A list of job statuses, one for each job found in the given queue.

put(file_contents: str, destination: str) None[source]#

Puts the specified file contents to the remote path.

Parameters:
  • file_contents – The contents of the file to write.

  • destination – The remote path to write the file to.

put_recursive(source: str, destination: str) None[source]#

Puts the specified file/directory to the remote path using rsync.

Parameters:
  • source – The local file/directory to write.

  • destination – The remote path to write the file/directory to.

run_commands(commands: list[str]) PolarisResponse[source]#

Runs the provided commands in a single SSH command.

Parameters:

commands – The commands to run.

submit_job(job_path: str, working_dir: str, node_count: int, queue: SupportedQueues, name: str | None) str[source]#

Submits the specified job script to Polaris.

Parameters:
  • job_path – The path to the job script to submit.

  • working_dir – The working directory to submit the job from.

  • node_count – The number of nodes to use for the job.

  • queue – The name of the queue to submit the job to.

  • name – The name of the job (optional).

Returns:

The ID of the submitted job.

class oumi.launcher.clients.polaris_client.PolarisResponse(stdout: str, stderr: str, exit_code: int)[source]#

Bases: object

A response from Polaris.

exit_code: int#
stderr: str#
stdout: str#
oumi.launcher.clients.polaris_client.retry_auth(user_function)[source]#

Decorator to ensure auth is fresh before calling a function.

oumi.launcher.clients.sky_client module#

class oumi.launcher.clients.sky_client.SkyClient[source]#

Bases: object

A wrapped client for communicating with SkyPilot.

class SupportedClouds(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)[source]#

Bases: Enum

Enum representing the supported clouds.

AWS = 'aws'#
AZURE = 'azure'#
GCP = 'gcp'#
LAMBDA = 'lambda'#
RUNPOD = 'runpod'#
cancel(cluster_name: str, job_id: str) None[source]#

Cancels the specified job on the target cluster.

Parameters:
  • cluster_name – The name of the cluster to cancel the job on.

  • job_id – The ID of the job to cancel.

down(cluster_name: str) None[source]#

Tears down the target cluster.

Parameters:

cluster_name – The name of the cluster to tear down.

exec(job: JobConfig, cluster_name: str) str[source]#

Executes the specified job on the target cluster.

Parameters:
  • job – The job to execute.

  • cluster_name – The name of the cluster to execute the job on.

Returns:

The ID of the job that was created.

launch(job: JobConfig, cluster_name: str | None = None, **kwargs) JobStatus[source]#

Creates a cluster and starts the provided Job.

Parameters:
  • job – The job to execute on the cluster.

  • cluster_name – The name of the cluster to create.

  • kwargs – Additional arguments to pass to the SkyPilot client.

Returns:

A JobStatus with only id and cluster populated.

queue(cluster_name: str) list[dict][source]#

Gets the job queue of a cluster.

Parameters:

cluster_name – The name of the cluster to get the queue for.

Returns:

A list of dictionaries, each containing the metadata of a job in the cluster's queue.

status() list[dict[str, Any]][source]#

Gets a list of cluster statuses.

Returns:

A list of dictionaries, each containing the status of a cluster.

stop(cluster_name: str) None[source]#

Stops the target cluster.

Parameters:

cluster_name – The name of the cluster to stop.