oumi.launcher.clusters#
Submodules#
oumi.launcher.clusters.frontier_cluster module#
- class oumi.launcher.clusters.frontier_cluster.FrontierCluster(name: str, client: SlurmClient)[source]#
- Bases: BaseCluster
- A cluster implementation backed by OLCF Frontier.
- class SupportedQueues(value)[source]#
- Bases: Enum
- Enum representing the supported partitions (queues) on Frontier.
- For more details, see: https://docs.olcf.ornl.gov/systems/frontier_user_guide.html#batch-partition-queue-policy
- BATCH = 'batch'#
 - EXTENDED = 'extended'#
 
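A brief, illustrative way to list the partitions this launcher supports on Frontier (plain Enum iteration; no connection to the machine is needed):

```python
# Illustrative only: enumerate the Frontier partitions known to the launcher.
from oumi.launcher.clusters.frontier_cluster import FrontierCluster

for queue in FrontierCluster.SupportedQueues:
    print(queue.name, "->", queue.value)  # BATCH -> batch, EXTENDED -> extended
```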
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(cluster_name: str, job_id: str | None = None) → TextIOBase[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- cluster_name – The name of the cluster the job was run in. 
- job_id – The ID of the job to tail the logs of. 
 
 
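A minimal sketch of tailing a job's logs with this method, assuming `cluster` is an already-constructed FrontierCluster and that BaseCluster.name() returns the cluster's registered name (both assumptions, not shown on this page):

```python
from io import TextIOBase

from oumi.launcher.clusters.frontier_cluster import FrontierCluster

def tail_job_logs(cluster: FrontierCluster, job_id: str) -> None:
    """Print log lines for `job_id` as the stream produces them (sketch only)."""
    stream: TextIOBase = cluster.get_logs_stream(
        cluster_name=cluster.name(),  # assumed: BaseCluster.name() returns this cluster's name
        job_id=job_id,
    )
    for line in stream:
        print(line, end="")
```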
- run_job(job: JobConfig) → JobStatus[source]#
- Runs the specified job on this cluster.
- For Frontier this method consists of 5 parts:
- Copy the working directory to /lustre/orion/lrn081/scratch/$USER/oumi_launcher/$JOB_NAME. 
- Check if there is a conda installation at /lustre/orion/lrn081/scratch/$USER/miniconda3/envs/oumi. If not, install it. 
- Copy all file mounts. 
- Create a job script with all env vars, setup, and run commands. 
- CD into the working directory and submit the job. 
 - Parameters:
- job – The job to run. 
- Returns:
- The job status. 
- Return type:
- JobStatus
 
 
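A hedged sketch of submitting and waiting on a Frontier job; the JobConfig import path and the `id`/`done` fields on JobStatus are assumptions to verify against your oumi version:

```python
import time

from oumi.core.configs import JobConfig  # assumed import path for JobConfig
from oumi.launcher.clusters.frontier_cluster import FrontierCluster

def submit_and_wait(cluster: FrontierCluster, job: JobConfig, poll_secs: int = 60):
    """Submit `job`, then poll Slurm until it is no longer running (sketch only)."""
    status = cluster.run_job(job)  # copies the working dir, sets up conda, submits the script
    while status is not None and not status.done:  # `id`/`done` fields assumed on JobStatus
        time.sleep(poll_secs)
        status = cluster.get_job(status.id)
    return status
```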
oumi.launcher.clusters.local_cluster module#
- class oumi.launcher.clusters.local_cluster.LocalCluster(name: str, client: LocalClient)[source]#
- Bases: BaseCluster
- A cluster implementation for running jobs locally.
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(cluster_name: str, job_id: str | None = None) → TextIOBase[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- cluster_name – The name of the cluster the job was run in. 
- job_id – The ID of the job to tail the logs of. 
 
 
 
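A small sketch of checking on a locally launched job; the `done` field on JobStatus is an assumption:

```python
from oumi.launcher.clusters.local_cluster import LocalCluster

def report_local_job(cluster: LocalCluster, job_id: str) -> None:
    """Print whether the local cluster still tracks `job_id` (sketch only)."""
    status = cluster.get_job(job_id)
    if status is None:
        print(f"No job with id {job_id} was found on this cluster.")
    else:
        print(f"Job {job_id}: done={status.done}")  # `done` field assumed on JobStatus
```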
oumi.launcher.clusters.perlmutter_cluster module#
- class oumi.launcher.clusters.perlmutter_cluster.PerlmutterCluster(name: str, client: SlurmClient)[source]#
- Bases: BaseCluster
- A cluster implementation backed by NERSC Perlmutter.
- class SupportedQueues(value)[source]#
- Bases: Enum
- Enum representing the supported queues on Perlmutter.
- Unlike most other research clusters, Perlmutter calls queues quality of service (QoS). We use the term queue for consistency with other clusters. For more details, see: https://docs.nersc.gov/jobs/policy/#perlmutter-gpu.
- DEBUG = 'debug'#
 - DEBUG_PREEMPT = 'debug_preempt'#
 - INTERACTIVE = 'interactive'#
 - JUPYTER = 'jupyter'#
 - OVERRUN = 'overrun'#
 - PREEMPT = 'preempt'#
 - PREMIUM = 'premium'#
 - REALTIME = 'realtime'#
 - REGULAR = 'regular'#
 - SHARED = 'shared'#
 - SHARED_INTERACTIVE = 'shared_interactive'#
 - SHARED_OVERRUN = 'shared_overrun'#
 
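Since this is a standard Enum, a user-supplied QoS string can be resolved to a member by value; an illustrative snippet:

```python
# Illustrative only: map a QoS string onto the corresponding enum member.
from oumi.launcher.clusters.perlmutter_cluster import PerlmutterCluster

qos = PerlmutterCluster.SupportedQueues("regular")
print(qos)        # SupportedQueues.REGULAR
print(qos.value)  # regular
```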
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(job_id: str, cluster_name: str) → TextIOBase[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- job_id – The ID of the job to tail the logs of. 
- cluster_name – The name of the cluster the job was run in. 
 
 
- run_job(job: JobConfig) → JobStatus[source]#
- Runs the specified job on this cluster.
- For Perlmutter this method consists of 5 parts:
- Copy the working directory to remote's $HOME/oumi_launcher/$JOB_NAME. 
- Check if there is a conda installation. If not, install it. 
- Copy all file mounts. 
- Create a job script with all env vars, setup, and run commands. 
- CD into the working directory and submit the job. 
 - Parameters:
- job – The job to run. 
- Returns:
- The job status. 
- Return type:
- JobStatus
 
 
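A hedged sketch combining run_job with get_logs_stream; note that on Perlmutter the log-stream method takes job_id first. The JobConfig import path, the `id` field on JobStatus, and BaseCluster.name() are assumptions:

```python
from oumi.core.configs import JobConfig  # assumed import path for JobConfig
from oumi.launcher.clusters.perlmutter_cluster import PerlmutterCluster

def submit_and_tail(cluster: PerlmutterCluster, job: JobConfig) -> None:
    """Submit `job` and stream its logs to stdout (sketch only)."""
    status = cluster.run_job(job)
    # Unlike the other clusters on this page, Perlmutter's get_logs_stream
    # takes job_id first, then cluster_name.
    stream = cluster.get_logs_stream(status.id, cluster.name())  # `id` field assumed on JobStatus
    for line in stream:
        print(line, end="")
```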
oumi.launcher.clusters.polaris_cluster module#
- class oumi.launcher.clusters.polaris_cluster.PolarisCluster(name: str, client: PolarisClient)[source]#
- Bases: BaseCluster
- A cluster implementation backed by Polaris.
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(cluster_name: str, job_id: str | None = None) → TextIOBase[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- cluster_name – The name of the cluster the job was run in. 
- job_id – The ID of the job to tail the logs of. 
 
 
- run_job(job: JobConfig) → JobStatus[source]#
- Runs the specified job on this cluster.
- For Polaris this method consists of 5 parts:
- Copy the working directory to /home/$USER/oumi_launcher/<submission_time>. 
- Check if there is a conda installation at /home/$USER/miniconda3/envs/oumi. If not, install it. 
- Copy all file mounts. 
- Create a job script with all env vars, setup, and run commands. 
- CD into the working directory and submit the job. 
 - Parameters:
- job – The job to run. 
- Returns:
- The job status. 
- Return type:
- JobStatus
 
 
oumi.launcher.clusters.sky_cluster module#
- class oumi.launcher.clusters.sky_cluster.SkyCluster(name: str, client: SkyClient)[source]#
- Bases: BaseCluster
- A cluster implementation backed by SkyPilot.
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(cluster_name: str, job_id: str | None = None) → SkyLogStream[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- cluster_name – The name of the cluster the job was run in. 
- job_id – The ID of the job to tail the logs of. 
 
 
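Every cluster on this page lists BaseCluster as its base, so status-checking code can stay cluster-agnostic. A minimal sketch, assuming BaseCluster is importable from oumi.core.launcher and JobStatus exposes a `done` field:

```python
from oumi.core.launcher import BaseCluster  # assumed import path for BaseCluster

def is_finished(cluster: BaseCluster, job_id: str) -> bool:
    """Return True only if the cluster still knows the job and reports it done (sketch only)."""
    status = cluster.get_job(job_id)
    return status is not None and status.done  # `done` field assumed on JobStatus
```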
 
oumi.launcher.clusters.slurm_cluster module#
- class oumi.launcher.clusters.slurm_cluster.SlurmCluster(name: str, client: SlurmClient)[source]#
- Bases: BaseCluster
- A cluster implementation backed by a Slurm scheduler.
- class ConnectionInfo(hostname: str, user: str)[source]#
- Bases: object
- Dataclass to hold information about a connection.
- hostname: str#
 - property name#
- Gets the name of the connection in the form user@hostname. 
 - user: str#
 
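An illustrative use of the ConnectionInfo dataclass and its name property (the hostname and user below are placeholders):

```python
from oumi.launcher.clusters.slurm_cluster import SlurmCluster

# Placeholder values for illustration only.
conn = SlurmCluster.ConnectionInfo(hostname="login.example.org", user="alice")
print(conn.name)  # alice@login.example.org
```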
- get_job(job_id: str) → JobStatus | None[source]#
- Gets the job on this cluster if it exists, else returns None.
- get_logs_stream(cluster_name: str, job_id: str | None = None) → SlurmLogStream[source]#
- Gets a stream that tails the logs of the target job.
- Parameters:
- cluster_name – The name of the cluster the job was run in. 
- job_id – The ID of the job to tail the logs of. 
 
- Returns:
- A SlurmLogStream object that can be used to read the logs. 
 
- static get_slurm_connections() → list[ConnectionInfo][source]#
- Gets Slurm connections from the OUMI_SLURM_CONNECTIONS env variable. 
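A hedged sketch of supplying connections via the environment; the comma-separated user@hostname format shown here is an assumption to verify against your oumi version:

```python
import os

from oumi.launcher.clusters.slurm_cluster import SlurmCluster

# Assumed format: a comma-separated list of user@hostname entries.
os.environ["OUMI_SLURM_CONNECTIONS"] = "alice@login1.example.org,alice@login2.example.org"

for conn in SlurmCluster.get_slurm_connections():
    print(conn.name)  # e.g. alice@login1.example.org
```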
- static parse_cluster_name(name: str) → ConnectionInfo[source]#
- Parses the cluster name into hostname and user components.
- Parameters:
- name – The name of the cluster. 
- Returns:
- The parsed cluster information. 
- Return type:
- ConnectionInfo 
 
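An illustrative call, assuming cluster names follow the user@hostname convention used by ConnectionInfo.name above:

```python
from oumi.launcher.clusters.slurm_cluster import SlurmCluster

info = SlurmCluster.parse_cluster_name("alice@login1.example.org")  # placeholder name
print(info.user)      # alice
print(info.hostname)  # login1.example.org
```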
- run_job(job: JobConfig) → JobStatus[source]#
- Runs the specified job on this cluster.
- For Slurm this method consists of 4 parts:
- Copy the working directory to ~/oumi_launcher/<submission_time>. 
- Copy all file mounts. 
- Create a job script with all env vars, setup, and run commands. 
- CD into the working directory and submit the job. 
 - Parameters:
- job – The job to run. 
- Returns:
- The job status. 
- Return type:
- JobStatus
 
 
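A hedged sketch of a JobConfig that this method could submit; the field names are assumed from oumi.core.configs.JobConfig and may need adjusting for your oumi version:

```python
from oumi.core.configs import JobConfig  # assumed import path for JobConfig

# Hypothetical minimal job; field names assumed, not taken from this page.
job = JobConfig(
    name="hello-slurm",
    working_dir=".",               # copied to ~/oumi_launcher/<submission_time> on submission
    setup="echo 'setup step'",     # runs before the job's run command
    run="echo 'hello from Slurm'",
)
# status = cluster.run_job(job)    # `cluster` is an existing SlurmCluster (assumed)
```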
