Source code for oumi.utils.str_utils
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import logging
import re
from typing import Optional
[docs]
def sanitize_run_name(run_name: Optional[str]) -> Optional[str]:
    """Returns a copy of a wandb run name that satisfies the naming constraints.

    A valid run name may only contain alphanumeric characters, dashes,
    underscores, and dots, and must not exceed the maximum allowed length.
    Every other character is replaced with an underscore. Names that are still
    too long are cut and terminated with `"..."` followed by a 16-hex-digit
    SHAKE-128 digest of the original name. A warning is logged on the `"oumi"`
    logger whenever the returned name differs from the input.

    Args:
        run_name: The original raw value of run name.

    Returns:
        The sanitized run name. `None` and empty strings are returned as-is.
    """
    if not run_name:
        return run_name

    # The real limit is 128 characters. We stay at 100 because the system may
    # derive auxiliary artifact names from the run name (e.g., by prepending
    # a prefix such as "model-"), and those are bound by the same 128 limit.
    max_length = 100

    # Anything outside of the supported alphabet becomes '_'.
    sanitized = re.sub("[^a-zA-Z0-9\\_\\-\\.]", "_", run_name)

    if len(sanitized) > max_length:
        # The digest is taken over the original name, not the sanitized one.
        digest = hashlib.shake_128(run_name.encode("utf-8")).hexdigest(8)
        tail = "..." + digest
        keep = max_length - len(tail)
        sanitized = sanitized[:keep] + tail

    if sanitized == run_name:
        return sanitized

    logging.getLogger("oumi").warning(
        f"Run name '{run_name}' got sanitized to '{sanitized}'"
    )
    return sanitized
[docs]
def str_to_bool(s: str) -> bool:
    """Parses a textual boolean flag into a `bool`.

    Surrounding whitespace is ignored and matching is case-insensitive.
    The recognized spellings are:

    - True: "true", "yes", "1", "on", "t", "y"
    - False: "false", "no", "0", "off", "f", "n"

    Args:
        s: The string to convert to a boolean.

    Returns:
        bool: The boolean interpretation of the input string.

    Raises:
        ValueError: If the input string cannot be interpreted as a boolean.

    Examples:
        >>> str_to_bool("true") # doctest: +SKIP
        True
        >>> str_to_bool("FALSE") # doctest: +SKIP
        False
        >>> str_to_bool("1") # doctest: +SKIP
        True
        >>> str_to_bool("no") # doctest: +SKIP
        False
    """
    truthy_tokens = ("true", "yes", "1", "on", "t", "y")
    falsy_tokens = ("false", "no", "0", "off", "f", "n")

    normalized = s.strip().lower()
    if normalized in truthy_tokens:
        return True
    if normalized in falsy_tokens:
        return False

    # The message reports the normalized value, not the raw input.
    raise ValueError(f"Cannot convert '{normalized}' to boolean.")
[docs]
def compute_utf8_len(s: str) -> int:
    """Computes string length in UTF-8 bytes.

    Args:
        s: The string to measure.

    Returns:
        The number of bytes in the UTF-8 encoding of `s`.

    Raises:
        UnicodeEncodeError: If `s` cannot be encoded as UTF-8
            (e.g., it contains lone surrogate code points).
    """
    # Every ASCII character encodes to exactly one UTF-8 byte, so for
    # pure-ASCII text the byte length equals the character count and
    # the temporary encoded copy can be skipped entirely.
    if s.isascii():
        return len(s)
    # Non-ASCII text still allocates a temporary encoded copy of the content.
    return len(s.encode("utf-8"))