# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import pandas as pd
# Suffix appended to the file paths handed to `save_infer_prob` / `load_infer_prob`;
# callers pass the path WITHOUT this extension.
PARQUET_EXTENSION = ".parquet"
[docs]
def save_infer_prob(output_filepath: str, probabilities: list[list[list[float]]]):
    """Write batched probabilities to a parquet file.

    Args:
        output_filepath: Destination path WITHOUT the extension;
            `PARQUET_EXTENSION` is appended to it.
        probabilities: Nested lists of probabilities, handed as-is to the
            `pd.DataFrame` constructor.
    """
    parquet_path = f"{output_filepath}{PARQUET_EXTENSION}"
    pd.DataFrame(probabilities).to_parquet(parquet_path)
[docs]
def load_infer_prob(input_filepath: str) -> list[list[list[float]]]:
    """Retrieve batched probabilities from a parquet file.

    Args:
        input_filepath: Path of the file to read, WITHOUT the extension;
            `PARQUET_EXTENSION` is appended before reading.

    Returns:
        Nested lists: one list per DataFrame row (batch), one list per non-null
        cell of that row (entry), each holding that entry's probabilities.

    Raises:
        ValueError: If the entries do not all hold the same number of
            probabilities.
    """
    parquet_path = f"{input_filepath}{PARQUET_EXTENSION}"
    # `None` means "no entry seen yet". An explicit `is None` check is used instead
    # of `or`, so a first entry of length 0 is still remembered as the reference.
    expected_probs_count = None

    def is_missing(cell) -> bool:
        """Return True for a null cell (`None` or a float NaN)."""
        return cell is None or (isinstance(cell, float) and cell != cell)

    def to_list(probs):
        """Ensure number of probabilities is the same for all entries."""
        nonlocal expected_probs_count
        probs_list = list(probs)
        if expected_probs_count is None:
            expected_probs_count = len(probs_list)
        if expected_probs_count != len(probs_list):
            raise ValueError(
                f"Reading `{parquet_path}`: inconsistent number of "
                f"probs across entries: len({probs_list})!={expected_probs_count}"
            )
        return probs_list

    df_probs = pd.read_parquet(parquet_path)
    rows = df_probs.to_numpy().tolist()
    # A DataFrame built from batches of unequal size (e.g. a smaller final batch)
    # pads the shorter rows with null cells; those are skipped rather than passed
    # to `list(...)`, which would fail on them.
    return [[to_list(cell) for cell in row if not is_missing(cell)] for row in rows]
# The inference probabilities (`probabilities`) are structured as follows:
# (the example below assumes 4 batches of batch_size=2 and, for each of these,
# 4 probabilities corresponding to the multiple choices A, B, C, D)
#
# [
# [ <-- batch no 0:
# [p_0_0_A, p_0_0_B, p_0_0_C, p_0_0_D], <-- batch index = 0
# [p_0_1_A, p_0_1_B, p_0_1_C, p_0_1_D], <-- batch index = 1
# ],
# [ <-- batch no 1:
# [p_1_0_A, p_1_0_B, p_1_0_C, p_1_0_D], <-- batch index = 0
# [p_1_1_A, p_1_1_B, p_1_1_C, p_1_1_D], <-- batch index = 1
# ],
# [ <-- batch no 2:
# [p_2_0_A, p_2_0_B, p_2_0_C, p_2_0_D], <-- batch index = 0
# [p_2_1_A, p_2_1_B, p_2_1_C, p_2_1_D], <-- batch index = 1
# ],
# [ <-- batch no 3:
# [p_3_0_A, p_3_0_B, p_3_0_C, p_3_0_D], <-- batch index = 0
# [p_3_1_A, p_3_1_B, p_3_1_C, p_3_1_D], <-- batch index = 1
# ]
# ]
#
# We save these into a .csv file of the following format:
# - Every row corresponds to a batch.
# - Within each row, the batch items are strings separated by comma (,).
# - Each item (string) contains a list of probabilities (floats).
#
# batch index = 0 batch index = 1 batch no
# <--------------------------------> , <--------------------------------> |
# "[p_0_0_A, p_0_0_B, p_0_0_C, p_0_0_D]","[p_0_1_A, p_0_1_B, p_0_1_C, p_0_1_D]" <--0
# "[p_1_0_A, p_1_0_B, p_1_0_C, p_1_0_D]","[p_1_1_A, p_1_1_B, p_1_1_C, p_1_1_D]" <--1
# "[p_2_0_A, p_2_0_B, p_2_0_C, p_2_0_D]","[p_2_1_A, p_2_1_B, p_2_1_C, p_2_1_D]" <--2
# "[p_3_0_A, p_3_0_B, p_3_0_C, p_3_0_D]","[p_3_1_A, p_3_1_B, p_3_1_C, p_3_1_D]" <--3
#
[docs]
def save_infer_prob_csv(output_filepath: str, probabilities: list[list[list[float]]]):
    """Save batched probabilities into a csv file.

    One csv row is written per batch and one field per batch entry; each field is
    the string form of that entry's list of probabilities.

    Args:
        output_filepath: Path of the csv file to create (or overwrite).
        probabilities: Batched probabilities, indexed as
            `probabilities[batch_no][batch_index][choice]`.
    """
    # The `csv` module expects files opened with `newline=""`; without it,
    # platforms that translate "\n" on write would emit "\r\r\n" row endings,
    # which read back as extra empty rows.
    with open(output_filepath, "w", newline="") as write_obj:
        csv.writer(write_obj).writerows(probabilities)
[docs]
def load_infer_prob_csv(input_filepath: str) -> list[list[list[float]]]:
    """Retrieve batched probabilities from a csv file.

    Every csv row is read as one batch and every field of a row as one entry;
    each field is parsed with `str_to_float_list`.

    Args:
        input_filepath: Path of the csv file to read.

    Returns:
        The probabilities as nested lists, indexed as
        `probabilities[batch_no][batch_index][choice]`.

    Raises:
        FileNotFoundError: If `input_filepath` does not exist.
        ValueError: If the entries do not all hold the same number of
            probabilities, or if `str_to_float_list` rejects a field.
    """
    # Only `open` is guarded, so the handler cannot mask errors from parsing.
    try:
        # `newline=""` is what the `csv` module expects of files handed to it.
        read_obj = open(input_filepath, newline="")
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"load_infer_prob_csv: Path {input_filepath} not found!"
        ) from err

    # `None` means "no entry seen yet"; set from the first parsed entry.
    expected_probs_count = None
    probabilities = []
    with read_obj:
        for batch in csv.reader(read_obj):
            probabilities_batch = []
            for entry in batch:
                probs_list = str_to_float_list(entry)
                # Number of probabilities must be the same for all entries.
                if expected_probs_count is None:
                    expected_probs_count = len(probs_list)
                if expected_probs_count != len(probs_list):
                    raise ValueError(
                        f"Reading {input_filepath}: inconsistent number of probs "
                        f"across entries: len({probs_list}) != "
                        f"{expected_probs_count}"
                    )
                probabilities_batch.append(probs_list)
            probabilities.append(probabilities_batch)
    return probabilities
[docs]
def str_to_float_list(input: str) -> list[float]:
    """Convert an `str` representing a list of `floats` to an actual list of `floats`.

    Example: input: `[1.1, 2.2, 3.3]` => output: [1.1, 2.2, 3.3]

    Args:
        input: Comma-separated floats enclosed in square brackets. Whitespace
            around the individual items is ignored.

    Returns:
        The parsed floats, in the order they appear in `input`.

    Raises:
        ValueError: If `input` is not enclosed in '[' and ']', contains no items,
            or contains an item that cannot be converted to `float`.
    """
    # 1) Get rid of '[' and ']'. `startswith`/`endswith` also cope with an empty
    #    string, where indexing `input[0]` would raise IndexError instead.
    if not (input.startswith("[") and input.endswith("]")):
        raise ValueError(
            f"Input `{input}` must start with '[' and end with ']' to represent a list"
        )
    inner = input[1:-1]

    # 2) Convert string to a list of items. `str.split` never returns an empty
    #    list, so a blank body is mapped to "no items" explicitly.
    list_of_items = [item.strip() for item in inner.split(",")] if inner.strip() else []
    if not list_of_items:
        raise ValueError(f"List `{list_of_items}` does NOT contain any items")

    # 3) Cast all list items to `float`.
    try:
        return [float(item) for item in list_of_items]
    except ValueError as err:
        raise ValueError(
            f"List `{list_of_items}` should contain probabilities"
        ) from err