# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The CNNClassifier model provides a basic example how to use ConvNets in Oumi."""
from typing import Callable, Optional
import torch
import torch.nn as nn
from torch.nn import functional as F
from oumi.core import registry
from oumi.core.models.base_model import BaseModel
@registry.register("CnnClassifier", registry.RegistryType.MODEL)
class CNNClassifier(BaseModel):
"""A simple ConvNet for classification of small fixed-size images."""
def __init__(
self,
image_width: int,
image_height: int,
*,
in_channels: int = 3,
output_dim: int = 10,
kernel_size: int = 5,
):
"""Initialize the ConvNet for image classification.
Args:
image_width: Width of input images in pixels.
image_height: Height of input images in pixels.
in_channels: The number of input channels e.g., 3 for RGB, 1 for greyscale.
output_dim: The output dimension i.e., the number of classes.
kernel_size: Convolutional kernel size.
"""
super().__init__()
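        # Track the spatial size of the activations through the unpadded
        # convolutions and 2x2 max-pooling steps below, so that `fc1` can be
        # sized to match the flattened conv output.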
self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=kernel_size)
w, h = self._compute_next_level_image_size(
image_width, image_height, kernel_size=kernel_size, halve=False
)
self.conv2 = nn.Conv2d(32, 32, kernel_size=kernel_size)
w, h = self._compute_next_level_image_size(
w, h, kernel_size=kernel_size, halve=True
)
self.conv3 = nn.Conv2d(32, 64, kernel_size=kernel_size)
w, h = self._compute_next_level_image_size(
w, h, kernel_size=kernel_size, halve=True
)
self._final_image_width = w
self._final_image_height = h
self.fc1 = nn.Linear(
self._final_image_width * self._final_image_height * 64, 256
)
        self.fc2 = nn.Linear(256, output_dim)

    @staticmethod
    def _compute_next_level_image_size(
        w: int, h: int, kernel_size: int, halve: bool
    ) -> tuple[int, int]:
        """Computes the image size after one convolutional level.

        An unpadded ("valid") convolution shrinks each spatial dimension by
        `kernel_size - 1`; if `halve` is set, a subsequent 2x2 max-pooling
        halves the result, rounding down. For example, a 28x28 input with
        `kernel_size=5` and `halve=True` yields (12, 12).

        Raises:
            ValueError: If the resulting width or height is not positive.
        """
        w, h = (w - (kernel_size - 1)), (h - (kernel_size - 1))
        if halve:
            w, h = (w // 2), (h // 2)
        if w <= 0 or h <= 0:
            raise ValueError(f"Image is too small for kernel_size={kernel_size}")
        return (w, h)
    def forward(
        self,
        images: torch.Tensor,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """Forward pass of the model.

        Args:
            images: A batch of images shaped
                `(batch_size, in_channels, image_height, image_width)`.
            labels: Optional class labels of shape `(batch_size,)`. When
                provided, the returned dict also contains the loss.
            kwargs: Unused; accepted for interface compatibility.

        Returns:
            A dict with the `logits` and, if `labels` was given, the `loss`.
        """
# Whether to apply dropout. `False` corresponds to inference mode.
training_mode = labels is not None
x = F.relu(self.conv1(images))
x = F.relu(F.max_pool2d(self.conv2(x), 2))
x = F.dropout(x, p=0.5, training=training_mode)
x = F.relu(F.max_pool2d(self.conv3(x), 2))
x = F.dropout(x, p=0.5, training=training_mode)
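        # Flatten the spatial dimensions so the tensor matches `fc1`'s input size.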
x = x.view(-1, self._final_image_width * self._final_image_height * 64)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=training_mode)
logits = self.fc2(x)
outputs = {"logits": logits}
if training_mode:
            # `F.nll_loss` expects log-probabilities rather than raw logits.
            log_probs = F.log_softmax(logits, dim=1)
            loss = self.criterion(log_probs, labels)
            outputs["loss"] = loss
        return outputs

    @property
def criterion(self) -> Callable:
"""Returns the criterion function to compute loss."""
return F.nll_loss
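

# A minimal usage sketch for illustration, assuming `BaseModel` subclasses
# `torch.nn.Module` (so the instance is callable); the MNIST-like shapes below
# are assumptions for the example, not requirements of the Oumi API.
if __name__ == "__main__":
    model = CNNClassifier(image_width=28, image_height=28, in_channels=1)

    images = torch.randn(4, 1, 28, 28)  # (batch, channels, height, width)
    labels = torch.randint(0, 10, (4,))  # one class id per image

    # With labels, the output dict contains both the logits and the NLL loss.
    outputs = model(images=images, labels=labels)
    print(outputs["logits"].shape)  # torch.Size([4, 10])
    print(outputs["loss"].item())

    # Without labels, only logits are returned (inference mode, no dropout).
    preds = model(images=images)["logits"].argmax(dim=1)
    print(preds.shape)  # torch.Size([4])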