"""Abstract camera interfaces.
This module defines abstract camera interface classes for RGB, depth, stereo, and point-cloud capable sensors.
It provides a consistent API for frame acquisition, camera intrinsics access, robot attachment, and conversion
helpers across concrete camera backends. These interfaces establish shared coordinate and image conventions to
ensure interchangeable camera implementations throughout the robotblockset camera stack.
Key functionalities include:
- Abstract base interfaces for generic, RGB, RGB-D, and stereo RGB-D cameras.
- Unified methods for grabbing and retrieving RGB, depth, confidence, and point-cloud data.
- Camera intrinsics and resolution access through consistent typed APIs.
- Attachment/detachment support for associating cameras with robot instances.
- Coordinate-frame and image-indexing conventions for backend consistency.
- Shared conversion helpers for point-cloud representations.
Copyright (c) 2026 Jozef Stefan Institute
Authors: Leon Zlajpah.
"""
import abc
from typing import Optional, Tuple, TYPE_CHECKING
import cv2
import numpy as np
from robotblockset.cameras.point_clouds import open3d_to_point_cloud
from robotblockset.rbs_typing import (
CameraIntrinsicsMatrixType,
CameraResolutionType,
HomogeneousMatrixType,
NumpyConfidenceMapType,
NumpyDepthMapType,
NumpyFloatImageType,
NumpyIntImageType,
PointCloud,
)
from robotblockset.tools import rbs_object
# from loguru import logger
from robotblockset.tools import get_logger
logger = get_logger(__name__)
if TYPE_CHECKING:
from robotblockset.robots import robot
class Camera(abc.ABC, rbs_object):
    """Base class for all cameras.

    We use the right-handed, y-down convention for the camera frame:
    - origin is at the camera lens center
    - z-axis points forward (towards the scene)
    - x-axis points to the right
    - y-axis points down
    cf. https://www.stereolabs.com/docs/positional-tracking/coordinate-frames/#selecting-a-coordinate-system for an overview of other conventions

    We use the (associated) top-left convention for the image plane:
    - origin is at the top left corner of the image
    - x-axis points to the right
    - y-axis points down
    Keep in mind that numpy indexes images row-column, which corresponds to
    y-x in cartesian image coordinates: the pixel at (u, v) is ``image[v, u]``
    and the numpy array shape is (height, width).
    cf https://scikit-image.org/docs/stable/user_guide/numpy_images.html#numpy-indexing
    """

    Name = "Camera"
    Robot: Optional["robot"] = None  # robot to which the sensor is attached

    def AttachTo(self, robot: "robot") -> None:
        """
        Attach the sensor to a robot.

        Parameters
        ----------
        robot : 'robot'
            The robot to which the sensor is attached.
        """
        self.Robot = robot

    def Detach(self) -> None:
        """
        Detach the sensor from the robot.

        Sets the `Robot` attribute to `None`.
        """
        self.Robot = None

    def GetAttachedRobot(self) -> Tuple[Optional["robot"], str]:
        """
        Get the robot to which the sensor is attached.

        Returns
        -------
        Tuple[Optional[robot], str]
            The robot object and its name, or (None, "None") if the sensor is
            not attached to a robot.
        """
        attached = self.Robot
        # Guard clause: no robot attached -> sentinel pair
        if attached is None:
            return None, "None"
        return attached, attached.Name

    @abc.abstractmethod
    def intrinsics_matrix(self) -> CameraIntrinsicsMatrixType:
        """Returns the intrinsics matrix of the camera:

        [[fx, 0, cx],
         [0, fy, cy],
         [0,  0,  1]]

        where all values are defined in pixels.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def _grab_images(self) -> None:
        """Grab the latest camera data into buffers.

        Transfers the latest camera data (RGB, stereo RGB, depth) from the
        camera to a memory buffer.
        """
        raise NotImplementedError
class RGBCamera(Camera, abc.ABC):
    """Base class for all RGB cameras."""

    Name = "RGBCamera"

    @property
    @abc.abstractmethod
    def resolution(self) -> CameraResolutionType:
        """The resolution of the camera, in pixels."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def fps(self) -> float:
        """The frames per second of the camera."""
        raise NotImplementedError

    def get_rgb_image(self) -> NumpyFloatImageType:
        """Get a new RGB image from the camera as a float image."""
        self._grab_images()
        return self._retrieve_rgb_image()

    def get_rgb_image_as_int(self) -> NumpyIntImageType:
        """Get a new RGB image from the camera as uint8.

        This is faster to retrieve as images are typically stored as ints in
        the buffer. It is also more compact, so recommended for communication.
        """
        self._grab_images()
        return self._retrieve_rgb_image_as_int()

    @abc.abstractmethod
    def _retrieve_rgb_image(self) -> NumpyFloatImageType:
        """Returns the current RGB image in the memory buffer."""
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_rgb_image_as_int(self) -> NumpyIntImageType:
        """Returns the current RGB image in the memory buffer as uint8.

        This is typically the format in which the image is stored in memory;
        returning it directly avoids the overhead of converting it to floats
        first, which is what ``_retrieve_rgb_image()`` does.
        """
        raise NotImplementedError
class DepthCamera(Camera, abc.ABC):
    """Base class for all depth cameras."""

    Name = "DepthCamera"

    def get_depth_map(self) -> NumpyDepthMapType:
        """Get the latest depth map of the camera.

        The depth map is a 2D array of floats, that provide the estimated
        z-coordinate in the camera frame of that point on the image plane
        (pixel).

        Returns
        -------
        NumpyDepthMapType: depth map as a float image.
        """
        self._grab_images()
        return self._retrieve_depth_map()

    def get_depth_image(self) -> NumpyIntImageType:
        """Return an 8-bit visualization depth image.

        An 8-bit (int) quantization of the latest depth map, which can be used
        for visualization.
        """
        self._grab_images()
        return self._retrieve_depth_image()

    @abc.abstractmethod
    def _retrieve_depth_map(self) -> NumpyDepthMapType:
        """Returns the current depth map in the memory buffer."""
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_depth_image(self) -> NumpyIntImageType:
        """Returns the current depth image in the memory buffer."""
        raise NotImplementedError

    def get_confidence_map(self) -> NumpyConfidenceMapType:
        """Get a confidence map for the depth map.

        The confidence map is a 2D array of floats, that provide a measure of
        confidence in the depth estimate for each pixel. The values are
        between 0 and 1, where 1 indicates high confidence and 0 indicates
        low confidence.

        Note that not all stereo depth cameras provide a confidence map. We
        provide a default (naive) implementation. It computes a confidence map
        based on depth discontinuities using OpenCV's Canny edge detection.
        This is based on the assumption that depth estimates are less reliable
        at depth discontinuities.

        Child classes can override this method to provide a more accurate
        confidence map if available. For example:
        - StereoRGBDCamera provides a confidence map based on disparity between left and right RGB images using OpenCV's SGBM algorithm implementation.
        - Realsense provides a confidence map based on disparity between left and right infrared images using OpenCV's SGBM algorithm implementation.
        - ZED provides a confidence map based on the camera's internal confidence measure.

        See also: https://github.com/opencv/opencv_contrib/blob/master/modules/ximgproc/samples/disparity_filtering.cpp

        Returns
        -------
        NumpyConfidenceMapType: confidence map with values between 0 and 1.
        """
        self._grab_images()
        return self._retrieve_confidence_map()

    def _retrieve_confidence_map(self) -> NumpyConfidenceMapType:
        """Returns a default confidence map for the depth map."""
        # This is a very basic implementation, based on depth discontinuities
        # using OpenCV's Canny edge detection.
        depth_map = self._retrieve_depth_map()
        # Fix: cv2.normalize allocates and returns its own output (with
        # dtype=-1 the output keeps the source type), so the previous
        # np.empty_like pre-allocated uint8 buffer was discarded dead code.
        depth_map_uint8 = cv2.normalize(depth_map, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
        edges = cv2.Canny(depth_map_uint8, 100, 200)
        # edges are 255, so invert to get confidence in [0, 1]
        confidence_map = 1.0 - (edges / 255.0)
        return confidence_map
class RGBDCamera(RGBCamera, DepthCamera):
    """Base class for all RGBD cameras."""

    Name = "RGBDCamera"

    def get_colored_point_cloud(self) -> PointCloud:
        """Get the latest point cloud of the camera.

        The point cloud contains the estimated position in the camera frame of
        points on the image plane (pixels). Each point also has a color
        associated with it, which is the color of the corresponding pixel in
        the RGB image.

        Returns
        -------
        PointCloud: the points (= positions) and colors
        """
        self._grab_images()
        return self._retrieve_colored_point_cloud()

    def _retrieve_colored_point_cloud(self) -> PointCloud:
        """Returns the current point cloud in the memory buffer.

        Default implementation uses the depth map and RGB with open3d's
        create_from_rgbd_image() function.
        See: https://www.open3d.org/docs/release/python_api/open3d.t.geometry.PointCloud.html#open3d.t.geometry.PointCloud.create_from_rgbd_image
        """
        # Warn only once per instance about the slow fallback path.
        if not hasattr(self, "_logged_colored_point_cloud_warning"):
            # Fix: corrected "impementing" -> "implementing" in the warning.
            logger.warning(
                """You are using an RGBDCamera which does not override _retrieve_colored_point_cloud.
We will use a default implementation based on Open3D, which is quite slow (several milliseconds per frame).
Consider implementing this method if your camera supports point cloud processing."""
            )
            self._logged_colored_point_cloud_warning = True
        # Imported lazily so that open3d is only required for the fallback.
        import open3d as o3d

        image_rgb_uint8 = self._retrieve_rgb_image_as_int()
        depth_map = self._retrieve_depth_map()
        intrinsics = self.intrinsics_matrix()
        # Convert the numpy buffers to open3d data types
        image_o3d = o3d.t.geometry.Image(image_rgb_uint8)
        depth_map_o3d = o3d.t.geometry.Image(depth_map)
        rgbd_o3d = o3d.t.geometry.RGBDImage(image_o3d, depth_map_o3d)
        # Note this is quite slow, > 100ms for a 2K image
        pcd = o3d.t.geometry.PointCloud.create_from_rgbd_image(
            rgbd_o3d,
            intrinsics,
            depth_scale=1.0,  # depth map is already in metric units
            depth_max=1000.0,
        )
        point_cloud = open3d_to_point_cloud(pcd)
        return point_cloud
class StereoRGBDCamera(RGBDCamera):
    """Base class for all stereo RGBD cameras."""

    Name = "StereoRGBDCamera"
    LEFT_RGB = "left"
    RIGHT_RGB = "right"
    _VIEWS = (LEFT_RGB, RIGHT_RGB)

    def get_rgb_image(self, view: str = LEFT_RGB) -> NumpyFloatImageType:
        """Get a new RGB image for the requested view as a float image."""
        self._grab_images()
        return self._retrieve_rgb_image(view)

    def get_rgb_image_as_int(self, view: str = LEFT_RGB) -> NumpyIntImageType:
        """Get a new RGB image for the requested view as uint8."""
        self._grab_images()
        return self._retrieve_rgb_image_as_int(view)

    @abc.abstractmethod
    def _retrieve_rgb_image(self, view: str = LEFT_RGB) -> NumpyFloatImageType:
        """Returns the buffered RGB image for the requested view as floats."""
        raise NotImplementedError

    @abc.abstractmethod
    def _retrieve_rgb_image_as_int(self, view: str = LEFT_RGB) -> NumpyIntImageType:
        """Returns the buffered RGB image for the requested view as uint8."""
        raise NotImplementedError

    # TODO: check view argument value?
    @abc.abstractmethod
    def intrinsics_matrix(self, view: str = LEFT_RGB) -> CameraIntrinsicsMatrixType:
        """Returns the intrinsics matrix for the requested view."""
        raise NotImplementedError

    @property
    @abc.abstractmethod
    def pose_of_right_view_in_left_view(self) -> HomogeneousMatrixType:
        """
        Get the transform of the right view frame in the left view frame.

        Usually this is simply a translation along the x-axis and referred to
        as the disparity, cf https://en.wikipedia.org/wiki/Binocular_disparity
        The left view is usually considered to be the 'camera frame', i.e.
        this is the frame that is used to define the camera extrinsics matrix.
        """
        raise NotImplementedError

    def _retrieve_confidence_map(self) -> NumpyConfidenceMapType:
        """Returns a confidence map for the depth map.

        Based on disparity between left and right RGB images using OpenCV's
        SGBM algorithm implementation, refined with a WLS disparity filter.
        """
        # SGBM parameters: default values according to the OpenCV docs
        num_disparities = 160  # must be divisible by 16
        block_size = 3
        penalty_small = 216  # 24 * block_size ** 2
        penalty_large = 864  # 96 * block_size ** 2
        prefilter_cap = 63
        wls_lambda = 8000.0
        wls_sigma = 1.5

        # Build the left/right matcher pair and the WLS filter.
        matcher_left = cv2.StereoSGBM.create(
            minDisparity=0,
            numDisparities=num_disparities,
            blockSize=block_size,
            P1=penalty_small,
            P2=penalty_large,
            preFilterCap=prefilter_cap,
            mode=cv2.StereoSGBM_MODE_SGBM_3WAY,
        )
        matcher_right = cv2.ximgproc.createRightMatcher(matcher_left)
        wls_filter = cv2.ximgproc.createDisparityWLSFilter(matcher_left)
        wls_filter.setLambda(wls_lambda)
        wls_filter.setSigmaColor(wls_sigma)

        # Compute both disparity maps (SGBM returns fixed-point * 16 values).
        image_left = self._retrieve_rgb_image_as_int(self.LEFT_RGB)
        image_right = self._retrieve_rgb_image_as_int(self.RIGHT_RGB)
        disparity_left = matcher_left.compute(image_left, image_right).astype(np.float32) / 16.0
        disparity_right = matcher_right.compute(image_right, image_left).astype(np.float32) / 16.0

        # filter() computes the confidence map as a side effect; the filtered
        # disparity itself is not needed here.
        wls_filter.filter(disparity_left, image_left, disparity_map_right=disparity_right)
        return wls_filter.getConfidenceMap() / 255.0