@Samahu, I ran YOLO inference on reflectivity and near-IR images with an initial resolution of 2048x128, resizing them to 1024x128 for better segmentation. To make sure the results are accurate and to resolve a few open issues, I would appreciate clarification and guidance on the following:
- Distortion Correction: I applied a distortion correction assuming a 45-degree vertical field of view (FOV) for the spherical-to-rectilinear conversion. Is this approach valid, or should I consider alternative methods?
- 2D to 3D Projection: I want to project 2D inferences (e.g., bounding boxes of detected objects) onto the 3D point cloud to estimate the poses of moving objects such as cars. How can I accurately align the resized images with the 3D data? (My current understanding is sketched in the code right after this list.)
- Metadata Usage: To maintain the correspondence between 2D inferences and 3D points, I am considering resizing the images using metadata, such as azimuth and elevation angles, for better spatial alignment. Would that be an appropriate approach?
- Required Information for Projection:
  - What additional information is critical for projecting 2D data onto the 3D point cloud (e.g., range data, calibration parameters)?
  - How can I accurately map inferred 2D pixel ranges to their corresponding 3D spatial positions in the point cloud?
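For the 2D-to-3D part, this is a minimal sketch of the pixel-to-point mapping I have in mind. It assumes the Ouster SDK's `XYZLut` and `destagger` helpers (the exact import path may differ between SDK versions) and that my resizing only drops every second column, so a column in the 1024-wide image maps back to column `2 * col` of the original 2048-wide grid. The helper name `detection_pixel_to_xyz` is just mine for illustration:

```python
import numpy as np
from ouster.sdk import client  # or `from ouster import client` on older SDK versions


def detection_pixel_to_xyz(scan, metadata, row, col_resized, resize_factor=2):
    """Map one pixel of the resized, destaggered 2D image to a 3D point (sketch)."""
    # Lookup table from range measurements to Cartesian points; in practice this
    # should be built once per sensor rather than per call
    xyzlut = client.XYZLut(metadata)
    # Destagger so the XYZ array has the same row/column layout as the 2D image
    xyz = client.destagger(metadata, xyzlut(scan))  # shape (H, W, 3)
    col_original = int(col_resized * resize_factor)  # undo the horizontal down-sampling
    return xyz[row, col_original]  # (x, y, z) in the sensor frame; zeros mean no return
```

Is that the intended correspondence, or does the destaggering/resizing break it in a way I am not seeing?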
```python
import cv2
import matplotlib.pyplot as plt
import numpy as np


def correct_lidar_image_distortion(input_image, vertical_fov=45, horizontal_fov=360, interpolation=cv2.INTER_LINEAR):
    if len(input_image.shape) == 3:  # Convert color images to grayscale
        input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2GRAY)
    h, w = input_image.shape

    # Initialize the remap grids
    map_x = np.zeros((h, w), dtype=np.float32)
    map_y = np.zeros((h, w), dtype=np.float32)

    # Calculate angular resolution
    vertical_angle_per_pixel = vertical_fov / h      # ~0.3515° per pixel
    horizontal_angle_per_pixel = horizontal_fov / w  # ~0.176° per pixel
    new_vertical_angle_per_pixel = horizontal_angle_per_pixel  # ~0.176° per pixel (currently unused)

    # Populate remapping grids based on angular distribution
    for y in range(h):
        for x in range(w):
            # Calculate the original angles for each pixel
            theta = (x * horizontal_angle_per_pixel) - (horizontal_fov / 2)
            phi = (y * vertical_angle_per_pixel) - (vertical_fov / 2)
            # Map to corrected positions (assuming cylindrical-to-rectilinear correction if needed)
            corrected_x = ((theta + (horizontal_fov / 2)) / horizontal_fov) * w
            corrected_y = ((phi + (vertical_fov / 2)) / vertical_fov) * h
            # Fill the remapping arrays
            map_x[y, x] = corrected_x
            map_y[y, x] = corrected_y

    # Apply the remapping to correct the distortion
    corrected_image = cv2.remap(input_image, map_x, map_y, interpolation)
    return corrected_image


def resize_to_half_width(input_image):
    """
    Reduces the width of the image by dropping every 2nd column (down-sample horizontally by 2).
    :param input_image: Input image to be resized
    :return: Image with half the original width and the same height
    """
    # Select every alternate column to reduce width by half
    resized_image = input_image[:, ::2]
    return resized_image
```
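On the metadata question: instead of assuming a uniform 45° vertical FOV as above, I was also thinking of building the vertical remap directly from the per-beam elevation angles in the sensor metadata. A rough sketch of that idea, assuming `beam_altitude_angles` from the Ouster `SensorInfo` ordered top-to-bottom like the destaggered rows (both the ordering and the helper name are my assumptions):

```python
import cv2
import numpy as np


def remap_rows_by_beam_altitude(input_image, beam_altitude_angles):
    """Resample image rows onto a uniform elevation grid using per-beam angles (sketch)."""
    h, w = input_image.shape[:2]
    angles = np.asarray(beam_altitude_angles, dtype=np.float32)  # one elevation angle per row
    # Target elevations: h rows evenly spaced between the highest and lowest beam
    target_angles = np.linspace(angles.max(), angles.min(), h)
    # For each target elevation, find the (fractional) source row at that elevation;
    # np.interp needs ascending x values, so sort the beams by angle first
    order = np.argsort(angles)
    src_rows = np.interp(target_angles, angles[order], order.astype(np.float32))
    # Rows move, columns stay; cv2.remap expects float32 maps of shape (h, w)
    map_x = np.tile(np.arange(w, dtype=np.float32), (h, 1))
    map_y = np.tile(src_rows.astype(np.float32)[:, None], (1, w))
    return cv2.remap(input_image, map_x, map_y, cv2.INTER_LINEAR)
```

Would working on such an elevation-uniform image make the later 2D-to-3D mapping cleaner, or is it unnecessary for the segmentation itself?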
Then I ran inference with the following code (for reference):
```python
from functools import partial

import cv2
import numpy as np
import torch
from ultralytics import YOLO
from ultralytics.engine.results import Results
# ChanField, LidarScan, ScanSource, destagger, AutoExposure and
# BeamUniformityCorrector come from the Ouster SDK (exact import paths
# depend on the installed ouster-sdk version).


class ScanIterator:
    if torch.cuda.is_available():
        DEVICE = "cuda"
    elif torch.backends.mps.is_available():
        DEVICE = "mps"
    else:
        DEVICE = "cpu"

    def __init__(self, scans: ScanSource):
        self._metadata = scans.metadata
        # Load YOLO pretrained models
        self.model_yolo_nir = YOLO("yolov9c-seg.pt").to(device=self.DEVICE)
        self.model_yolo_ref = YOLO("yolov9c-seg.pt").to(device=self.DEVICE)
        # Define classes to output results for
        self.name_to_class = {value: key for key, value in self.model_yolo_ref.names.items()}
        self.classes_to_detect = [
            self.name_to_class['person'],
            self.name_to_class['car'],
            self.name_to_class['traffic light'],
            self.name_to_class['bus']
        ]
        # Channels to post-process (NEAR_IR and REFLECTIVITY), each with its own
        # exposure correction, beam uniformity correction and YOLO model
        self.paired_list = [
            [ChanField.NEAR_IR, AutoExposure(), BeamUniformityCorrector(), self.model_yolo_nir],
            [ChanField.REFLECTIVITY, AutoExposure(), BeamUniformityCorrector(), self.model_yolo_ref]
        ]
        # Map the self._update function onto the scans iterator
        self._scans = map(partial(self._update), scans)

    # Return the scans iterator when iterating over the class
    def __iter__(self):
        return self._scans
    def _update(self, scan: LidarScan) -> LidarScan:
        resized_width = 1024
        resized_height = 128
        # One stacked canvas holding the annotated NIR and reflectivity images
        stacked_result_rgb = np.empty((resized_height * len(self.paired_list), resized_width, 3), np.uint8)
        for i, (field, ae, buc, model) in enumerate(self.paired_list):
            # Destagger the data to get a normal-looking 2D image
            img = destagger(self._metadata, scan.field(field)).astype(np.float32)
            img = correct_lidar_image_distortion(img)
            img = resize_to_half_width(img)
            img = cv2.resize(img, (resized_width, resized_height), interpolation=cv2.INTER_LINEAR)
            # Make the image more uniform and better exposed
            ae(img)
            buc(img, update_state=True)
            # Convert to 3-channel uint8 for YOLO inference
            img_rgb = np.repeat(np.uint8(np.clip(np.rint(img * 255), 0, 255))[..., np.newaxis], 3, axis=-1)
            # Run YOLO inference with tracking enabled
            results: Results = next(
                model.track(
                    [img_rgb],
                    stream=True,   # Reduce memory requirements for streaming
                    persist=True,  # Maintain tracks across sequential frames
                    conf=0.1,
                    imgsz=[img.shape[0], img.shape[1]],
                    classes=self.classes_to_detect
                )
            ).cpu()
            # Plot results with bounding boxes and masks
            img_rgb = results.plot(boxes=True, masks=True, line_width=1, font_size=3)
            # Save stacked RGB images for OpenCV viewing
            stacked_result_rgb[i * resized_height:(i + 1) * resized_height, ...] = img_rgb
        # Display in OpenCV
        cv2.imshow("YOLO Results", stacked_result_rgb)
        cv2.waitKey(1)
        return scan
```
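Finally, this is roughly how I picture closing the loop from a YOLO detection back to the point cloud, e.g. to get a rough 3D position for a tracked car. Again only a sketch: `xyz_destaggered` would come from the `XYZLut` approach sketched earlier, the box coordinates are in the resized 1024x128 image, and `estimate_box_positions` is a name I made up:

```python
import numpy as np


def estimate_box_positions(results, xyz_destaggered, resize_factor=2):
    """Rough 3D centroid per detected box, from the destaggered (H, 2048, 3) XYZ array."""
    positions = []
    for box in results.boxes:
        x1, y1, x2, y2 = box.xyxy[0].int().tolist()  # box corners in the resized image
        # Undo the horizontal down-sampling to index the full-width point cloud
        points = xyz_destaggered[y1:y2, x1 * resize_factor:x2 * resize_factor].reshape(-1, 3)
        points = points[np.linalg.norm(points, axis=1) > 0]  # drop pixels with no return
        positions.append(points.mean(axis=0) if len(points) else None)
    return positions
```

Is averaging the in-box points a reasonable first approximation, or should I be using the segmentation masks and the range data more directly?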