HDRplus/include/hdrplus/utility.h

#pragma once

#include <string>
#include <stdexcept> // std::runtime_error
#include <opencv2/opencv.hpp> // all opencv header
#include <omp.h>

// https://stackoverflow.com/questions/63404539/portable-loop-unrolling-with-template-parameter-in-c-with-gcc-icc
/// Helper macros for stringification.
/// TO_STRING goes through TO_STRING_HELPER so that its argument is macro-expanded
/// before the # operator turns it into the string literal that _Pragma expects.
#define TO_STRING_HELPER(X)   #X
#define TO_STRING(X)          TO_STRING_HELPER(X)

// Define loop unrolling depending on the compiler.
// UNROLL_LOOP(n) expands to a compiler-specific unroll pragma (or to nothing) and
// is written on the line directly before the loop it should apply to.
#if defined(__ICC) || defined(__ICL)
  #define UNROLL_LOOP(n)      _Pragma(TO_STRING(unroll (n)))
#elif defined(__clang__)
  #define UNROLL_LOOP(n)      _Pragma(TO_STRING(unroll (n)))
#elif defined(__GNUC__) && !defined(__clang__)
  // NOTE(review): this branch ignores `n` and always requests an unroll factor
  // of 16. Presumably a workaround for GCC rejecting non-literal (e.g. template
  // parameter) unroll counts, as discussed in the question linked above --
  // confirm against the supported GCC versions before replacing 16 with n.
  #define UNROLL_LOOP(n)      _Pragma(TO_STRING(GCC unroll (16)))
#elif defined(_MSC_BUILD)
  // MSVC: emit a build message and make UNROLL_LOOP a no-op.
  #pragma message ("Microsoft Visual C++ (MSVC) detected: Loop unrolling not supported!")
  #define UNROLL_LOOP(n)
#else
  // Any other compiler: emit a warning and make UNROLL_LOOP a no-op.
  #warning "Unknown compiler: Loop unrolling not supported!"
  #define UNROLL_LOOP(n)
#endif


namespace hdrplus
{


/**
 * @brief Downsample an image by averaging non-overlapping kernel x kernel boxes.
 *
 * The result has floor(height / kernel) rows and floor(width / kernel) columns;
 * trailing source rows / columns that do not fill a complete box are ignored.
 * Source elements are addressed as data[ row * step1() + col ], i.e. one T per
 * pixel.
 *
 * @tparam T element type used to read src_image and to write the result
 * @tparam kernel side length of the averaging box, must be >= 1
 * @param src_image input image
 * @return new image of the same cv type holding the box averages. For integer
 *      T the average is truncated by integer division (it is not rounded up).
 * @throws std::runtime_error if kernel <= 0. On Android builds an empty
 *      cv::Mat is returned instead of throwing.
 */
template <typename T, int kernel>
cv::Mat box_filter_kxk( const cv::Mat& src_image )
{
    if ( kernel <= 0 )
    {
#ifdef __ANDROID__
        return cv::Mat();
#else
        throw std::runtime_error(std::string( __FILE__ ) + "::" + __func__ + " box filter only support kernel size >= 1");
#endif
    }

    // Accumulate in a widened type: integer T is promoted to (unsigned) long long
    // so that summing kernel * kernel narrow values (e.g. uint16_t) cannot wrap,
    // while floating point T keeps its own type.
    using acc_t = decltype( T() + 0LL );

    const T* src_image_ptr = reinterpret_cast<const T*>( src_image.data );
    const int src_height = src_image.size().height;
    const int src_width  = src_image.size().width;
    const int src_step   = static_cast<int>( src_image.step1() );

    //  int(src_height / kernel) = floor(src_height / kernel)
    // When the input size is not a multiple of kernel, take the floor
    cv::Mat dst_image( src_height / kernel, src_width / kernel, src_image.type() );
    T* dst_image_ptr = reinterpret_cast<T*>( dst_image.data );
    const int dst_height = dst_image.size().height;
    const int dst_width  = dst_image.size().width;
    const int dst_step   = static_cast<int>( dst_image.step1() );

    // Number of samples per box, kept in the accumulator type so that it cannot
    // wrap to 0 for narrow T (e.g. uint8_t with kernel == 16).
    const acc_t box_area = static_cast<acc_t>( kernel ) * static_cast<acc_t>( kernel );

    for ( int row_i = 0; row_i < dst_height; ++row_i )
    {
        T* dst_row_ptr = dst_image_ptr + row_i * dst_step;

        for ( int col_i = 0; col_i < dst_width; ++col_i )
        {
            acc_t box_sum = acc_t( 0 );

            UNROLL_LOOP( kernel )
            for ( int kernel_row_i = 0; kernel_row_i < kernel; ++kernel_row_i )
            {
                // First element of this box row inside the source image
                const T* src_box_row_ptr = src_image_ptr \
                    + ( row_i * kernel + kernel_row_i ) * src_step + col_i * kernel;

                UNROLL_LOOP( kernel )
                for ( int kernel_col_i = 0; kernel_col_i < kernel; ++kernel_col_i )
                {
                    box_sum += src_box_row_ptr[ kernel_col_i ];
                }
            }

            // Integer division truncates; floating point T gets the exact mean
            dst_row_ptr[ col_i ] = static_cast<T>( box_sum / box_area );
        }
    }

    return dst_image;
}


/**
 * @brief Downsample an image by keeping the top-left element of every
 *        kernel x kernel cell.
 *
 * The result has floor(height / kernel) rows and floor(width / kernel) columns.
 * Source elements are addressed as data[ row * step1() + col ], i.e. one T per
 * pixel.
 *
 * @tparam T element type used to read src_image and to write the result
 * @tparam kernel sampling stride in both directions, must be >= 1
 * @param src_image input image
 * @return new image of the same cv type holding the sampled elements
 * @throws std::runtime_error if kernel <= 0. On Android builds an empty
 *      cv::Mat is returned instead of throwing (same convention as
 *      box_filter_kxk).
 */
template <typename T, int kernel>
cv::Mat downsample_nearest_neighbour( const cv::Mat& src_image )
{
    // Guard against division by zero / negative output sizes below
    if ( kernel <= 0 )
    {
#ifdef __ANDROID__
        return cv::Mat();
#else
        throw std::runtime_error(std::string( __FILE__ ) + "::" + __func__ + " nearest neighbour downsample only support kernel size >= 1");
#endif
    }

    const T* src_image_ptr = reinterpret_cast<const T*>( src_image.data );
    const int src_height = src_image.size().height;
    const int src_width  = src_image.size().width;
    const int src_step   = static_cast<int>( src_image.step1() );

    //  int(src_height / kernel) = floor(src_height / kernel)
    // When the input size is not a multiple of kernel, take the floor
    cv::Mat dst_image = cv::Mat( src_height / kernel, src_width / kernel, src_image.type() );
    T* dst_image_ptr = reinterpret_cast<T*>( dst_image.data );
    const int dst_height = dst_image.size().height;
    const int dst_width  = dst_image.size().width;
    const int dst_step   = static_cast<int>( dst_image.step1() );

    // -O3 should be enough to optimize the code below
    for ( int row_i = 0; row_i < dst_height; row_i++ )
    {
        const T* src_row_ptr = src_image_ptr + ( row_i * kernel ) * src_step;
        T* dst_row_ptr = dst_image_ptr + row_i * dst_step;

        UNROLL_LOOP( 32 )
        for ( int col_i = 0; col_i < dst_width; col_i++ )
        {
            dst_row_ptr[ col_i ] = src_row_ptr[ col_i * kernel ];
        }
    }

    return dst_image;
}


/**
 * @brief Print every element of a 1- or 3-channel image to stdout.
 *
 * A header line with height, width and step1() is printed first, followed by
 * one text line per image row. Each element is converted to int before being
 * printed, so fractional or out-of-int-range values are not shown exactly.
 *
 * @tparam T element type used to read the image data
 * @param image image to print
 * @throws std::runtime_error if the image has neither 1 nor 3 channels. On
 *      Android builds nothing is printed for such images and no exception is
 *      thrown.
 */
template< typename T >
void print_cvmat( cv::Mat image )
{
    const T* img_ptr = reinterpret_cast<const T*>( image.data );
    const int height = image.size().height;
    const int width = image.size().width;
    const int step = static_cast<int>( image.step1() );
    const int chns = image.channels();

    printf("print_cvmat()::Image of size height = %d, width = %d, step = %d\n", \
        height, width, step );

    // Format notes:
    //  * "%3d" is used instead of "%3.d": a bare '.' means precision 0, which
    //    makes printf emit no digits at all for the value 0.
    //  * elements are cast to int explicitly so that the variadic argument
    //    always matches the %d conversion, whatever T is.
    if ( chns == 1 )
    {
        for ( int row_i = 0; row_i < height; ++row_i )
        {
            const T* row_ptr = img_ptr + row_i * step;
            for ( int col_i = 0; col_i < width; ++col_i )
            {
                printf("%3d ", static_cast<int>( row_ptr[ col_i ] ) );
            }
            printf("\n");
        }
    }
    else if ( chns == 3 )
    {
        for ( int row_i = 0; row_i < height; ++row_i )
        {
            const T* row_ptr = img_ptr + row_i * step;
            for ( int col_i = 0; col_i < width; ++col_i )
            {
                // The three channel values of a pixel are stored interleaved
                const T* pixel_ptr = row_ptr + col_i * 3;
                printf("[%3d, %3d, %3d] ", static_cast<int>( pixel_ptr[ 0 ] ), \
                                           static_cast<int>( pixel_ptr[ 1 ] ), \
                                           static_cast<int>( pixel_ptr[ 2 ] ) );
            }
            printf("\n");
        }
    }
    else
    {
#ifdef __ANDROID__
#else
        throw std::runtime_error("cv::Mat number of channel currently not support to print\n");
#endif
    }
}


/**
 * @brief Split a bayer image into four half-resolution planes, one per
 *        position inside each 2x2 cell.
 *
 * With (row, col) denoting the position inside a 2x2 cell:
 *      img_ch1 <- (0,0), img_ch2 <- (1,0), img_ch3 <- (0,1), img_ch4 <- (1,1)
 * Each output is (re)allocated with cv::Mat::create to
 * (height / 2) x (width / 2) using the type of bayer_img.
 *
 * @tparam T element type of the bayer image
 * @param bayer_img input bayer image, read as one T per pixel
 * @param img_ch1 output plane for cell position (0,0)
 * @param img_ch2 output plane for cell position (1,0)
 * @param img_ch3 output plane for cell position (0,1)
 * @param img_ch4 output plane for cell position (1,1)
 * @throws std::runtime_error if the bayer width or height is odd. On Android
 *      builds no exception is thrown and the sizes are simply halved with
 *      integer division.
 *
 * @example extract_rgb_from_bayer<uint16_t>( bayer_img, ch1, ch2, ch3, ch4 );
 */
template <typename T>
void extract_rgb_from_bayer( const cv::Mat& bayer_img, \
    cv::Mat& img_ch1, cv::Mat& img_ch2, cv::Mat& img_ch3, cv::Mat& img_ch4 )
{
    // Snapshot the input geometry before any output is allocated
    const T* const bayer_base = reinterpret_cast<const T*>( bayer_img.data );
    const int full_w = bayer_img.size().width;
    const int full_h = bayer_img.size().height;
    const int bayer_stride = bayer_img.step1();

    const bool has_odd_side = ( full_w % 2 != 0 ) || ( full_h % 2 != 0 );
    if ( has_odd_side )
    {
#ifdef __ANDROID__
#else
        throw std::runtime_error("Bayer image data size incorrect, must be multiplier of 2\n");
#endif
    }

    // Every output plane covers half of the bayer image in each direction
    const int half_w = full_w / 2;
    const int half_h = full_h / 2;

    cv::Mat* const planes[ 4 ] = { &img_ch1, &img_ch2, &img_ch3, &img_ch4 };
    for ( cv::Mat* plane : planes )
    {
        plane->create( half_h, half_w, bayer_img.type() );
    }

    // All four planes are indexed with the stride of the first one
    const int plane_stride = img_ch1.step1();

    T* const ch1_base = reinterpret_cast<T*>( img_ch1.data );
    T* const ch2_base = reinterpret_cast<T*>( img_ch2.data );
    T* const ch3_base = reinterpret_cast<T*>( img_ch3.data );
    T* const ch4_base = reinterpret_cast<T*>( img_ch4.data );

    #pragma omp parallel for
    for ( int r = 0; r < half_h; ++r )
    {
        const int out_offset = r * plane_stride;

        // One output row consumes two consecutive bayer rows
        const T* const upper_row = bayer_base + ( 2 * r + 0 ) * bayer_stride;
        const T* const lower_row = bayer_base + ( 2 * r + 1 ) * bayer_stride;

        for ( int c = 0; c < half_w; ++c )
        {
            const int left  = 2 * c + 0;
            const int right = 2 * c + 1;
            const int out_idx = out_offset + c;

            ch1_base[ out_idx ] = upper_row[ left ];
            ch3_base[ out_idx ] = upper_row[ right ];
            ch2_base[ out_idx ] = lower_row[ left ];
            ch4_base[ out_idx ] = lower_row[ right ];
        }
    }
}


/**
 * @brief Convert a 3-channel interleaved image to a gray image by averaging
 *        the three channel values of every pixel with equal weight.
 *        Also supports implicit data type conversion.
 *
 * The three values are summed in a widened accumulator and divided by 3
 * before being stored, so narrow integer gray types (e.g. uint16_t) do not
 * wrap while summing. For integer gray types the division truncates.
 *
 * @tparam RGB_DTYPE rgb image type (e.g. uint16_t)
 * @tparam GRAY_DTYPE gray image type (e.g. uint16_t)
 * @tparam GRAY_CVTYPE opencv gray image type
 * @param rgb_img input image, read as three RGB_DTYPE values per pixel
 * @return new gray image of type GRAY_CVTYPE with the same width and height
 */
template< typename RGB_DTYPE, typename GRAY_DTYPE, int GRAY_CVTYPE >
cv::Mat rgb_2_gray( const cv::Mat& rgb_img )
{
    // Integer GRAY_DTYPE is promoted to (unsigned) long long for the sum;
    // floating point GRAY_DTYPE keeps its own type.
    using acc_t = decltype( GRAY_DTYPE() + 0LL );

    const RGB_DTYPE* rgb_img_ptr = reinterpret_cast<const RGB_DTYPE*>( rgb_img.data );
    const int img_width = rgb_img.size().width;
    const int img_height = rgb_img.size().height;
    const int rgb_img_step = static_cast<int>( rgb_img.step1() );

    // Create output gray cv::Mat
    cv::Mat gray_img( img_height, img_width, GRAY_CVTYPE );
    GRAY_DTYPE* gray_img_ptr = reinterpret_cast<GRAY_DTYPE*>( gray_img.data );
    const int gray_img_step = static_cast<int>( gray_img.step1() );

    #pragma omp parallel for
    for ( int row_i = 0; row_i < img_height; row_i++ )
    {
        const RGB_DTYPE* rgb_row_ptr = rgb_img_ptr + row_i * rgb_img_step;
        GRAY_DTYPE* gray_row_ptr = gray_img_ptr + row_i * gray_img_step;

        UNROLL_LOOP( 32 ) // multiplier of cache line size
        for ( int col_j = 0; col_j < img_width; col_j++ )
        {
            acc_t sum_ij = acc_t( 0 );

            sum_ij += rgb_row_ptr[ col_j * 3 + 0 ];
            sum_ij += rgb_row_ptr[ col_j * 3 + 1 ];
            sum_ij += rgb_row_ptr[ col_j * 3 + 2 ];

            sum_ij /= 3;

            gray_row_ptr[ col_j ] = static_cast<GRAY_DTYPE>( sum_ij );
        }
    }

    // OpenCV uses reference counting, so returning does not deep copy the data
    return gray_img;
}


/**
 * @brief Print a square tile of an image to stdout, one text line per tile
 *        row, followed by an empty line.
 *
 * Elements are addressed as data[ row * step1() + col ] and converted to
 * unsigned int before printing.
 *
 * @tparam T element type used to read the image data
 * @param img image to read from
 * @param tile_size side length of the tile in elements
 * @param start_idx_row first row of the tile
 * @param start_idx_col first column of the tile
 *
 * @note The tile is not checked against the image bounds; the caller must
 *       make sure it lies inside the image.
 */
template <typename T>
void print_tile( const cv::Mat& img, int tile_size, int start_idx_row, int start_idx_col )
{
    const T* img_ptr = reinterpret_cast<const T*>( img.data );
    const int src_step = static_cast<int>( img.step1() );

    const int end_idx_row = tile_size + start_idx_row;
    const int end_idx_col = tile_size + start_idx_col;

    for ( int row = start_idx_row; row < end_idx_row; ++row )
    {
        const T* img_ptr_row = img_ptr + row * src_step;
        for ( int col = start_idx_col; col < end_idx_col; ++col )
        {
            // Explicit cast so the variadic argument always matches %u,
            // whatever T is
            printf("%u ", static_cast<unsigned int>( img_ptr_row[ col ] ) );
        }
        printf("\n");
    }
    printf("\n");
}


/**
 * @brief Print the top-left region of an image to stdout, one text line per
 *        image row, followed by an empty line.
 *
 * A header line with the image size and the printed range comes first.
 * Elements are addressed as data[ row * step1() + col ] and converted to
 * unsigned int before printing.
 *
 * @tparam T element type used to read the image data
 * @param img image to print
 * @param img_height number of rows to print; -1 (default) means all rows.
 *      Larger values are clamped to the image height.
 * @param img_width number of columns to print; -1 (default) means all
 *      columns. Larger values are clamped to the image width.
 */
template< typename T>
void print_img( const cv::Mat& img, int img_height = -1, int img_width = -1 )
{
    const T* img_ptr = reinterpret_cast<const T*>( img.data );
    const int full_height = img.size().height;
    const int full_width  = img.size().width;

    // Resolve each dimension on its own, so that passing only one of them
    // (and leaving the other at -1) still prints the full extent of the other.
    img_height = ( img_height == -1 ) ? full_height : std::min( full_height, img_height );
    img_width  = ( img_width  == -1 ) ? full_width  : std::min( full_width, img_width );

    printf("Image size (h=%d, w=%d), Print range (h=0-%d, w=0-%d)]\n", \
        full_height, full_width, img_height, img_width );

    const int img_step = static_cast<int>( img.step1() );

    for ( int row = 0; row < img_height; ++row )
    {
        const T* img_ptr_row = img_ptr + row * img_step;
        for ( int col = 0; col < img_width; ++col )
        {
            // Explicit cast so the variadic argument always matches %u,
            // whatever T is
            printf("%u ", static_cast<unsigned int>( img_ptr_row[ col ] ) );
        }
        printf("\n");
    }
    printf("\n");
}

} // namespace hdrplus