optimize unroll

main
Xiao Song 3 years ago
parent 5b729dc11a
commit 7e9217b01f

@ -6,8 +6,8 @@ project(hdrplus)
# set c++ standard
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED True)
# Always optimise at -O3 and enable the common warning set for both C and C++
# sources. Each flag is appended exactly once so the compiler command line does
# not carry a duplicated -O3.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -Wall")
# make sure we use Release and warn otherwise
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)

@ -17,6 +17,7 @@
step 3:
```shell
cd LibRaw-X.YY
autoreconf -f -i
./configure # with optional args
make
```

@ -3,21 +3,32 @@
#include <string>
#include <stdexcept> // std::runtime_error
#include <opencv2/opencv.hpp> // all opencv header
// TODO: add openmp support
#if defined(__clang__)
#define LOOP_UNROLL unroll
#elif defined(__GNUC__) || defined(__GNUG__)
#define LOOP_UNROLL GCC unroll
#elif defined(_MSC_VER)
#define LOOP_UNROLL unroll
// https://stackoverflow.com/questions/63404539/portable-loop-unrolling-with-template-parameter-in-c-with-gcc-icc
/// Helper macros for stringification.
/// Two levels are used so that a macro argument is expanded first and only
/// then turned into a string literal by the `#` operator.
#define TO_STRING_HELPER(X) #X
#define TO_STRING(X) TO_STRING_HELPER(X)
// Define loop unrolling depending on the compiler.
// UNROLL_LOOP(n) expands to a _Pragma and is meant to be written on the line
// directly before the `for` loop it applies to; n is the requested unroll count.
#if defined(__ICC) || defined(__ICL)
// Intel compilers: emits `#pragma unroll (n)`.
#define UNROLL_LOOP(n) _Pragma(TO_STRING(unroll (n)))
#elif defined(__clang__)
// Clang: emits `#pragma unroll (n)`.
#define UNROLL_LOOP(n) _Pragma(TO_STRING(unroll (n)))
#elif defined(__GNUC__) && !defined(__clang__)
// GCC: the argument n is NOT used here; the pragma always requests a factor of 16.
// NOTE(review): presumably a workaround for GCC not accepting a template
// parameter as the unroll count (see the link above) - confirm before changing.
#define UNROLL_LOOP(n) _Pragma(TO_STRING(GCC unroll (16)))
#elif defined(_MSC_BUILD)
// MSVC: print a note at compile time and make UNROLL_LOOP expand to nothing.
#pragma message ("Microsoft Visual C++ (MSVC) detected: Loop unrolling not supported!")
#define UNROLL_LOOP(n)
#else
// Any other compiler: warn at compile time and make UNROLL_LOOP expand to nothing.
#warning "Unknown compiler: Loop unrolling not supported!"
#define UNROLL_LOOP(n)
#endif
namespace hdrplus
{
template <typename T, int kernel>
cv::Mat box_filter_kxk( const cv::Mat& src_image )
{
@ -45,10 +56,11 @@ cv::Mat box_filter_kxk( const cv::Mat& src_image )
{
// Take ceiling for rounding
T box_sum = T( 0 );
//#pragma LOOP_UNROLL
UNROLL_LOOP( kernel )
for ( int kernel_row_i = 0; kernel_row_i < kernel; ++kernel_row_i )
{
//#pragma LOOP_UNROLL
UNROLL_LOOP( kernel )
for ( int kernel_col_i = 0; kernel_col_i < kernel; ++kernel_col_i )
{
box_sum += src_image_ptr[ ( row_i * kernel + kernel_row_i ) * src_step + ( col_i * kernel + kernel_col_i ) ];
@ -84,6 +96,7 @@ cv::Mat downsample_nearest_neighbour( const cv::Mat& src_image )
// -O3 should be enough to optimize the code below
for ( int row_i = 0; row_i < dst_height; row_i++ )
{
UNROLL_LOOP( 32 )
for ( int col_i = 0; col_i < dst_width; col_i++ )
{
dst_image_ptr[ row_i * dst_step + col_i ] = \
@ -184,8 +197,6 @@ template <typename T>
void print_tile( const cv::Mat& img, int tile_size, int start_idx_row, int start_idx_col )
{
const T* img_ptr = (T*)img.data;
int src_height = img.size().height;
int src_width = img.size().width;
int src_step = img.step1();
for ( int row = start_idx_row; row < tile_size + start_idx_row; ++row )

@ -30,7 +30,7 @@ static void build_per_grayimg_pyramid( \
images_pyramid.resize( inv_scale_factors.size() );
for ( int i = 0; i < inv_scale_factors.size(); ++i )
for ( size_t i = 0; i < inv_scale_factors.size(); ++i )
{
cv::Mat blur_image;
cv::Mat downsample_image;
@ -81,7 +81,7 @@ static void build_upsampled_prev_aligement( \
constexpr int repeat_factor = pyramid_scale_factor_prev_curr / tilesize_scale_factor_prev_curr;
// printf("build_upsampled_prev_aligement with scale factor %d, repeat factor %d, tile size factor %d\n", \
pyramid_scale_factor_prev_curr, repeat_factor, tilesize_scale_factor_prev_curr );
// pyramid_scale_factor_prev_curr, repeat_factor, tilesize_scale_factor_prev_curr );
int dst_height = src_height * repeat_factor;
int dst_width = src_width * repeat_factor;
@ -107,8 +107,10 @@ static void build_upsampled_prev_aligement( \
align_i.second *= pyramid_scale_factor_prev_curr;
// repeat
UNROLL_LOOP( repeat_factor )
for ( int repeat_row_i = 0; repeat_row_i < repeat_factor; ++repeat_row_i )
{
UNROLL_LOOP( repeat_factor )
for ( int repeat_col_i = 0; repeat_col_i < repeat_factor; ++repeat_col_i )
{
dst_alignment[ row_i * repeat_factor + repeat_row_i ][ col_i * repeat_factor + repeat_col_i ] = align_i;
@ -167,6 +169,7 @@ static unsigned long long l1_distance( const cv::Mat& img1, const cv::Mat& img2,
const data_type* img1_ptr_row_i = img1_ptr + (img1_tile_row_start_idx + row_i) * img1_step + img1_tile_col_start_idx;
const data_type* img2_ptr_row_i = img2_ptr + (img2_tile_row_start_idx + row_i) * img2_step + img2_tile_col_start_idx;
UNROLL_LOOP( tile_size )
for ( int col_i = 0; col_i < tile_size; ++col_i )
{
data_type l1 = CUSTOME_ABS( img1_ptr_row_i[ col_i ] - img2_ptr_row_i[ col_i ] );
@ -232,6 +235,7 @@ static return_type l2_distance( const cv::Mat& img1, const cv::Mat& img2, \
const data_type* img1_ptr_row_i = img1_ptr + (img1_tile_row_start_idx + row_i) * img1_step + img1_tile_col_start_idx;
const data_type* img2_ptr_row_i = img2_ptr + (img2_tile_row_start_idx + row_i) * img2_step + img2_tile_col_start_idx;
UNROLL_LOOP( tile_size )
for ( int col_i = 0; col_i < tile_size; ++col_i )
{
data_type l1 = CUSTOME_ABS( img1_ptr_row_i[ col_i ] - img2_ptr_row_i[ col_i ] );
@ -372,7 +376,7 @@ void align_image_level( \
// printf("Alter image pad h=%d, w=%d: \n", alt_img_pad.size().height, alt_img_pad.size().width );
// print_img<uint16_t>( alt_img_pad );
//printf("!! enlarged tile size %d\n", curr_tile_size + 2 * search_radiou );
// printf("!! enlarged tile size %d\n", curr_tile_size + 2 * search_radiou );
int alt_tile_row_idx_max = alt_img_pad.size().height - ( curr_tile_size + 2 * search_radiou );
int alt_tile_col_idx_max = alt_img_pad.size().width - ( curr_tile_size + 2 * search_radiou );
@ -389,8 +393,8 @@ void align_image_level( \
int ref_tile_row_start_idx_i = ref_tile_row_i * curr_tile_size / 2;
int ref_tile_col_start_idx_i = ref_tile_col_i * curr_tile_size / 2;
//printf("\nRef img tile [%d, %d] -> start idx [%d, %d] (row, col)\n", \
ref_tile_row_i, ref_tile_col_i, ref_tile_row_start_idx_i, ref_tile_col_start_idx_i );
// printf("\nRef img tile [%d, %d] -> start idx [%d, %d] (row, col)\n", \
// ref_tile_row_i, ref_tile_col_i, ref_tile_row_start_idx_i, ref_tile_col_start_idx_i );
// printf("\nRef img tile [%d, %d]\n", ref_tile_row_i, ref_tile_col_i );
// print_tile<uint16_t>( ref_img, curr_tile_size, ref_tile_row_start_idx_i, ref_tile_col_start_idx_i );
@ -410,21 +414,21 @@ void align_image_level( \
alt_tile_col_start_idx_i = 0;
if ( alt_tile_row_start_idx_i > alt_tile_row_idx_max )
{
int before = alt_tile_row_start_idx_i;
// int before = alt_tile_row_start_idx_i;
alt_tile_row_start_idx_i = alt_tile_row_idx_max;
// printf("@@ change start x from %d to %d\n", before, alt_tile_row_idx_max);
}
if ( alt_tile_col_start_idx_i > alt_tile_col_idx_max )
{
int before = alt_tile_col_start_idx_i;
// int before = alt_tile_col_start_idx_i;
alt_tile_col_start_idx_i = alt_tile_col_idx_max;
// printf("@@ change start y from %d to %d\n", before, alt_tile_col_idx_max );
}
// Because the alternate image is padded by the search radius on every side,
// using the same coordinates as the reference image automatically accounts for search radius * 2.
//printf("Alt image tile [%d, %d]-> start idx [%d, %d]\n", \
ref_tile_row_i, ref_tile_col_i, alt_tile_row_start_idx_i, alt_tile_col_start_idx_i );
// printf("Alt image tile [%d, %d]-> start idx [%d, %d]\n", \
// ref_tile_row_i, ref_tile_col_i, alt_tile_row_start_idx_i, alt_tile_col_start_idx_i );
// printf("\nAlt image tile [%d, %d]\n", ref_tile_row_i, ref_tile_col_i );
// print_tile<uint16_t>( alt_img_pad, curr_tile_size + 2 * search_radiou, alt_tile_row_start_idx_i, alt_tile_col_start_idx_i );
@ -436,16 +440,16 @@ void align_image_level( \
{
for ( int search_col_j = 0; search_col_j < ( search_radiou * 2 + 1 ); search_col_j++ )
{
//printf("\n--->tile at [%d, %d] search (%d, %d)\n", \
ref_tile_row_i, ref_tile_col_i, search_row_j - search_radiou, search_col_j - search_radiou );
// printf("\n--->tile at [%d, %d] search (%d, %d)\n", \
// ref_tile_row_i, ref_tile_col_i, search_row_j - search_radiou, search_col_j - search_radiou );
// TODO: currently distance is incorrect
unsigned long long distance_j = distance_func_ptr( ref_img, alt_img_pad, \
ref_tile_row_start_idx_i, ref_tile_col_start_idx_i, \
alt_tile_row_start_idx_i + search_row_j, alt_tile_col_start_idx_i + search_col_j );
//printf("<---tile at [%d, %d] search (%d, %d), new dis %llu, old dis %llu\n", \
ref_tile_row_i, ref_tile_col_i, search_row_j - search_radiou, search_col_j - search_radiou, distance_j, min_distance_i );
// printf("<---tile at [%d, %d] search (%d, %d), new dis %llu, old dis %llu\n", \
// ref_tile_row_i, ref_tile_col_i, search_row_j - search_radiou, search_col_j - search_radiou, distance_j, min_distance_i );
// If this is smaller distance
if ( distance_j < min_distance_i )
@ -456,30 +460,30 @@ void align_image_level( \
}
// If same value, choose the one closer to the original tile location
// if ( distance_j == min_distance_i && min_distance_row_i != -1 && min_distance_col_i != -1 )
// {
// int prev_distance_row_2_ref = min_distance_row_i - search_radiou;
// int prev_distance_col_2_ref = min_distance_col_i - search_radiou;
// int curr_distance_row_2_ref = search_row_j - search_radiou;
// int curr_distance_col_2_ref = search_col_j - search_radiou;
// int prev_distance_2_ref_sqr = prev_distance_row_2_ref * prev_distance_row_2_ref + prev_distance_col_2_ref * prev_distance_col_2_ref;
// int curr_distance_2_ref_sqr = curr_distance_row_2_ref * curr_distance_row_2_ref + curr_distance_col_2_ref * curr_distance_col_2_ref;
// // previous min distance idx is farther away from ref tile start location
// if ( prev_distance_2_ref_sqr > curr_distance_2_ref_sqr )
// {
// // printf("@@@ Same distance %d, choose closer one (%d, %d) instead of (%d, %d)\n", \
// distance_j, search_row_j, search_col_j, min_distance_row_i, min_distance_col_i);
// min_distance_col_i = search_col_j;
// min_distance_row_i = search_row_j;
// }
// }
if ( distance_j == min_distance_i && min_distance_row_i != -1 && min_distance_col_i != -1 )
{
int prev_distance_row_2_ref = min_distance_row_i - search_radiou;
int prev_distance_col_2_ref = min_distance_col_i - search_radiou;
int curr_distance_row_2_ref = search_row_j - search_radiou;
int curr_distance_col_2_ref = search_col_j - search_radiou;
int prev_distance_2_ref_sqr = prev_distance_row_2_ref * prev_distance_row_2_ref + prev_distance_col_2_ref * prev_distance_col_2_ref;
int curr_distance_2_ref_sqr = curr_distance_row_2_ref * curr_distance_row_2_ref + curr_distance_col_2_ref * curr_distance_col_2_ref;
// previous min distance idx is farther away from ref tile start location
if ( prev_distance_2_ref_sqr > curr_distance_2_ref_sqr )
{
// printf("@@@ Same distance %d, choose closer one (%d, %d) instead of (%d, %d)\n", \
// distance_j, search_row_j, search_col_j, min_distance_row_i, min_distance_col_i);
min_distance_col_i = search_col_j;
min_distance_row_i = search_row_j;
}
}
}
}
//printf("tile at (%d, %d) alignment (%d, %d)\n", \
ref_tile_row_i, ref_tile_col_i, min_distance_row_i, min_distance_col_i );
// printf("tile at (%d, %d) alignment (%d, %d)\n", \
// ref_tile_row_i, ref_tile_col_i, min_distance_row_i, min_distance_col_i );
int alignment_row_i = prev_alignment_row_i + min_distance_row_i - search_radiou;
int alignment_col_i = prev_alignment_col_i + min_distance_col_i - search_radiou;
@ -516,38 +520,6 @@ void align_image_level( \
}
static void build_per_pyramid_reftiles_start( \
std::vector<std::vector<std::vector<std::pair<int, int>>>>& per_pyramid_reftiles_start, \
const std::vector<std::vector<cv::Mat>>& per_grayimg_pyramid, \
const std::vector<int>& grayimg_tile_sizes )
{
per_pyramid_reftiles_start.resize( per_grayimg_pyramid.at(0).size() );
// Every image pyramid level
for ( int level_i = 0; level_i < per_grayimg_pyramid.at(0).size(); level_i++ )
{
int level_i_img_h = per_grayimg_pyramid.at(0).at( level_i ).size().height;
int level_i_img_w = per_grayimg_pyramid.at(0).at( level_i ).size().width;
int level_i_tile_size = grayimg_tile_sizes.at( level_i );
int num_tiles_h = level_i_img_h / (level_i_tile_size / 2) - 1;
int num_tiles_w = level_i_img_w / (level_i_tile_size / 2) - 1;
// Allocate memory
per_pyramid_reftiles_start.at( level_i ).resize( num_tiles_h, std::vector<std::pair<int, int>>( num_tiles_w ) );
for ( int tile_col_i = 0; tile_col_i < num_tiles_h; tile_col_i++ )
{
for ( int tile_row_j = 0; tile_row_j < num_tiles_w; tile_row_j++ )
{
per_pyramid_reftiles_start.at( level_i ).at( tile_col_i ).at( tile_row_j ) \
= std::make_pair<int, int>( tile_col_i * level_i_tile_size, tile_row_j * level_i_tile_size );
}
}
}
}
void align::process( const hdrplus::burst& burst_images, \
std::vector<std::vector<std::vector<std::pair<int, int>>>>& images_alignment )

@ -30,7 +30,7 @@ burst::burst( const std::string& burst_path, const std::string& reference_image_
// Find reference image path in input directory
// reference image path need to be absolute path
reference_image_idx = -1;
for ( int i = 0; i < bayer_image_paths.size(); ++i )
for ( size_t i = 0; i < bayer_image_paths.size(); ++i )
{
if ( bayer_image_paths[ i ] == reference_image_path )
{

@ -43,7 +43,7 @@ void test_align_one_level(int argc, char** argv)
int num_tiles_h = rggb_imgs.at(0).size().height / ( tilesize / 2 ) - 1;
int num_tiles_w = rggb_imgs.at(0).size().width / ( tilesize / 2 ) - 1;
for ( int img_channel = 0; img_channel < rggb_imgs.size(); ++img_channel )
for ( int img_channel = 0; img_channel < int(rggb_imgs.size()); ++img_channel )
{
for ( int tile_row_i = 0; tile_row_i < num_tiles_h; ++tile_row_i )
{

Loading…
Cancel
Save