Classes
struct	gpufilter::_alg_setup
	Algorithm setup to configure the GPU to run. More...
Modules
	GPU Computation functions
Functions
void	gpufilter::calc_alg_setup (alg_setup &algs, const int &w, const int &h)
	Calculate algorithm setup values.
void	gpufilter::calc_alg_setup (alg_setup &algs, const int &w, const int &h, const int &extb)
	Calculate algorithm setup values for a work image with extension blocks.
void	gpufilter::up_alg_setup (const alg_setup &algs)
	Upload algorithm setup values.
void	gpufilter::up_constants_coefficients1 (const float &b0, const float &a1)
	Upload device constants first-order coefficients.
void	gpufilter::up_constants_coefficients2 (const float &b0, const float &a1, const float &a2)
	Upload device constants second-order coefficients.
void	gpufilter::prepare_algSAT (alg_setup &algs, dvector< float > &d_inout, dvector< float > &d_ybar, dvector< float > &d_vhat, dvector< float > &d_ysum, const float *h_in, const int &w, const int &h)
	Prepare for Algorithm SAT.
void	gpufilter::algSAT (dvector< float > &d_out, dvector< float > &d_ybar, dvector< float > &d_vhat, dvector< float > &d_ysum, const dvector< float > &d_in, const alg_setup &algs)
	Compute Algorithm SAT.
void	gpufilter::algSAT (dvector< float > &d_inout, dvector< float > &d_ybar, dvector< float > &d_vhat, dvector< float > &d_ysum, const alg_setup &algs)
	Compute Algorithm SAT.
void	gpufilter::algSAT (float *inout, const int &w, const int &h)
	Compute Algorithm SAT.
void	gpufilter::prepare_alg4 (alg_setup &algs, alg_setup &algs_transp, dvector< float > &d_out, dvector< float > &d_transp_out, dvector< float2 > &d_transp_pybar, dvector< float2 > &d_transp_ezhat, dvector< float2 > &d_pubar, dvector< float2 > &d_evhat, cudaArray *&a_in, const float *h_in, const int &w, const int &h, const float &b0, const float &a1, const float &a2, const int &extb=0, const initcond &ic=zero)
	Prepare for Algorithm 4.
void	gpufilter::alg4 (dvector< float > &d_out, dvector< float > &d_transp_out, dvector< float2 > &d_transp_pybar, dvector< float2 > &d_transp_ezhat, dvector< float2 > &d_pubar, dvector< float2 > &d_evhat, const cudaArray *a_in, const alg_setup &algs, const alg_setup &algs_transp)
	Compute Algorithm 4 (second-order)
void	gpufilter::alg4 (float *h_inout, const int &w, const int &h, const float &b0, const float &a1, const float &a2, const int &extb=0, const initcond &ic=zero)
	Compute Algorithm 4 (second-order)
void	gpufilter::prepare_alg5 (alg_setup &algs, dvector< float > &d_out, dvector< float > &d_transp_pybar, dvector< float > &d_transp_ezhat, dvector< float > &d_ptucheck, dvector< float > &d_etvtilde, cudaArray *&a_in, const float *h_in, const int &w, const int &h, const float &b0, const float &a1, const int &extb=0, const initcond &ic=zero)
	Prepare for Algorithm 5.
void	gpufilter::alg5 (dvector< float > &d_out, dvector< float > &d_transp_pybar, dvector< float > &d_transp_ezhat, dvector< float > &d_ptucheck, dvector< float > &d_etvtilde, const cudaArray *a_in, const alg_setup &algs)
	Compute Algorithm 5 (first-order)
void	gpufilter::alg5 (float *h_inout, const int &w, const int &h, const float &b0, const float &a1, const int &extb=0, const initcond &ic=zero)
	Compute Algorithm 5 (first-order)
void	gpufilter::gaussian_gpu (float **inout, const int &w, const int &h, const int &d, const float &s, const int &extb=1, const initcond &ic=clamp)
	Gaussian blur an image in the GPU.
void	gpufilter::gaussian_gpu (float *inout, const int &w, const int &h, const float &s, const int &extb=1, const initcond &ic=clamp)
	Gaussian blur a single-channel image in the GPU.
void	gpufilter::bspline3i_gpu (float **inout, const int &w, const int &h, const int &d, const int &extb=1, const initcond &ic=mirror)
	Compute the Bicubic B-Spline interpolation of an image in the GPU.
void	gpufilter::bspline3i_gpu (float *inout, const int &w, const int &h, const int &extb=1, const initcond &ic=mirror)
	Compute the Bicubic B-Spline interpolation of a single-channel image in the GPU.
__host__ void	gpufilter::up_texture (cudaArray *&a_in, const float *h_in, const int &w, const int &h, const initcond &ic)
	Upload input image as a texture in device.

Function Documentation

void gpufilter::alg4	(	dvector< float > &	d_out,
		dvector< float > &	d_transp_out,
		dvector< float2 > &	d_transp_pybar,
		dvector< float2 > &	d_transp_ezhat,
		dvector< float2 > &	d_pubar,
		dvector< float2 > &	d_evhat,
		const cudaArray *	a_in,
		const alg_setup &	algs,
		const alg_setup &	algs_transp
	)

Compute Algorithm 4 (second-order)

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Note: For performance purposes (in the CUDA kernels implementation) this function only works with a minimum image resolution of $64^2$, and only with image sizes that are multiples of 64 in each dimension.

Parameters:

[out]	d_out	The output 2D image allocated in device memory
[out]	d_transp_out	The transposed output 2D image used in the middle of the computation
[out]	d_transp_pybar	The $P_{m,n}(\bar{Y})$ allocated in device memory
[out]	d_transp_ezhat	The $E_{m,n}(\hat{Z})$ allocated in device memory
[out]	d_pubar	The $P^T_{m,n}(\bar{U})$ allocated in device memory
[out]	d_evhat	The $E^T_{m,n}(\hat{V})$ allocated in device memory
[in]	a_in	The input 2D image allocated in device memory as cudaArray
[in]	algs	Algorithm setup to be uploaded to the GPU
[in]	algs_transp	Algorithm setup transposed to be used in the middle of the computation

Examples: example_r4.cc and example_r5.cc.

void gpufilter::alg4	(	float *	h_inout,
		const int &	w,
		const int &	h,
		const float &	b0,
		const float &	a1,
		const float &	a2,
		const int &	extb = `0`,
		const initcond &	ic = `zero`
	)

Compute Algorithm 4 (second-order)

This function computes second-order recursive filtering with given feedback and feedforward coefficients of an input 2D image using algorithm $4_2$.

The algorithm 4 is discussed in depth in our paper ([Nehab:2011] cited in alg5() function).

Parameters:

[in,out]	h_inout	The in/output 2D image to compute recursive filtering
[in]	w	Image width
[in]	h	Image height
[in]	b0	Feedforward coefficient
[in]	a1	Feedback first-order coefficient
[in]	a2	Feedback second-order coefficient
[in]	extb	Extension (in blocks) to consider outside image (default 0)
[in]	ic	Initial condition (for outside access) (default zero)

void gpufilter::alg5	(	dvector< float > &	d_out,
		dvector< float > &	d_transp_pybar,
		dvector< float > &	d_transp_ezhat,
		dvector< float > &	d_ptucheck,
		dvector< float > &	d_etvtilde,
		const cudaArray *	a_in,
		const alg_setup &	algs
	)

Compute Algorithm 5 (first-order)

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Note: For performance purposes (in the CUDA kernels implementation) this function only works with a minimum image resolution of $64^2$, and only with image sizes that are multiples of 64 in each dimension.

Parameters:

[out]	d_out	The output 2D image allocated in device memory
[out]	d_transp_pybar	The $P_{m,n}(\bar{Y})$ allocated in device memory
[out]	d_transp_ezhat	The $E_{m,n}(\hat{Z})$ allocated in device memory
[out]	d_ptucheck	The $P^T_{m,n}(\check{U})$ allocated in device memory
[out]	d_etvtilde	The $E^T_{m,n}(\tilde{V})$ allocated in device memory
[in]	a_in	The input 2D image allocated in device memory as cudaArray
[in]	algs	Algorithm setup used for this computation

Examples: example_r2.cc and example_r3.cc.

void gpufilter::alg5	(	float *	h_inout,
		const int &	w,
		const int &	h,
		const float &	b0,
		const float &	a1,
		const int &	extb = `0`,
		const initcond &	ic = `zero`
	)

Compute Algorithm 5 (first-order)

This function computes first-order recursive filtering with given feedback and feedforward coefficients of an input 2D image using algorithm $5_1$.

The algorithm 5 is discussed in depth in our paper:

@inproceedings{Nehab:2011,
  title = {{GPU}-{E}fficient {R}ecursive {F}iltering and {S}ummed-{A}rea {T}ables},
  author = {{N}ehab, {D}. and {M}aximo, {A}. and {L}ima, {R}. {S}. and {H}oppe, {H}.},
  journal = {{ACM} {T}ransactions on {G}raphics ({P}roceedings of the {ACM} {SIGGRAPH} {A}sia 2011)},
  year = {2011},
  volume = {30},
  number = {6},
  doi = {},
  publisher = {ACM},
  address = {{N}ew {Y}ork, {NY}, {USA}}
}

Parameters:

[in,out]	h_inout	The in/output 2D image to compute recursive filtering in host memory
[in]	w	Image width
[in]	h	Image height
[in]	b0	Feedforward coefficient
[in]	a1	Feedback coefficient
[in]	extb	Extension (in blocks) to consider outside image (default 0)
[in]	ic	Initial condition (for outside access) (default zero)

void gpufilter::algSAT	(	dvector< float > &	d_out,
		dvector< float > &	d_ybar,
		dvector< float > &	d_vhat,
		dvector< float > &	d_ysum,
		const dvector< float > &	d_in,
		const alg_setup &	algs
	)

Compute Algorithm SAT.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Note: For performance purposes (in the CUDA kernels implementation) this function works better when the image size is a multiple of 32 in each dimension.

See also: Base algSAT() function and [Nehab:2011] cited in alg5() function

Parameters:

[out]	d_out	The output 2D image allocated in device memory
[out]	d_ybar	The $P_{m,n}(\bar{Y})$ allocated in device memory
[out]	d_vhat	The $P^T_{m,n}(\hat{V})$ allocated in device memory
[out]	d_ysum	The $s(P_{m,n}(Y))$ allocated in device memory
[in]	d_in	The input 2D image allocated in device memory
[in]	algs	Algorithm setup used for this computation

Examples: example_sat2.cc and example_sat3.cc.

void gpufilter::algSAT	(	dvector< float > &	d_inout,
		dvector< float > &	d_ybar,
		dvector< float > &	d_vhat,
		dvector< float > &	d_ysum,
		const alg_setup &	algs
	)

Compute Algorithm SAT.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Note: For performance purposes (in the CUDA kernels implementation) this function works better when the image size is a multiple of 32 in each dimension.

See also: Base algSAT() function and [Nehab:2011] cited in alg5() function

Parameters:

[in,out]	d_inout	The in/output 2D image allocated in device memory
[out]	d_ybar	The $P_{m,n}(\bar{Y})$ allocated in device memory
[out]	d_vhat	The $P^T_{m,n}(\hat{V})$ allocated in device memory
[out]	d_ysum	The $s(P_{m,n}(Y))$ allocated in device memory
[in]	algs	Algorithm setup used for this computation

void gpufilter::algSAT	(	float *	inout,
		const int &	w,
		const int &	h
	)

Compute Algorithm SAT.

This function computes the summed-area table (SAT) of an input 2D image using algorithm SAT.

The algorithm SAT is discussed in depth in our paper (see [Nehab:2011] in alg5() function) where the following image illustrates the process:

Illustration of Algorithm SAT

Overlapped summed-area table computation according to algorithm SAT. Stage S.1 reads the input (in gray) then computes and stores incomplete prologues $P_{m,n}(\bar{Y})$ (in red) and $P^T_{m,n}(\hat{V})$ (in blue). Stage S.2 completes prologues $P_{m,n}(Y)$ and computes scalars $s\big(P_{m-1,n}(Y)\big)$ (in yellow). Stage S.3 completes prologues $P^T_{m,n}(V)$. Finally, stage S.4 reads the input and completed prologues, then computes and stores the final summed-area table.

Parameters:

[in,out]	inout	The in/output 2D image to compute SAT
[in]	w	Image width
[in]	h	Image height

void gpufilter::bspline3i_gpu	(	float **	inout,
		const int &	w,
		const int &	h,
		const int &	d,
		const int &	extb = `1`,
		const initcond &	ic = `mirror`
	)

Compute the Bicubic B-Spline interpolation of an image in the GPU.

Given an input 2D image compute the Bicubic B-Spline interpolation of it by applying a first-order recursive filter using zero-border initial conditions.

Parameters:

[in,out]	inout	The 2D image to compute the Bicubic B-Spline interpolation
[in]	w	Width of the input image
[in]	h	Height of the input image
[in]	d	Depth of the input image (color channels)
[in]	extb	Extension (in blocks) to consider outside image (default 1 block)
[in]	ic	Initial condition (for outside access) (default mirror)

Examples: app_recursive.cc and example_bspline.cc.

void gpufilter::bspline3i_gpu	(	float *	inout,
		const int &	w,
		const int &	h,
		const int &	extb = `1`,
		const initcond &	ic = `mirror`
	)

Compute the Bicubic B-Spline interpolation of a single-channel image in the GPU.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Parameters:

[in,out]	inout	The single-channel 2D image to compute the Bicubic B-Spline interpolation
[in]	w	Width of the input image
[in]	h	Height of the input image
[in]	extb	Extension (in blocks) to consider outside image (default 1 block)
[in]	ic	Initial condition (for outside access) (default mirror)

void gpufilter::calc_alg_setup	(	alg_setup &	algs,
		const int &	w,
		const int &	h
	)

Calculate algorithm setup values.

Given the dimensions of the 2D work image, calculate the device constant memory size-related values. It returns the setup to run any GPU algorithm.

Parameters:

[out]	algs	Algorithm setup to be uploaded to the GPU
[in]	w	Width of the work image
[in]	h	Height of the work image

void gpufilter::calc_alg_setup	(	alg_setup &	algs,
		const int &	w,
		const int &	h,
		const int &	extb
	)

Calculate algorithm setup values for a work image with extension blocks.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. Given the dimensions of the 2D work image, calculate the device constant memory size-related values. The work image is the original image plus extension blocks to run algorithms out-of-bounds. It returns the setup to run any GPU algorithm.

Parameters:

[out]	algs	Algorithm setup to be uploaded to the GPU
[in]	w	Width of the work image
[in]	h	Height of the work image
[in]	extb	Extension (in blocks) to consider outside image

void gpufilter::gaussian_gpu	(	float **	inout,
		const int &	w,
		const int &	h,
		const int &	d,
		const float &	s,
		const int &	extb = `1`,
		const initcond &	ic = `clamp`
	)

Gaussian blur an image in the GPU.

Given an input 2D image (with d color channels), compute its Gaussian blur by applying a first-order recursive filter (using alg5()) followed by a second-order recursive filter (using alg4()) and zero-border initial condition.

Parameters:

[in,out]	inout	The 2D image to compute Gaussian blur
[in]	w	Width of the input image
[in]	h	Height of the input image
[in]	d	Depth of the input image (color channels)
[in]	s	Sigma support of Gaussian blur computation
[in]	extb	Extension (in blocks) to consider outside image (default 1 block)
[in]	ic	Initial condition (for outside access) (default clamp)

Examples: app_recursive.cc and example_gauss.cc.

void gpufilter::gaussian_gpu	(	float *	inout,
		const int &	w,
		const int &	h,
		const float &	s,
		const int &	extb = `1`,
		const initcond &	ic = `clamp`
	)

Gaussian blur a single-channel image in the GPU.

This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.

Parameters:

[in,out]	inout	The single-channel 2D image to compute Gaussian blur
[in]	w	Width of the input image
[in]	h	Height of the input image
[in]	s	Sigma support of Gaussian blur computation
[in]	extb	Extension (in blocks) to consider outside image (default 1 block)
[in]	ic	Initial condition (for outside access) (default clamp)

void gpufilter::prepare_alg4	(	alg_setup &	algs,
		alg_setup &	algs_transp,
		dvector< float > &	d_out,
		dvector< float > &	d_transp_out,
		dvector< float2 > &	d_transp_pybar,
		dvector< float2 > &	d_transp_ezhat,
		dvector< float2 > &	d_pubar,
		dvector< float2 > &	d_evhat,
		cudaArray *&	a_in,
		const float *	h_in,
		const int &	w,
		const int &	h,
		const float &	b0,
		const float &	a1,
		const float &	a2,
		const int &	extb = `0`,
		const initcond &	ic = `zero`
	)

Prepare for Algorithm 4.

This function prepares the data structures used by the recursive filtering algorithm 4 (order 2) of an input 2D image.

The algorithm 4 is discussed in depth in our paper (see [Nehab:2011] in alg5() function) and it is implemented in alg4() function.

Parameters:

[out]	algs	Algorithm setup computed for initial computation
[out]	algs_transp	Algorithm setup transposed to be used in the middle of the computation
[out]	d_out	The output 2D image to be allocated in device memory
[out]	d_transp_out	The transposed output 2D image used in the middle of the computation
[out]	d_transp_pybar	The $P_{m,n}(\bar{Y})$ to be allocated in device memory
[out]	d_transp_ezhat	The $E_{m,n}(\hat{Z})$ to be allocated in device memory
[out]	d_pubar	The $P^T_{m,n}(\bar{U})$ to be allocated in device memory
[out]	d_evhat	The $E^T_{m,n}(\hat{V})$ to be allocated in device memory
[out]	a_in	The input 2D image as cudaArray to be allocated and copied to device memory
[in]	h_in	The input 2D image to compute algorithm 4 in host memory
[in]	w	Image width
[in]	h	Image height
[in]	b0	Feedforward coefficient
[in]	a1	Feedback first-order coefficient
[in]	a2	Feedback second-order coefficient
[in]	extb	Extension (in blocks) to consider outside image (default 0)
[in]	ic	Initial condition (for outside access) (default zero)

Examples: example_r5.cc.

void gpufilter::prepare_alg5	(	alg_setup &	algs,
		dvector< float > &	d_out,
		dvector< float > &	d_transp_pybar,
		dvector< float > &	d_transp_ezhat,
		dvector< float > &	d_ptucheck,
		dvector< float > &	d_etvtilde,
		cudaArray *&	a_in,
		const float *	h_in,
		const int &	w,
		const int &	h,
		const float &	b0,
		const float &	a1,
		const int &	extb = `0`,
		const initcond &	ic = `zero`
	)

Prepare for Algorithm 5.

This function prepares the data structures used by the recursive filtering algorithm 5 (order 1) of an input 2D image.

The algorithm 5 is discussed in depth in our paper (see [Nehab:2011] in alg5() function) and it is implemented in alg5() function.

Parameters:

[out]	algs	Algorithm setup computed and uploaded to the GPU
[out]	d_out	The output 2D image to be allocated in device memory
[out]	d_transp_pybar	The $P_{m,n}(\bar{Y})$ to be allocated in device memory
[out]	d_transp_ezhat	The $E_{m,n}(\hat{Z})$ to be allocated in device memory
[out]	d_ptucheck	The $P^T_{m,n}(\check{U})$ to be allocated in device memory
[out]	d_etvtilde	The $E^T_{m,n}(\tilde{V})$ to be allocated in device memory
[out]	a_in	The input 2D image as cudaArray to be allocated and copied to device memory
[in]	h_in	The input 2D image to compute algorithm 5 in host memory
[in]	w	Image width
[in]	h	Image height
[in]	b0	Feedforward coefficient
[in]	a1	Feedback coefficient
[in]	extb	Extension (in blocks) to consider outside image (default 0)
[in]	ic	Initial condition (for outside access) (default zero)

Examples: example_r3.cc.

void gpufilter::prepare_algSAT	(	alg_setup &	algs,
		dvector< float > &	d_inout,
		dvector< float > &	d_ybar,
		dvector< float > &	d_vhat,
		dvector< float > &	d_ysum,
		const float *	h_in,
		const int &	w,
		const int &	h
	)

Prepare for Algorithm SAT.

This function prepares the data structures used by the summed-area table (SAT) algorithm of an input 2D image.

The algorithm SAT is discussed in depth in our paper (see [Nehab:2011] in alg5() function) and it is implemented in algSAT() function.

Parameters:

[out]	algs	Algorithm setup computed and uploaded to the GPU
[out]	d_inout	The in/output 2D image to be allocated and copied to device memory
[out]	d_ybar	The $P_{m,n}(\bar{Y})$ to be allocated in device memory
[out]	d_vhat	The $P^T_{m,n}(\hat{V})$ to be allocated in device memory
[out]	d_ysum	The $s(P_{m,n}(Y))$ to be allocated in device memory
[in]	h_in	The input 2D image to compute SAT in host memory
[in]	w	Image width
[in]	h	Image height

Examples: example_sat3.cc.

void gpufilter::up_alg_setup ( const alg_setup & algs )

Upload algorithm setup values.

Given the algorithm setup, upload the values to the device constant memory.

Parameters:

[in] algs Algorithm setup to upload to the GPU

void gpufilter::up_constants_coefficients1	(	const float &	b0,
		const float &	a1
	)

Upload device constants first-order coefficients.

Given the first-order coefficients of the recursive filter, upload to the device constant memory the coefficients-related values.

Parameters:

[in]	b0	Feedforward coefficient
[in]	a1	Feedback first-order coefficient

void gpufilter::up_constants_coefficients2	(	const float &	b0,
		const float &	a1,
		const float &	a2
	)

Upload device constants second-order coefficients.

Given the second-order coefficients of the recursive filter, upload to the device constant memory the coefficients-related values.

Parameters:

[in]	b0	Feedforward coefficient
[in]	a1	Feedback first-order coefficient
[in]	a2	Feedback second-order coefficient

__host__ void gpufilter::up_texture	(	cudaArray *&	a_in,
		const float *	h_in,
		const int &	w,
		const int &	h,
		const initcond &	ic
	)

Upload input image as a texture in device.

Given an input image in the host, upload it to the device memory as a texture.

Parameters:

[out]	a_in	The input 2D image as cudaArray to be allocated and copied to device memory
[in]	h_in	The input 2D image in host memory to be uploaded
[in]	w	Image width
[in]	h	Image height
[in]	ic	Initial condition (for outside access)

Classes

Modules

Functions

Function Documentation