/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../vec_traits.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace detail
    {
        //! Mask accessor

        struct MaskReader
        {
            //! Stores a copy of the mask descriptor passed in by the host.
            explicit MaskReader(const PtrStep& mask_): mask(mask_) {}

            //! Reads the mask element at row y, column x and converts it to bool.
            __device__ __forceinline__ bool operator()(int y, int x) const
            {
                const bool enabled = mask.ptr(y)[x];
                return enabled;
            }

            const PtrStep mask;
        };

        struct NoMask
        {
            //! Mask functor that enables every element: the coordinates are ignored.
            __device__ __forceinline__ bool operator()(int /*y*/, int /*x*/) const
            {
                return true;
            }
        };

        //! Read Write Traits

        // Number of elements one thread handles in the unary vectorised kernel,
        // selected purely by the destination element size in bytes.

        //! Generic case: one element per thread.
        template <size_t SrcSize, size_t DstSize>
        struct UnReadWriteTraits_ { enum { shift = 1 }; };

        //! 1-byte destination elements: four elements per thread.
        template <size_t SrcSize>
        struct UnReadWriteTraits_<SrcSize, 1> { enum { shift = 4 }; };

        //! 2-byte destination elements: two elements per thread.
        template <size_t SrcSize>
        struct UnReadWriteTraits_<SrcSize, 2> { enum { shift = 2 }; };
        //! Type-level front end for UnReadWriteTraits_: exposes the per-thread element
        //! count (`shift`) and the matching vector types for source T and destination D.
        template <typename T, typename D> struct UnReadWriteTraits
        {
        private:
            typedef UnReadWriteTraits_<sizeof(T), sizeof(D)> size_traits;

        public:
            enum { shift = size_traits::shift };

            // Vector of `shift` source elements read in one access.
            typedef typename TypeVec<T, shift>::vec_type read_type;
            // Vector of `shift` destination elements written in one access.
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };

        // Number of elements one thread handles in the binary vectorised kernel,
        // selected purely by the destination element size in bytes.

        //! Generic case: one element per thread.
        template <size_t Src1Size, size_t Src2Size, size_t DstSize>
        struct BinReadWriteTraits_ { enum { shift = 1 }; };

        //! 1-byte destination elements: four elements per thread.
        template <size_t Src1Size, size_t Src2Size>
        struct BinReadWriteTraits_<Src1Size, Src2Size, 1> { enum { shift = 4 }; };

        //! 2-byte destination elements: two elements per thread.
        template <size_t Src1Size, size_t Src2Size>
        struct BinReadWriteTraits_<Src1Size, Src2Size, 2> { enum { shift = 2 }; };
        //! Type-level front end for BinReadWriteTraits_: exposes the per-thread element
        //! count (`shift`) and the matching vector types for sources T1, T2 and destination D.
        template <typename T1, typename T2, typename D> struct BinReadWriteTraits
        {
        private:
            typedef BinReadWriteTraits_<sizeof(T1), sizeof(T2), sizeof(D)> size_traits;

        public:
            enum { shift = size_traits::shift };

            // Vectors of `shift` elements for the first and second source.
            typedef typename TypeVec<T1, shift>::vec_type read_type1;
            typedef typename TypeVec<T2, shift>::vec_type read_type2;
            // Vector of `shift` destination elements written in one access.
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };

        //! Transform kernels

        //! Applies a functor component-wise to a vector of `shift` elements, honouring the mask.
        template <int shift> struct OpUnroller;

        //! One-component case: only `.x` is processed.
        template <> struct OpUnroller<1>
        {
            //! Unary form: dst.x = op(src.x) when mask(y, x_shifted) is true; otherwise dst is left untouched.
            //! The functor is taken by const reference, matching the wider OpUnroller specialisations.
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
            }

            //! Binary form: dst.x = op(src1.x, src2.x) when mask(y, x_shifted) is true; otherwise dst is left untouched.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
            }
        };
        //! Two-component case: `.x` and `.y` map to columns x_shifted and x_shifted + 1 of row y.
        template <> struct OpUnroller<2>
        {
            //! Unary form: each component of dst is assigned op(src component) only where the
            //! mask is true for the corresponding column; other components are left untouched.
            //! The functor is taken by const reference, matching the wider OpUnroller specialisations.
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
            }

            //! Binary form: each component of dst is assigned op(src1 component, src2 component)
            //! only where the mask is true for the corresponding column.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
            }
        };
        //! Three-component case: `.x`, `.y`, `.z` map to columns x_shifted .. x_shifted + 2 of row y.
        template <> struct OpUnroller<3>
        {
            //! Unary form: a component of `out` is overwritten with op(component of `in`)
            //! only when the mask enables its column.
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& in, D& out, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))     { out.x = op(in.x); }
                if (mask(y, x_shifted + 1)) { out.y = op(in.y); }
                if (mask(y, x_shifted + 2)) { out.z = op(in.z); }
            }

            //! Binary form: a component of `out` is overwritten with op(component of `a`, component of `b`)
            //! only when the mask enables its column.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& a, const T2& b, D& out, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))     { out.x = op(a.x, b.x); }
                if (mask(y, x_shifted + 1)) { out.y = op(a.y, b.y); }
                if (mask(y, x_shifted + 2)) { out.z = op(a.z, b.z); }
            }
        };
        //! Four-component case: `.x`, `.y`, `.z`, `.w` map to columns x_shifted .. x_shifted + 3 of row y.
        template <> struct OpUnroller<4>
        {
            //! Unary form: a component of `out` is overwritten with op(component of `in`)
            //! only when the mask enables its column.
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& in, D& out, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))     { out.x = op(in.x); }
                if (mask(y, x_shifted + 1)) { out.y = op(in.y); }
                if (mask(y, x_shifted + 2)) { out.z = op(in.z); }
                if (mask(y, x_shifted + 3)) { out.w = op(in.w); }
            }

            //! Binary form: a component of `out` is overwritten with op(component of `a`, component of `b`)
            //! only when the mask enables its column.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& a, const T2& b, D& out, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))     { out.x = op(a.x, b.x); }
                if (mask(y, x_shifted + 1)) { out.y = op(a.y, b.y); }
                if (mask(y, x_shifted + 2)) { out.z = op(a.z, b.z); }
                if (mask(y, x_shifted + 3)) { out.w = op(a.w, b.w); }
            }
        };

        //! Vectorised unary transform kernel: dst(y, x) = op(src(y, x)) wherever mask(y, x) is true.
        //!
        //! Each thread handles `shift` consecutive elements of row y (shift comes from
        //! UnReadWriteTraits<T, D>), reading and writing them as a single vector value.
        //! A thread whose span would run past the end of the row processes the remaining
        //! elements one at a time instead.
        //!
        //! Destination elements whose mask is false keep their previous value.
        //!
        //! NOTE(review): the vector casts assume the row pointers of src_ and dst_ are
        //! suitably aligned for read_type / write_type; the caller must guarantee this.
        template <typename T, typename D, typename UnOp, typename Mask>
        __global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep_<D> dst_, const Mask mask, const UnOp op)
        {
            typedef typename UnReadWriteTraits<T, D>::read_type read_type;
            typedef typename UnReadWriteTraits<T, D>::write_type write_type;
            const int shift = UnReadWriteTraits<T, D>::shift;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];

                    // Start from the current destination contents: unroll() only assigns the
                    // components enabled by the mask, and the whole vector is stored back below,
                    // so an uninitialised temporary would clobber masked-out elements.
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Row tail: fewer than `shift` elements remain, process them individually.
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }

        //! Scalar unary transform kernel: one thread per element,
        //! dst(row, col) = op(src(row, col)) wherever the mask is true.
        template <typename T, typename D, typename UnOp, typename Mask>
        static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep_<D> dst, const Mask mask, const UnOp op)
        {
            const int col = blockDim.x * blockIdx.x + threadIdx.x;
            const int row = blockDim.y * blockIdx.y + threadIdx.y;

            // Threads that fall outside the image have nothing to do.
            if (!(col < src.cols && row < src.rows))
                return;

            // The mask is only consulted for in-range coordinates.
            if (!mask(row, col))
                return;

            dst.ptr(row)[col] = op(src.ptr(row)[col]);
        }

        //! Vectorised binary transform kernel: dst(y, x) = op(src1(y, x), src2(y, x)) wherever
        //! mask(y, x) is true.
        //!
        //! Each thread handles `shift` consecutive elements of row y (shift comes from
        //! BinReadWriteTraits<T1, T2, D>), reading and writing them as single vector values.
        //! A thread whose span would run past the end of the row processes the remaining
        //! elements one at a time instead. The image extent is taken from src1_.
        //!
        //! Destination elements whose mask is false keep their previous value.
        //!
        //! NOTE(review): the vector casts assume the row pointers of src1_, src2_ and dst_ are
        //! suitably aligned for read_type1 / read_type2 / write_type; the caller must guarantee this.
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        __global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep_<T2> src2_, PtrStep_<D> dst_, 
            const Mask mask, const BinOp op)
        {
            typedef typename BinReadWriteTraits<T1, T2, D>::read_type1 read_type1;
            typedef typename BinReadWriteTraits<T1, T2, D>::read_type2 read_type2;
            typedef typename BinReadWriteTraits<T1, T2, D>::write_type write_type;
            const int shift = BinReadWriteTraits<T1, T2, D>::shift;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];

                    // Start from the current destination contents: unroll() only assigns the
                    // components enabled by the mask, and the whole vector is stored back below,
                    // so an uninitialised temporary would clobber masked-out elements.
                    write_type dst_n_el = ((const write_type*)dst)[x];
                    
                    OpUnroller<shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Row tail: fewer than `shift` elements remain, process them individually.
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }

        //! Scalar binary transform kernel: one thread per element,
        //! dst(row, col) = op(src1(row, col), src2(row, col)) wherever the mask is true.
        //! The image extent is taken from src1.
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep_<T2> src2, PtrStep_<D> dst, 
            const Mask mask, const BinOp op)
        {
            const int col = blockDim.x * blockIdx.x + threadIdx.x;
            const int row = blockDim.y * blockIdx.y + threadIdx.y;

            // Threads that fall outside the image have nothing to do.
            if (!(col < src1.cols && row < src1.rows))
                return;

            // The mask is only consulted for in-range coordinates.
            if (!mask(row, col))
                return;

            const T1 lhs = src1.ptr(row)[col];
            const T2 rhs = src2.ptr(row)[col];
            dst.ptr(row)[col] = op(lhs, rhs);
        }

        //! Host-side launcher, selected at compile time: <false> launches the scalar
        //! kernels, <true> the vectorised ones.
        template <bool UseSmart> struct TransformDispatcher;

        //! Launches transformSimple with 16x16 thread blocks, one thread per element.
        template<> struct TransformDispatcher<false>
        {
            //! Unary transform over src into dst on the given stream.
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
            {
                const dim3 block(16, 16, 1);
                const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);

                transformSimple<T, D><<<grid, block, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                // On the default stream, wait for the device to finish before returning.
                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            //! Binary transform over src1 and src2 into dst on the given stream;
            //! the grid is sized from src1.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
            {
                const dim3 block(16, 16, 1);
                const dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y), 1);

                transformSimple<T1, T2, D><<<grid, block, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                // On the default stream, wait for the device to finish before returning.
                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
        //! Launches the vectorised transformSmart kernels with 16x16 thread blocks, each thread
        //! covering `shift` consecutive elements of a row.
        //!
        //! The vectorised kernels reinterpret row pointers as vectors of `shift` elements, so
        //! every base pointer and row step must be a multiple of shift * sizeof(element).
        //! When that does not hold (for example a sub-matrix starting at an odd column),
        //! the call is forwarded to the scalar TransformDispatcher<false> instead.
        template<> struct TransformDispatcher<true>
        {
            //! True when `value` is a multiple of `alignment` (alignment must be non-zero).
            static bool isAlignedTo(size_t value, size_t alignment)
            {
                return value % alignment == 0;
            }

            //! Unary transform over src into dst on the given stream.
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
            {
                const int shift = UnReadWriteTraits<T, D>::shift;

                // Misaligned vector accesses are invalid on the device: use the scalar path.
                if (!isAlignedTo((size_t)src.data, shift * sizeof(T)) || !isAlignedTo(src.step, shift * sizeof(T)) ||
                    !isAlignedTo((size_t)dst.data, shift * sizeof(D)) || !isAlignedTo(dst.step, shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
                    return;
                }

                dim3 threads(16, 16, 1);
                dim3 grid(1, 1, 1);            

                // Each thread covers `shift` columns, so the grid is narrower by that factor.
                grid.x = divUp(src.cols, threads.x * shift);
                grid.y = divUp(src.rows, threads.y);        

                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                // On the default stream, wait for the device to finish before returning.
                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            //! Binary transform over src1 and src2 into dst on the given stream;
            //! the grid is sized from src1.
            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
            {
                const int shift = BinReadWriteTraits<T1, T2, D>::shift;

                // Misaligned vector accesses are invalid on the device: use the scalar path.
                if (!isAlignedTo((size_t)src1.data, shift * sizeof(T1)) || !isAlignedTo(src1.step, shift * sizeof(T1)) ||
                    !isAlignedTo((size_t)src2.data, shift * sizeof(T2)) || !isAlignedTo(src2.step, shift * sizeof(T2)) ||
                    !isAlignedTo((size_t)dst.data, shift * sizeof(D)) || !isAlignedTo(dst.step, shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
                    return;
                }

                dim3 threads(16, 16, 1);
                dim3 grid(1, 1, 1);

                // Each thread covers `shift` columns, so the grid is narrower by that factor.
                grid.x = divUp(src1.cols, threads.x * shift);
                grid.y = divUp(src1.rows, threads.y);        

                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                // On the default stream, wait for the device to finish before returning.
                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );            
            }
        };

        //! Compile-time switch for the unary case, keyed on the channel counts of source and
        //! destination. Default: the vectorised kernel is not used.
        template <typename Src, typename Dst, int SrcCn, int DstCn>
        struct UseSmartUn_
        {
            static const bool value = false;
        };

        //! Single-channel source and destination: use the vectorised kernel whenever the
        //! traits assign more than one element per thread.
        template <typename Src, typename Dst>
        struct UseSmartUn_<Src, Dst, 1, 1>
        {
            static const bool value = UnReadWriteTraits<Src, Dst>::shift != 1;
        };

        //! Front end: looks up the channel counts through VecTraits and forwards to UseSmartUn_.
        template <typename T, typename D>
        struct UseSmartUn
        {
            static const bool value = UseSmartUn_<T, D, VecTraits<T>::cn, VecTraits<D>::cn>::value;
        };

        //! Compile-time switch for the binary case, keyed on the channel counts of both sources
        //! and the destination. Default: the vectorised kernel is not used.
        template <typename Src1, typename Src2, typename Dst, int Src1Cn, int Src2Cn, int DstCn>
        struct UseSmartBin_
        {
            static const bool value = false;
        };

        //! All three single-channel: use the vectorised kernel whenever the traits assign
        //! more than one element per thread.
        template <typename Src1, typename Src2, typename Dst>
        struct UseSmartBin_<Src1, Src2, Dst, 1, 1, 1>
        {
            static const bool value = BinReadWriteTraits<Src1, Src2, Dst>::shift != 1;
        };

        //! Front end: looks up the channel counts through VecTraits and forwards to UseSmartBin_.
        template <typename T1, typename T2, typename D>
        struct UseSmartBin
        {
            static const bool value = UseSmartBin_<T1, T2, D, VecTraits<T1>::cn, VecTraits<T2>::cn, VecTraits<D>::cn>::value;
        };

        //! Unary entry point: picks the vectorised or scalar dispatcher at compile time
        //! from UseSmartUn<T, D> and forwards all arguments to it.
        template <typename T, typename D, typename UnOp, typename Mask>
        static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
        {
            typedef TransformDispatcher< UseSmartUn<T, D>::value > dispatcher_type;

            dispatcher_type::call(src, dst, op, mask, stream);
        }

        //! Binary entry point: picks the vectorised or scalar dispatcher at compile time
        //! from UseSmartBin<T1, T2, D> and forwards all arguments to it.
        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
        {
            typedef TransformDispatcher< UseSmartBin<T1, T2, D>::value > dispatcher_type;

            dispatcher_type::call(src1, src2, dst, op, mask, stream);
        }
    }
}}}

#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
