// NOTE(review): stray numeric tokens (e.g. "3", "5", "14") are fused into the
// start of lines throughout this chunk -- an extraction/paste artifact. They
// must be stripped against the original header before this file can compile.
// Guard: MSVC 2015+ (VS14 toolset) targeting x64 only.
3#if _MSC_VER >= 1900 && defined(_M_X64)
5#include "BackendBase.h"
// Forward declaration of the cuDNN-backed CUDA conv2d helper; used below only
// as a pointer member, so a forward declaration suffices.
// NOTE(review): it is referenced as CCuda_Conv2D_Cudnn<T> at the member site,
// so a "template <typename T>" line was likely lost here -- confirm.
14 class CCuda_Conv2D_Cudnn;
// Forward declaration of the exported tensor type used by every method below.
18 class FL_EXPORT CTensor;
// Backend implementing 2-D convolution and transposed convolution on top of
// CBackendBase<T>. Declarations only -- implementations live elsewhere.
// NOTE(review): the "template <typename T>" line, the opening "{", and the
// access specifiers are missing from this chunk -- extraction artifact; confirm
// against the original header.
21 class FL_EXPORT CBackendConv2D :
public CBackendBase<T>
// Copy constructor (declaration only).
25 CBackendConv2D(
const CBackendConv2D<T>& bc);
// Virtual destructor -- the class is used polymorphically (virtual methods below).
26 virtual ~CBackendConv2D();
// Stores the forward-convolution configuration (cached in m_convParams below);
// returns a CResult status.
28 virtual const CResult SetConvolutionParams(
const CConvolutionParameters& convParams);
// Returns the currently configured convolution parameters.
29 virtual CConvolutionParameters GetConvolutionParams();
// Stores the transposed-convolution configuration; returns a CResult status.
30 virtual const CResult SetTransConvolutionParams(
const CTransConvolutionParameters& convParams);
// Returns the currently configured transposed-convolution parameters.
31 virtual CTransConvolutionParameters GetTransConvolutionParams();
// Forward convolution: presumably computes pTsrY from input pTsrX and kernel
// pTsrW, with vctYShape the expected output shape -- semantics inferred from
// names, confirm in the implementation file.
33 virtual const CResult Forward(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY,
const std::vector<int64_t>& vctYShape);
// Gradient w.r.t. the input image: pTsrDx from upstream gradient pTsrDy and
// kernel pTsrW. When bAddGradient is set, the result is apparently accumulated
// (pTsrAddGradientTemp as scratch); pTsrKernelBuffer is an optional reusable
// workspace -- both default to nullptr. Confirm exact contract in the .cpp.
34 virtual const CResult DerivativeImage(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx,
const std::vector<int64_t>& vctDxShape,
bool bAddGradient, CTensor<T>* pTsrAddGradientTemp =
nullptr, CTensor<T>* pTsrKernelBuffer =
nullptr);
// Gradient w.r.t. the kernel: pTsrDw from pTsrDy and input pTsrX; same
// bAddGradient/scratch convention as DerivativeImage.
35 virtual const CResult DerivativeKernel(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw,
const std::vector<int64_t>& vctDwShape,
bool bAddGradient, CTensor<T>* pTsrAddGradientTemp =
nullptr);
// Transposed-convolution forward pass; the optional buffers look like reusable
// workspaces for a transposed/reordered copy of the kernel -- confirm in .cpp.
39 virtual const CResult TransConvForward(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY, CTensor<T>* pTsrKernelTranspose =
nullptr, CTensor<T>* pTsrKernelBuffer =
nullptr);
// Transposed-convolution gradient w.r.t. the input image. NOTE(review): the
// optional-parameter order here (KernelBuffer before AddGradientTemp) is the
// reverse of DerivativeImage above -- intentional? worth confirming.
40 virtual const CResult TransConvDerivativeImage(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx,
const std::vector<int64_t>& vctXShape,
bool bAddGradient, CTensor<T>* pTsrKernelBuffer =
nullptr, CTensor<T>* pTsrAddGradientTemp =
nullptr);
// Transposed-convolution gradient w.r.t. the kernel; optional input/kernel
// transpose buffers plus the accumulate-scratch, all defaulting to nullptr.
41 virtual const CResult TransConvDerivativeKernel(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw,
const std::vector<int64_t>& vctDwShape,
bool bAddGradient, CTensor<T>* pTsrInputTranspose =
nullptr, CTensor<T>* pTsrKernelTranspose =
nullptr, CTensor<T>* pTsrAddGradientTemp =
nullptr);
// Project macros -- presumably declared in BackendBase.h (not visible here):
// class-type introspection plus support for duplicating this backend without
// allocating a fresh object. TODO confirm macro definitions.
44 DeclareGetClassType();
45 SupportToDuplicateObjectWithoutCreateNewObject(CBackendConv2D<T>, *
this);
// Forward convolution via the im2col + GEMM lowering -- assumed from the name;
// confirm in the implementation.
48 virtual const CResult ForwardGEMM(CTensor<T>* pTsrOperand, CTensor<T>* pTsrKernel, CTensor<T>* pTsrResult);
// Unrolls image patches into columns (classic im2col) so convolution becomes a
// matrix multiply. i32OutputH/W = 0 presumably means "derive from the other
// arguments" -- confirm.
49 virtual void Im2Col(
const T* pTData_im,
const int32_t i32Channels,
const int32_t i32Height,
const int32_t i32Width,
const int32_t i32Kernel_h,
const int32_t i32Kernel_w,
const int32_t i32Pad_h,
const int32_t i32Pad_w,
const int32_t i32Stride_h,
const int32_t i32Stride_w,
const int32_t i32Dilation_h,
const int32_t i32Dilation_w, T* pTData_col,
const int32_t i32OutputH=0,
const int32_t i32OutputW = 0);
// im2col variant used by the transposed-convolution paths (same parameter list
// as Im2Col) -- exact layout difference not visible here, confirm in .cpp.
50 virtual void Im2ColTranspose(
const T* pTData_im,
const int32_t i32Channels,
const int32_t i32Height,
const int32_t i32Width,
const int32_t i32Kernel_h,
const int32_t i32Kernel_w,
const int32_t i32Pad_h,
const int32_t i32Pad_w,
const int32_t i32Stride_h,
const int32_t i32Stride_w,
const int32_t i32Dilation_h,
const int32_t i32Dilation_w, T* pTData_col,
const int32_t i32OutputH = 0,
const int32_t i32OutputW = 0);
// Inverse of Im2Col: scatters/accumulates column data back into image layout
// (typically used for the image-gradient path -- confirm).
// NOTE(review): "PTData_col" (capital P) and "width" break the file's
// pTData_/i32 naming convention; cannot be renamed here without touching code.
51 virtual void Col2Im(
const T* PTData_col,
const int32_t i32Channels,
const int32_t i32Height,
const int32_t width,
const int32_t i32Kernel_h,
const int32_t i32Kernel_w,
const int32_t i32Pad_h,
const int32_t i32Pad_w,
const int32_t i32Stride_h,
const int32_t i32Stride_w,
const int32_t i32Dilation_h,
const int32_t i32Dilation_w,T* pTData_im,
const int32_t i32OutputH = 0,
const int32_t i32OutputW = 0);
// General matrix multiply in BLAS style: presumably C = alpha*A*B + beta*C with
// M x N x K dimensions and lda/ldb/ldc leading dimensions -- confirm layout
// (row- vs column-major) in the implementation.
// NOTE(review): "BETA" breaks the tAlpha/t-prefix naming convention.
53 virtual void GEMM(int32_t i32M, int32_t i32N, int32_t i32K, T tAlpha,
const T* pTA, int32_t i32Lda,
const T* pTB, int32_t i32Ldb, T BETA, T* pTC, int32_t i32Ldc);
// CPU convolution via implicit GEMM (no materialized im2col buffer -- assumed
// from the name, confirm).
56 virtual const CResult ForwardConvImpGEMMCPU(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY);
// GEMM micro-kernel specializations; the 16/36 presumably correspond to the
// Winograd tile sizes used below (4x4 and 6x6 transforms), with MostN/MostM
// tiling variants -- TODO confirm.
57 virtual void GEMMUnit16MostN(int32_t i32M, int32_t i32N, int32_t i32K,
const T* pTA, int32_t i32Lda,
const T* pTB, int32_t i32Ldb, T BETA, T* pTC, int32_t i32Ldc);
58 virtual void GEMMUnit16MostM(int32_t i32M, int32_t i32N, int32_t i32K,
const T* pTA, int32_t i32Lda,
const T* pTB, int32_t i32Ldb, T BETA, T* pTC, int32_t i32Ldc);
60 virtual void GEMMUnit36MostN(int32_t i32M, int32_t i32N, int32_t i32K,
const T* pTA, int32_t i32Lda,
const T* pTB, int32_t i32Ldb, T BETA, T* pTC, int32_t i32Ldc);
61 virtual void GEMMUnit36MostM(int32_t i32M, int32_t i32N, int32_t i32K,
const T* pTA, int32_t i32Lda,
const T* pTB, int32_t i32Ldb, T BETA, T* pTC, int32_t i32Ldc);
// ---- Winograd-domain transform helpers (declarations only) ----
// Names suggest the Lavin-Gray scheme: 4x4/6x6 tiles transformed for dY,
// kernel, and input, with Batch/Ch x Row/Column layout variants. All semantics
// below are inferred from names -- confirm against the implementations.
// Transform the upstream gradient dY into the Winograd domain (column-major
// per-batch layout -- assumed).
63 virtual void DyToTransformBatchColumn(
const T* pTDy, T* pTDyTransform, int64_t i64DyBatch, int64_t i64DyChannel, int64_t i64DyHeight, int64_t i64DyWidth, int64_t i64DyTFColumn);
// Row-major per-batch dY transforms for 4x4 and 6x6 tiles.
64 virtual void DyToTransform4x4BatchRow(
const T* pTDy, T* pTDyTransform, int64_t i64DyBatch, int64_t i64DyChannel, int64_t i64DyHeight, int64_t i64DyWidth, int64_t i64DyTFColumn);
65 virtual void DyToTransform6x6BatchRow(
const T* pTDy, T* pTDyTransform, int64_t i64DyBatch, int64_t i64DyChannel, int64_t i64DyHeight, int64_t i64DyWidth, int64_t i64DyTFColumn);
// Kernel transforms into the 4x4 / 6x6 Winograd domain.
67 virtual void KernelToTransform4x4(
const T* pTKernel, T* pTTransform, int64_t i64KernelBatch, int64_t i64Ch, int64_t i64KernelHeight, int64_t i64KernelWidth);
68 virtual void KernelToTransform6x6(
const T* pTKernel, T* pTTransform, int64_t i64KernelBatch, int64_t i64Ch, int64_t i64KernelHeight, int64_t i64KernelWidth);
// Input-tile transforms; Ch/Batch x Row/Column suffixes denote the output
// layout variant (assumed).
70 virtual void InputToTransform6x6ChRow(
const T* pTInput, T* pTTransform, int64_t i64Ch, int64_t i64Height, int64_t i64Width, int64_t i64PadH, int64_t i64PadW);
71 virtual void InputToTransformChRow(
const T* pTInput, T* pTTransform, int64_t i64Ch, int64_t i64Height, int64_t i64Width, int64_t i64PadH, int64_t i64PadW);
72 virtual void InputToTransform6x6BatchRow(
const T* pTInput, T* pTTransform, int64_t i64Ch, int64_t i64Height, int64_t i64Width, int64_t i64PadH, int64_t i64PadW);
// NOTE(review): only this overload takes an explicit i64Batch -- confirm the
// 6x6BatchRow variant above really derives batch elsewhere.
73 virtual void InputToTransformBatchRow(
const T* pTInput, T* pTTransform, int64_t i64Batch, int64_t i64Ch, int64_t i64Height, int64_t i64Width, int64_t i64PadH, int64_t i64PadW);
74 virtual void InputToTransformChColumn(
const T* pTInput, T* pTTransform, int64_t i64Ch, int64_t i64Height, int64_t i64Width, int64_t i64PadH, int64_t i64PadW);
// Inverse transforms: map the Winograd-domain product back to the spatial
// output for 4x4 / 6x6 tiles.
76 virtual void YTransformToOutput4x4(
const T* pTOutputTransform, T* pTOutput, int64_t i64OutputCh, int64_t i64InputCh, int64_t i64InputH, int64_t i64InputW, int64_t i64PadH, int64_t i64PadW);
77 virtual void YTransformToOutput6x6(
const T* pTOutputTransform, T* pTOutput, int64_t i64OutputCh, int64_t i64InputCh, int64_t i64InputH, int64_t i64InputW, int64_t i64PadH, int64_t i64PadW);
// ---- Strategy implementations ----
// One forward + image-gradient + kernel-gradient triple per algorithm
// (Winograd, Direct, GEMM) and per conv flavour (plain, atrous/dilated,
// grouped). Which one the public Forward/Derivative* entry points dispatch to
// is not visible in this chunk -- confirm in the implementation.
78 virtual const CResult Forward_ConvWinograd(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY);
79 virtual const CResult Forward_ConvDirect(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY);
80 virtual const CResult DerivativeImage_ConvGEMM(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx, CTensor<T>* pTsrKernelBuffer);
81 virtual const CResult DerivativeImage_Direct(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx);
82 virtual const CResult DerivativeImage_Winograd(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx, CTensor<T>* pTsrKernelBuffer);
83 virtual const CResult DerivativeKernel_ConvGEMM(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
84 virtual const CResult DerivativeKernel_Direct(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
85 virtual const CResult DerivativeKernel_Winograd(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
// Atrous (dilated) convolution, direct algorithm only.
88 virtual const CResult Forward_AtrousConvDirect(CTensor<T>* pTsrOperand, CTensor<T>* pTsrKernel, CTensor<T>* pTsrResult);
89 virtual const CResult DerivativeImage_Direct_Atrous(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx);
90 virtual const CResult DerivativeKernel_Direct_Atrous(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
// Grouped convolution: direct, GEMM, and Winograd variants.
93 virtual const CResult Forward_GroupConvDirect(CTensor<T>* pTsrOperand, CTensor<T>* pTsrKernel, CTensor<T>* pTsrResult);
94 virtual const CResult DerivativeImage_Direct_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx);
95 virtual const CResult DerivativeKernel_Direct_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
97 virtual const CResult ForwardGEMM_Group(CTensor<T>* pTsrOperand, CTensor<T>* pTsrKernel, CTensor<T>* pTsrResult);
98 virtual const CResult DerivativeImage_ConvGEMM_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx, CTensor<T>* pTsrKernelBuffer);
99 virtual const CResult DerivativeKernel_ConvGEMM_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
101 virtual const CResult Forward_ConvWinograd_Group(CTensor<T>* pTsrX, CTensor<T>* pTsrW, CTensor<T>* pTsrY);
102 virtual const CResult DerivativeImage_Winograd_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrW, CTensor<T>* pTsrDx, CTensor<T>* pTsrKernelBuffer);
103 virtual const CResult DerivativeKernel_Winograd_Group(CTensor<T>* pTsrDy, CTensor<T>* pTsrX, CTensor<T>* pTsrDw);
// Convolution configuration cached by SetConvolutionParams().
106 CConvolutionParameters m_convParams;
// Output padding (rows / columns) -- presumably for transposed convolution
// output sizing; confirm where these are set and consumed.
107 int64_t m_i64OutputPaddingY;
108 int64_t m_i64OutputPaddingX;
// Optional cuDNN-backed implementation. Raw pointer: ownership and deletion
// responsibility are not visible in this chunk -- NOTE(review): confirm the
// destructor releases it (or document who owns it).
111 CCuda_Conv2D_Cudnn<T>* m_pCudnn;