AscendC vs CUDA

什么是AscendC/CUDA编程

面向算子开发场景的编程语言Ascend C,原生支持C和C++标准规范,最大化匹配用户开发习惯;通过多层接口抽象、自动并行计算、孪生调试等关键技术,极大提高算子开发效率,助力AI开发者低成本完成算子开发和模型调优部署。

CUDA(Compute Unified Device Architecture,统一计算架构[1])是由英伟达NVIDIA所推出的一种硬件集成技术,是该公司对于GPGPU的正式名称。透过这个技术,用户可利用NVIDIA的GPU进行图像处理之外的运算,亦是首次可以利用GPU作为C-编译器的开发环境。

一句话概括:AscendC/CUDA就是使用昇腾设备/GPU设备的编程接口。

与我们熟悉的编程有什么区别

内存

Host编程仅考虑主存,所有的内存操作对象均为主存,不需要考虑CPU缓存,寄存器等,这些对程序开发完全透明。

Device编程需要了解每个运行单元能访问的内存类型,可以理解为需要手动管理一级、二级缓存。例如,在AscendC编程框架下,内存的类型有:

枚举值 具体含义
GM Global Memory,对应AI Core的外部存储。
VECIN 用于矢量计算,搬入数据的存放位置,在数据搬入Vector计算单元时使用此位置
VECOUT 用于矢量计算,搬出数据的存放位置,在将Vector计算单元结果搬出时使用此位置
VECCALC 用于矢量计算/矩阵计算,在计算需要临时变量时使用此位置
A1 用于矩阵计算,存放整块A矩阵,可类比CPU多级缓存中的二级缓存
B1 用于矩阵计算,存放整块B矩阵,可类比CPU多级缓存中的二级缓存
A2 用于矩阵计算,存放切分后的小块A矩阵,可类比CPU多级缓存中的一级缓存
B2 用于矩阵计算,存放切分后的小块B矩阵,可类比CPU多级缓存中的一级缓存
CO1 用于矩阵计算,存放小块结果C矩阵,可理解为Cube Out
CO2 用于矩阵计算,存放整块结果C矩阵,可理解为Cube Out

不同的处理单元,不同的处理步骤访问的内存是不同的,需要开发者自行处理。

编程模型

Host编程一般为串行的,如果想启用并行处理需要手动开启多线程,或者SIMD(Single Instruction, Multiple Data)。

Device编程一般为并行,SPMD(Single-Program Multiple-Data)。在设备上启动多线程,共同处理一份数据。Device编程代码分为两个部分,Host侧执行的一般代码和在设备上执行的核函数(kernel function)。

AscendC还需要注意的是流水线编程范式,流水线编程主要是为了加速数据拷贝,Device处理以及数据拷回的流程。因为DMA搬运单元,各个计算单元是并行工作的,使用流水线能够提高设备单元的使用率。

Device 的内部结构抽象

Ascend AI Core 内部抽象结构

达芬奇架构
AI Core抽象结构

CUDA核心内部抽象结构

CUDA核心结构
CUDA核心结构

AI Core和Streaming Multiprocessor(SM)的最主要区别是:

  • AI Core中是专用处理单元,包括Vector和Cube,分别用于向量和矩阵运算,能用向量和矩阵运算表达的操作效率会很高。

  • Streaming Multiprocessor基本上都是大量的int32核心、float32核心或者双精度核心,由于数量众多,所以并行能力更强。

AscendC编程和CUDA编程对比

AscendC

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#include <cstring>
#include <iostream>

#ifndef __CCE_KT_TEST__
#include "acl/acl.h"
#else
#include "tikicpulib.h"
#endif
#include "kernel_operator.h"
#include "data_loader.h"

using namespace AscendC;

#ifdef __CCE_KT_TEST__
// CPU twin-debug build: the [aicore] attribute has no meaning on host,
// so the qualifier expands to nothing.
#define __aicore__
#else
#define __aicore__ [aicore]
#endif

// Queue depth of 2 enables double buffering: the DMA copy of one row can
// overlap the compute/copy-out of the previous row.
constexpr int BUFFER_NUM = 2;
// Number of AI cores the rows are split across at kernel launch.
constexpr int BLOCK_DIM = 16;

/*****************************Copy scalar to ubuf*****************************/
// Tiling parameters carried from host to kernel: the image geometry.
// Layout must match the uint32_t shape[3]{height, width, channel} array
// memcpy'd into the tiling buffer on the host side.
struct FlipTilingData {
uint32_t height;   // number of image rows
uint32_t width;    // pixels per row
uint32_t channel;  // channels (bytes) per pixel
};

// Round n up to the nearest multiple of 32 (bitmask form; exact for the
// non-negative byte counts used here).
inline __aicore__ int32_t align32(int32_t n) {
  constexpr int32_t kMask = 32 - 1;
  return (n + kMask) & ~kMask;
}

// Number of 32-byte chunks needed to cover n bytes.
inline __aicore__ int32_t AlignDiv32(int32_t n) { return align32(n) / 32; }

// Reinterpret a UB (unified buffer) byte pointer as a typed tiling-struct
// pointer named tilingDataPointer.
#define CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
__ubuf__ tilingStruct* tilingDataPointer = \
reinterpret_cast<__ubuf__ tilingStruct*>( \
(__ubuf__ uint8_t*)(tilingPointer));

#ifdef __CCE_KT_TEST__
// CPU twin-debug build: the tiling blob is already host-visible — just cast.
#define INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer);
#else
// Device build: DMA-copy the tiling blob from global memory into UB offset 0
// (copy length presumably in 32-byte units, per AlignDiv32 — confirm against
// copy_gm_to_ubuf docs), then cast. pipe_barrier(PIPE_ALL) ensures the copied
// bytes are visible before any unit reads them.
#define INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
__ubuf__ uint8_t* tilingUbPointer = (__ubuf__ uint8_t*)get_imm(0); \
copy_gm_to_ubuf(((__ubuf__ uint8_t*)(tilingUbPointer)), \
((__gm__ uint8_t*)(tilingPointer)), 0, 1, \
AlignDiv32(sizeof(tilingStruct)), 0, 0); \
CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingUbPointer); \
pipe_barrier(PIPE_ALL);
#endif

// Convenience wrapper bound to this operator's FlipTilingData layout.
#define GET_TILING_DATA(tilingData, tilingPointer) \
INIT_TILING_DATA(FlipTilingData, tilingData, tilingPointer);

// Evaluate an ACL runtime call and log file:line plus the error code when it
// fails. The do { ... } while (0) wrapper makes CHECK_ACL(x); act as a single
// statement in every context (e.g. an un-braced if/else); the original
// definition carried a trailing semicolon that defeated exactly that idiom.
#define CHECK_ACL(x) \
do { \
aclError __ret = x; \
if (__ret != ACL_ERROR_NONE) { \
std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret \
<< std::endl; \
} \
} while (0)

/*******************************Kernel function*******************************/
/**
 * Horizontally mirrors an H x W x C uint8 image, one row at a time.
 * Rows are partitioned across AI cores; each core streams its rows through
 * VECIN/VECOUT queues with double buffering (BUFFER_NUM) so DMA copies
 * overlap the vector compute.
 */
class KernelFlip {
 public:
  __aicore__ inline KernelFlip() {}

  // input/output: GM base addresses of the full image buffers.
  // _height/_width/_channel: image geometry from the tiling data.
  __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, uint32_t _height,
                              uint32_t _width, uint32_t _channel) {
    uint32_t blockNum = GetBlockNum();
    uint32_t blockIdx = GetBlockIdx();

    // Ceil-divide rows over cores. The original floor division dropped the
    // tail rows whenever _height was not a multiple of blockNum (its clamp
    // could never fire, since blockNum * floor(h/blockNum) <= h), and
    // processed nothing at all when _height < blockNum.
    rowLength = (_height + blockNum - 1) / blockNum;
    startRowIdx = blockIdx * rowLength;
    if (startRowIdx >= _height) {
      rowLength = 0;  // this core has no rows
    } else if (startRowIdx + rowLength > _height) {
      rowLength = _height - startRowIdx;  // last core takes the shorter tail
    }
    width = _width;
    height = _height;
    channel = _channel;
    rowSize = width * channel;
    // UB buffers must be 32-byte aligned. NOTE(review): DataCopy below still
    // transfers rowSize elements; presumably rowSize is 32-byte aligned in
    // practice — confirm with the operator's callers.
    uint32_t bufferSize = align32(rowSize * sizeof(uint8_t));

    inputGM.SetGlobalBuffer((__gm__ uint8_t*)input + startRowIdx * rowSize,
                            rowLength * rowSize);
    outputGM.SetGlobalBuffer((__gm__ uint8_t*)output + startRowIdx * rowSize,
                             rowLength * rowSize);
    pipe.InitBuffer(inQueue, BUFFER_NUM, bufferSize);
    pipe.InitBuffer(outQueue, BUFFER_NUM, bufferSize);
  }

  // Drive the CopyIn -> Compute -> CopyOut pipeline once per assigned row;
  // the queues let consecutive iterations overlap.
  __aicore__ inline void Process() {
    for (int32_t i = 0; i < rowLength; i++) {
      CopyIn(i);
      Compute(i);
      CopyOut(i);
    }
  }

 private:
  // Stage 1: DMA one row from GM into a VECIN local tensor.
  __aicore__ inline void CopyIn(int32_t loop) {
    LocalTensor<uint8_t> local = inQueue.AllocTensor<uint8_t>();
    DataCopy(local, inputGM[loop * rowSize], rowSize);
    inQueue.EnQue(local);
  }

  // Stage 2: reverse the pixel order within the row, preserving the channel
  // order inside each pixel.
  __aicore__ inline void Compute(int32_t loop) {
    LocalTensor<uint8_t> inputLocal = inQueue.DeQue<uint8_t>();
    LocalTensor<uint8_t> outputLocal = outQueue.AllocTensor<uint8_t>();
    for (int32_t i = 0; i < width; i++) {
      for (int32_t c = 0; c < channel; c++) {
        outputLocal.SetValue(
            i * channel + c,
            inputLocal.GetValue((width - i - 1) * channel + c));
      }
    }
    outQueue.EnQue<uint8_t>(outputLocal);
    inQueue.FreeTensor(inputLocal);
  }

  // Stage 3: DMA the flipped row from the VECOUT tensor back to GM.
  __aicore__ inline void CopyOut(int32_t loop) {
    LocalTensor<uint8_t> local = outQueue.DeQue<uint8_t>();
    DataCopy(outputGM[loop * rowSize], local, rowSize);
    outQueue.FreeTensor(local);
  }

 private:
  TPipe pipe;
  TQue<QuePosition::VECIN, BUFFER_NUM> inQueue;    // staging for incoming rows
  TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;  // staging for flipped rows
  GlobalTensor<uint8_t> inputGM, outputGM;
  uint32_t startRowIdx, rowLength, rowSize, height, width, channel;
};

/*******************************kernel interface******************************/
// Operator entry point: one instance runs per AI core (blockDim at launch).
// Loads the tiling struct from GM into UB, then flips the rows assigned to
// this core.
extern "C" __global__ __aicore__ void flip(GM_ADDR input, GM_ADDR output,
GM_ADDR tiling) {
// Declares tilingData as a __ubuf__ FlipTilingData* (see GET_TILING_DATA).
GET_TILING_DATA(tilingData, tiling);
KernelFlip op;
op.Init(input, output, tilingData->height, tilingData->width,
tilingData->channel);
op.Process();
}

#ifndef __CCE_KT_TEST__
// Host-side launch wrapper (device build only): enqueues the flip kernel on
// the given stream, running blockDim kernel instances.
void flip_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* input,
uint8_t* output, uint8_t* tiling) {
flip<<<blockDim, l2ctrl, stream>>>(input, output, tiling);
}
#endif

/***********************************caller************************************/
/**
 * Entry point: reads an image file, runs the flip kernel — on CPU in the
 * twin-debug build, or on device via ACL otherwise — and writes the mirrored
 * result to "<input>.ret".
 */
int32_t main(int32_t argc, char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " path/to/datafile" << std::endl;
    exit(-1);
  }

  uint32_t blockDim = BLOCK_DIM;
  uint32_t height, width, channel;
  uint8_t* data = readFile(argv[1], height, width, channel);
  // Keep a named std::string alive for the whole run. The original stored
  // std::string(argv[1]).append(".ret").c_str() in a const char*: the
  // temporary string dies at the end of that statement, so every later use
  // of the pointer read freed memory.
  const std::string resultFile = std::string(argv[1]) + ".ret";

  uint32_t dataSize = width * height * channel * sizeof(uint8_t);
  size_t inputByteSize = dataSize;
  size_t outputByteSize = dataSize;
  size_t tilingSize = sizeof(FlipTilingData);

  uint8_t *inputHost, *outputHost, *tilingHost;
  // Must match FlipTilingData's member order: height, width, channel.
  uint32_t shape[]{height, width, channel};

#ifdef __CCE_KT_TEST__
  // CPU twin-debug path: run the kernel function directly on host memory.
  inputHost = (uint8_t*)AscendC::GmAlloc(inputByteSize);
  outputHost = (uint8_t*)AscendC::GmAlloc(outputByteSize);
  tilingHost = (uint8_t*)AscendC::GmAlloc(tilingSize);
  memcpy(tilingHost, shape, tilingSize);
  memcpy(inputHost, data, dataSize);

  AscendC::SetKernelMode(KernelMode::AIV_MODE);
  ICPU_RUN_KF(flip, blockDim, inputHost, outputHost, tilingHost);

  writeFile(resultFile.c_str(), height, width, channel, outputHost);

  AscendC::GmFree((void*)inputHost);
  AscendC::GmFree((void*)outputHost);
  AscendC::GmFree((void*)tilingHost);
#else
  // Device path: set up the ACL runtime, stage buffers, launch, copy back.
  CHECK_ACL(aclInit(nullptr));
  aclrtContext context;
  int32_t deviceId = 0;
  CHECK_ACL(aclrtSetDevice(deviceId));
  CHECK_ACL(aclrtCreateContext(&context, deviceId));
  aclrtStream stream = nullptr;
  CHECK_ACL(aclrtCreateStream(&stream));

  uint8_t *inputDevice, *outputDevice, *tilingDevice;
  // Pinned host buffers for the host<->device copies.
  CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingSize));
  CHECK_ACL(aclrtMallocHost((void**)(&inputHost), inputByteSize));
  CHECK_ACL(aclrtMallocHost((void**)(&outputHost), outputByteSize));
  CHECK_ACL(aclrtMalloc((void**)&inputDevice, inputByteSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));
  CHECK_ACL(aclrtMalloc((void**)&outputDevice, outputByteSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));
  CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));

  memcpy(tilingHost, shape, tilingSize);
  memcpy(inputHost, data, dataSize);

  CHECK_ACL(aclrtMemcpy(inputDevice, inputByteSize, inputHost, inputByteSize,
                        ACL_MEMCPY_HOST_TO_DEVICE));
  CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize,
                        ACL_MEMCPY_HOST_TO_DEVICE));

  flip_do(blockDim, nullptr, stream, inputDevice, outputDevice, tilingDevice);

  // The launch is asynchronous: wait before reading the output back.
  CHECK_ACL(aclrtSynchronizeStream(stream));
  CHECK_ACL(aclrtMemcpy(outputHost, outputByteSize, outputDevice,
                        outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
  writeFile(resultFile.c_str(), height, width, channel, outputHost);

  CHECK_ACL(aclrtFree(inputDevice));
  CHECK_ACL(aclrtFree(outputDevice));
  CHECK_ACL(aclrtFree(tilingDevice));
  CHECK_ACL(aclrtFreeHost(inputHost));
  CHECK_ACL(aclrtFreeHost(outputHost));
  CHECK_ACL(aclrtFreeHost(tilingHost));

  CHECK_ACL(aclrtDestroyStream(stream));
  CHECK_ACL(aclrtDestroyContext(context));
  CHECK_ACL(aclrtResetDevice(deviceId));
  CHECK_ACL(aclFinalize());
#endif
  free(data);  // buffer returned by readFile
  return 0;
}

流水线示例
数据切分示例

CUDA

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

#include "data_loader.h"

// Horizontally mirrors each row of an interleaved H x W x C uint8 image.
// One thread handles whole rows via a grid-stride loop, so any launch
// configuration covers all `height` rows.
__global__ void flip(uint8_t* input, uint8_t* output, uint32_t height,
                     uint32_t width, uint32_t channel) {
  const int firstRow = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowStep = blockDim.x * gridDim.x;
  const int rowBytes = width * channel;

  for (int row = firstRow; row < height; row += rowStep) {
    const uint8_t* srcRow = input + row * rowBytes;
    uint8_t* dstRow = output + row * rowBytes;
    for (int x = 0; x < width; ++x) {
      // Pixel x of the output comes from pixel (width-1-x) of the input;
      // channel order within the pixel is preserved.
      const uint8_t* srcPix = srcRow + (width - x - 1) * channel;
      uint8_t* dstPix = dstRow + x * channel;
      for (int c = 0; c < channel; ++c) {
        dstPix[c] = srcPix[c];
      }
    }
  }
}

/**
 * Entry point: reads an H x W x C uint8 image, mirrors each row on the GPU,
 * and writes the result to "<input>.ret".
 */
int main(int32_t argc, char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " path/to/datafile" << std::endl;
    exit(-1);
  }
  uint32_t height, width, channel;
  // std::string instead of the original fixed 256-byte strcpy/strcat buffers,
  // which overflowed on any path longer than ~250 characters.
  const std::string resultFile = std::string(argv[1]) + ".ret";
  uint8_t* data = readFile(argv[1], height, width, channel);

  uint32_t dataSize = width * height * channel * sizeof(uint8_t);
  size_t inputByteSize = dataSize;
  size_t outputByteSize = dataSize;
  uint8_t *input = nullptr, *output = nullptr;
  // Managed memory is accessible from both host and device. Check the
  // allocations — the original ignored every CUDA return code.
  if (cudaMallocManaged((void**)&input, inputByteSize) != cudaSuccess ||
      cudaMallocManaged((void**)&output, outputByteSize) != cudaSuccess) {
    std::cerr << "cudaMallocManaged failed" << std::endl;
    free(data);
    exit(-1);
  }

  memcpy(input, data, inputByteSize);

  dim3 blockSize(256);
  // Ceil-div: enough blocks for one thread per row; the kernel's grid-stride
  // loop tolerates any launch shape regardless.
  dim3 gridSize((height + blockSize.x - 1) / blockSize.x);

  flip<<<gridSize, blockSize>>>(input, output, height, width, channel);
  cudaError_t err = cudaGetLastError();  // launch-configuration errors
  if (err == cudaSuccess) {
    err = cudaDeviceSynchronize();  // asynchronous execution errors
  }
  if (err != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
  } else {
    writeFile(resultFile.c_str(), height, width, channel, output);
  }

  cudaFree(input);
  cudaFree(output);
  free(data);  // buffer returned by readFile; the original leaked it

  return 0;
}