hipudding's Blog

什么是AscendC/CUDA编程

面向算子开发场景的编程语言Ascend C，原生支持C和C++标准规范，最大化匹配用户开发习惯；通过多层接口抽象、自动并行计算、孪生调试等关键技术，极大提高算子开发效率，助力AI开发者低成本完成算子开发和模型调优部署。

CUDA（Compute Unified Device Architecture，统一计算架构[1]）是由英伟达NVIDIA所推出的一种软硬件集成技术，是该公司对于GPGPU的正式名称。透过这个技术，用户可利用NVIDIA的GPU进行图像处理之外的运算，亦是首次可以利用GPU作为C-编译器的开发环境。

一句话概括：AscendC/CUDA就是使用昇腾设备/GPU设备的编程接口。

与我们熟悉的编程有什么区别

内存

Host编程仅考虑主存，所有的内存操作对象均为主存，不需要考虑CPU缓存，寄存器等，这些对程序开发完全透明。

Device编程需要了解每个运行单元能访问的内存类型，可以理解要手动管理一级二级缓存，例如，AscendC变成框架下，内存的类型有：

枚举值	具体含义
GM	Global Memory，对应AI Core的外部存储。
VECIN	用于矢量计算，搬入数据的存放位置，在数据搬入Vector计算单元时使用此位置
VECOUT	用于矢量计算，搬出数据的存放位置，在将Vector计算单元结果搬出时使用此位置
VECCALC	用于矢量计算/矩阵计算，在计算需要临时变量时使用此位置
A1	用于矩阵计算，存放整块A矩阵，可类比CPU多级缓存中的二级缓存
B1	用于矩阵计算，存放整块B矩阵，可类比CPU多级缓存中的二级缓存
A2	用于矩阵计算，存放切分后的小块A矩阵，可类比CPU多级缓存中的一级缓存
B2	用于矩阵计算，存放切分后的小块B矩阵，可类比CPU多级缓存中的一级缓存
CO1	用于矩阵计算，存放小块结果C矩阵，可理解为Cube Out
CO2	用于矩阵计算，存放整块结果C矩阵，可理解为Cube Out

不同的处理单元，不同的处理步骤访问的内存是不同的，需要开发者自行处理。

编程模型

Host编程一般为串行的，如果想启用并行处理需要手动开启多线程，或者SIMD(Single Instruction, Multiple Data)。

Device编程一般为并行，SPMD(Single-Program Multiple-Data)。在设备上启动多线程，共同处理一份数据。Device编程代码分为两个部分，Host侧执行的一般代码和在设备上执行的核函数(kernel function)。

AscendC还需要注意的是流水线编程范式，流水线编程主要是为了加速数据拷贝，Device处理以及数据拷回的流程。因为DMA搬运单元，各个计算单元是并行工作的，使用流水线能够提高设备单元的使用率。

Device 的内部结构抽象

Ascend AI Core 内部抽象结构

CUDA核心内部抽象结构

AI Core和Stream Multiprocessor的最主要区别是：

AI Core中是专用处理单元，包括Vector和Cube，分别用户向量和矩阵运算，能用向量和矩阵运算的操作效率会很高。
Stream Multiprocessor基本上都是大量的int32核心，float32核心或者双精度核心，由于数量众多，所以并行能力更强。

AscendC编程和CUDA编程对比

AscendC

#include <cstring>
#include <iostream>

#ifndef __CCE_KT_TEST__
#include "acl/acl.h"
#else
#include "tikicpulib.h"
#endif
#include "kernel_operator.h"
#include "data_loader.h"

using namespace AscendC;

#ifdef __CCE_KT_TEST__
#define __aicore__
#else
#define __aicore__ [aicore]
#endif

constexpr int BUFFER_NUM = 2;
constexpr int BLOCK_DIM = 16;

/*****************************Copy scalar to ubuf*****************************/
struct FlipTilingData {
  uint32_t height;
  uint32_t width;
  uint32_t channel;
};

inline __aicore__ int32_t align32(int32_t n) { return ((n + 31) & ~31); }
inline __aicore__ int32_t AlignDiv32(int32_t n) { return align32(n) / 32; }

#define CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
  __ubuf__ tilingStruct* tilingDataPointer =                                \
      reinterpret_cast<__ubuf__ tilingStruct*>(                             \
          (__ubuf__ uint8_t*)(tilingPointer));

#ifdef __CCE_KT_TEST__
#define INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
  CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer);
#else
#define INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
  __ubuf__ uint8_t* tilingUbPointer = (__ubuf__ uint8_t*)get_imm(0);     \
  copy_gm_to_ubuf(((__ubuf__ uint8_t*)(tilingUbPointer)),                \
                  ((__gm__ uint8_t*)(tilingPointer)), 0, 1,              \
                  AlignDiv32(sizeof(tilingStruct)), 0, 0);               \
  CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingUbPointer); \
  pipe_barrier(PIPE_ALL);
#endif

#define GET_TILING_DATA(tilingData, tilingPointer) \
  INIT_TILING_DATA(FlipTilingData, tilingData, tilingPointer);

#define CHECK_ACL(x)                                                    \
  do {                                                                  \
    aclError __ret = x;                                                 \
    if (__ret != ACL_ERROR_NONE) {                                      \
      std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret \
                << std::endl;                                           \
    }                                                                   \
  } while (0);

/*******************************Kernel function*******************************/
class KernelFlip {
 public:
  __aicore__ inline KernelFlip() {}
  __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, uint32_t _height,
                              uint32_t _width, uint32_t _channel) {
    uint32_t blockNum = GetBlockNum();
    uint32_t blockIdx = GetBlockIdx();

    rowLength = _height / blockNum;
    startRowIdx = blockIdx * rowLength;
    if (startRowIdx + rowLength > _height) {
      rowLength = _height - startRowIdx;
    }
    width = _width;
    height = _height;
    channel = _channel;
    rowSize = width * channel;
    uint32_t bufferSize = align32(rowSize * sizeof(uint8_t));

    inputGM.SetGlobalBuffer((__gm__ uint8_t*)input + startRowIdx * rowSize,
                            rowLength * rowSize);
    outputGM.SetGlobalBuffer((__gm__ uint8_t*)output + startRowIdx * rowSize,
                             rowLength * rowSize);
    pipe.InitBuffer(inQueue, BUFFER_NUM, bufferSize);
    pipe.InitBuffer(outQueue, BUFFER_NUM, bufferSize);
  }

  __aicore__ inline void Process() {
    for (int32_t i = 0; i < rowLength; i++) {
      CopyIn(i);
      Compute(i);
      CopyOut(i);
    }
  }

 private:
  __aicore__ inline void CopyIn(int32_t loop) {
    LocalTensor<uint8_t> local = inQueue.AllocTensor<uint8_t>();
    DataCopy(local, inputGM[loop * rowSize], rowSize);
    inQueue.EnQue(local);
  }

  __aicore__ inline void Compute(int32_t loop) {
    LocalTensor<uint8_t> inputLocal = inQueue.DeQue<uint8_t>();
    LocalTensor<uint8_t> outputLocal = outQueue.AllocTensor<uint8_t>();
    for (int32_t i = 0; i < width; i++) {
      for (int32_t c = 0; c < channel; c++) {
        outputLocal.SetValue(
            i * channel + c,
            inputLocal.GetValue((width - i - 1) * channel + c));
      }
    }
    outQueue.EnQue<uint8_t>(outputLocal);
    inQueue.FreeTensor(inputLocal);
  }

  __aicore__ inline void CopyOut(int32_t loop) {
    LocalTensor<uint8_t> local = outQueue.DeQue<uint8_t>();
    DataCopy(outputGM[loop * rowSize], local, rowSize);
    outQueue.FreeTensor(local);
  }

 private:
  TPipe pipe;
  TQue<QuePosition::VECIN, BUFFER_NUM> inQueue;
  TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
  GlobalTensor<uint8_t> inputGM, outputGM;
  uint32_t startRowIdx, rowLength, rowSize, height, width, channel;
};

/*******************************kernel interface******************************/
extern "C" __global__ __aicore__ void flip(GM_ADDR input, GM_ADDR output,
                                           GM_ADDR tiling) {
  GET_TILING_DATA(tilingData, tiling);
  KernelFlip op;
  op.Init(input, output, tilingData->height, tilingData->width,
          tilingData->channel);
  op.Process();
}

#ifndef __CCE_KT_TEST__
void flip_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* input,
             uint8_t* output, uint8_t* tiling) {
  flip<<<blockDim, l2ctrl, stream>>>(input, output, tiling);
}
#endif

/***********************************caller************************************/
int32_t main(int32_t argc, char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " path/to/datafile" << std::endl;
    exit(-1);
  }

  uint32_t blockDim = BLOCK_DIM;
  uint32_t height, width, channel;
  uint8_t* data = readFile(argv[1], height, width, channel);
  const char* resultFile = std::string(argv[1]).append(".ret").c_str();

  uint32_t dataSize = width * height * channel * sizeof(uint8_t);
  size_t inputByteSize = dataSize;
  size_t outputByteSize = dataSize;
  size_t tilingSize = sizeof(FlipTilingData);

  uint8_t *inputHost, *outputHost, *tilingHost;
  uint32_t shape[]{height, width, channel};

#ifdef __CCE_KT_TEST__
  inputHost = (uint8_t*)AscendC::GmAlloc(inputByteSize);
  outputHost = (uint8_t*)AscendC::GmAlloc(outputByteSize);
  tilingHost = (uint8_t*)AscendC::GmAlloc(tilingSize);
  memcpy(tilingHost, shape, tilingSize);
  memcpy(inputHost, data, dataSize);

  AscendC::SetKernelMode(KernelMode::AIV_MODE);
  ICPU_RUN_KF(flip, blockDim, inputHost, outputHost, tilingHost);

  writeFile(resultFile, height, width, channel, outputHost);

  AscendC::GmFree((void*)inputHost);
  AscendC::GmFree((void*)outputHost);
  AscendC::GmFree((void*)tilingHost);
#else
  CHECK_ACL(aclInit(nullptr));
  aclrtContext context;
  int32_t deviceId = 0;
  CHECK_ACL(aclrtSetDevice(deviceId));
  CHECK_ACL(aclrtCreateContext(&context, deviceId));
  aclrtStream stream = nullptr;
  CHECK_ACL(aclrtCreateStream(&stream));

  uint8_t *inputDevice, *outputDevice, *tilingDevice;
  CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingSize));
  CHECK_ACL(aclrtMallocHost((void**)(&inputHost), inputByteSize));
  CHECK_ACL(aclrtMallocHost((void**)(&outputHost), outputByteSize));
  CHECK_ACL(aclrtMalloc((void**)&inputDevice, inputByteSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));
  CHECK_ACL(aclrtMalloc((void**)&outputDevice, outputByteSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));
  CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingSize,
                        ACL_MEM_MALLOC_HUGE_FIRST));

  memcpy(tilingHost, shape, tilingSize);
  memcpy(inputHost, data, dataSize);

  CHECK_ACL(aclrtMemcpy(inputDevice, inputByteSize, inputHost, inputByteSize,
                        ACL_MEMCPY_HOST_TO_DEVICE));
  CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize,
                        ACL_MEMCPY_HOST_TO_DEVICE));

  flip_do(blockDim, nullptr, stream, inputDevice, outputDevice, tilingDevice);

  CHECK_ACL(aclrtSynchronizeStream(stream));
  CHECK_ACL(aclrtMemcpy(outputHost, outputByteSize, outputDevice,
                        outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
  writeFile(resultFile, height, width, channel, outputHost);

  CHECK_ACL(aclrtFree(inputDevice));
  CHECK_ACL(aclrtFree(outputDevice));
  CHECK_ACL(aclrtFree(tilingDevice));
  CHECK_ACL(aclrtFreeHost(inputHost));
  CHECK_ACL(aclrtFreeHost(outputHost));
  CHECK_ACL(aclrtFreeHost(tilingHost));

  CHECK_ACL(aclrtDestroyStream(stream));
  CHECK_ACL(aclrtDestroyContext(context));
  CHECK_ACL(aclrtResetDevice(deviceId));
  CHECK_ACL(aclFinalize());
#endif
  free(data);
  return 0;
}

CUDA

#include <iostream>
#include "data_loader.h"

__global__ void flip(uint8_t* input, uint8_t* output, uint32_t height,
                     uint32_t width, uint32_t channel) {
  int rowIdx = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;
  int rowSize = width * channel;

  for (int row = rowIdx; row < height; row += stride) {
    int startOffset = row * rowSize;
    for (int idx = 0; idx < width; idx++) {
      for (int c = 0; c < channel; c++) {
        output[startOffset + idx * channel + c] =
            input[startOffset + (width - idx - 1) * channel + c];
      }
    }
  }
}

int main(int32_t argc, char* argv[]) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " path/to/datafile" << std::endl;
    exit(-1);
  }
  uint32_t height, width, channel;
  char fileName[256], resultFile[256];
  memset(fileName, 0, 256);
  memset(resultFile, 0, 256);
  strcpy(fileName, argv[1]);
  strcat(resultFile, fileName);
  strcat(resultFile, ".ret");
  uint8_t* data = readFile(fileName, height, width, channel);

  uint32_t dataSize = width * height * channel * sizeof(uint8_t);
  size_t inputByteSize = dataSize;
  size_t outputByteSize = dataSize;
  uint8_t *input, *output;
  cudaMallocManaged((void**)&input, inputByteSize);
  cudaMallocManaged((void**)&output, outputByteSize);

  memcpy(input, data, inputByteSize);

  dim3 blockSize(256);
  dim3 gridSize((height + blockSize.x - 1) / blockSize.x);

  flip<<<gridSize, blockSize>>>(input, output, height, width, channel);
  cudaDeviceSynchronize();

  writeFile(resultFile, height, width, channel, output);

  cudaFree(input);
  cudaFree(output);

  return 0;
}

Compile OpenCV with CANN backend

发表于 2023-09-20 更新于 2025-01-23

Prepare environment

Simply you can use docker container, Dockerfile is here.

Or install manually:

driver

1
2
3

Download driver here. https://www.hiascend.com/hardware/firmware-drivers
chmod +x A300-3010-npu-driver_6.0.0_linux-x86_64.run
./A300-3010-npu-driver_6.0.0_linux-x86_64.run --install

toolkit

1
2
3

Download ascend_toolkit here. https://www.hiascend.com/en/software/cann/community
chmod +x Ascend-cann-toolkit_7.0.RC1.alpha002_linux-x86_64.run
./Ascend-cann-toolkit_7.0.RC1.alpha002_linux-x86_64.run --install

Clone repository

Clone opencv repositories, In order to facilitate code reading, make sure opencv_contrib is inside opencv dir.

1
2
3

git clone git@github.com:opencv/opencv.git
cd opencv
git clone git@github.com:opencv/opencv_contrib.git

Build

cmake 
-DCMAKE_INSTALL_PREFIX=/home/hua/code/opencv/build/install 
-DWITH_DEBUG=1 
-DBUILD_WITH_DEBUG_INFO=1 
-DOPENCV_EXTRA_MODULES_PATH=/home/hua/code/opencv/opencv_contrib/modules 
-DWITH_CUDA=0 
-DWITH_CANN=1 
-DPYTHON3_EXECUTABLE=/home/hua/anaconda3/envs/py39/bin/python 
-DPYTHON_LIBRARY=/home/hua/anaconda3/envs/py39 
-DPYTHON_INCLUDE_DIR=/home/hua/anaconda3/envs/py39/include/python3.9 
-DBUILD_opencv_wechat_qrcode=OFF 
-DBUILD_opencv_xfeatures2d=OFF 
-DBUILD_opencv_face=OFF 
-DBUILD_opencv_dnn=OFF 
-DBUILD_opencv_features2d=OFF 
-DWITH_CAROTENE=OFF 
-DWITH_IPP=OFF 
-DBUILD_DOCS=ON

make -j
make install

Make sure CANN and python-dev is detected:

[cmake] --   CANN:                          YES
[cmake] --     Include path                 /home/hua/Ascend/ascend-toolkit/latest/include /home/hua/Ascend/ascend-toolkit/latest/opp
[cmake] --     Link libraries:              /home/hua/Ascend/ascend-toolkit/latest/acllib/lib64/libascendcl.so /home/hua/Ascend/ascend-toolkit/latest/lib64/libacl_op_compiler.so /home/hua/Ascend/ascend-toolkit/latest/opp/built-in/op_proto/lib/linux/x86_64/libopsproto.so /home/hua/Ascend/ascend-toolkit/latest/compiler/lib64/libgraph.so /home/hua/Ascend/ascend-toolkit/latest/compiler/lib64/libge_compiler.so /home/hua/Ascend/ascend-toolkit/latest/compiler/lib64/libgraph_base.so
[cmake] -- 
[cmake] --   Python 3:
[cmake] --     Interpreter:                 /home/hua/anaconda3/envs/py39/bin/python (ver 3.9.17)
[cmake] --     Libraries:                   /home/hua/anaconda3/envs/py39/lib/libpython3.9.so (ver 3.9.17)
[cmake] --     numpy:                       /home/hua/anaconda3/envs/py39/lib/python3.9/site-packages/numpy/core/include (ver 1.25.2)
[cmake] --     install path:                lib/python3.9/site-packages/cv2/python-3.9

Run Test

1	./bin/opencv_test_cannops

OpenCV背景

OpenCV是1998年在Intel公司内的CVL(计算机视觉库)项目，由Gary Bradski发起，并由Vadim Pisarevsky担任技术主管，于1999年开源，2000年首次公开发布。2008年OpenCV的核心成员加入Willow Garage和Itseez公司继续开发。Itseez公司在2016年被Intel收购，核心开发团队重回Intel。

目前主要由Intel公司赞助OpenCV核心开发团队，并且很多OpenCV的开发者是Intel的雇员。这是一个由Intel公司主导，OpenCV.org非盈利基金会运营的开源项目。2019年以来，核心开发团队由Intel，OpenCV中国团队和xperience.ai公司组成。

版本发布策略

目前每6个月发布一次版本，社区没有明确说明每个版本的生命周期。

4.x版本

版本号	发布时间	时间间隔
4.8.0	2023.6.29	\
4.7.0	2022.12.28	6个月
4.6.0	2022.6.7	6个月
4.5.5	2021.12.25	6个月
4.5.4	2021.10.10	2个月
4.5.3	2021.7.6	3个月
4.5.2	2021.4.3	3个月
4.5.1	2020.12.22	4个月
4.5.0	2020.10.12	2个月

4.0.0版本发布于2018.11.18。

3.x版本

版本号	发布时间	时间间隔
3.4.20	2023.6.27（仅打Tag）	\
3.4.19	2022.12.27（仅打Tag）	6个月
3.4.18	2022.6.5（仅打Tag）	6个月
3.4.17	2021.12.25（仅打Tag）	6个月
3.4.16	2021.10.10	2个月
3.4.15	2021.7.6	3个月
3.4.14	2021.4.2	3个月
3.4.13	2020.12.22	4个月
3.4.12	2020.10.12	2个月

3.x版本最新的一个release版本3.4.16发布时间为2021.10，最后一个tag版本3.4.20发布时间为2021.6。

3.0发布于2015.6.24。

2.x版本

2.x版本的最新一个release版本是2.4.13.6，发布于2018.2.26，从发布时间上看，已经不再维护。

社区没有明确说明每个大版本的支持周期

OpenCV基金会

领导团队

Gary Bradski (Itseez, Intel）
Anna Petrovicheva (Intel)
Vladimir Dudnik (Intel)
Stefano Fabri (Deeper)
Tatiana Khanova (Xperience.ai)
Satya Mallick (OpenCV CEO)
Vadim Pisarevsky (Huawei)
Vincent Rabaud (Google)
Edgar Riba (farm-ng)
Aleksandr Voron (N/A)

领导团队（leadership meeting)，每周三 8:00 am, 太平洋时间，通过Hangouts沟通，会议纪要记录在github的wiki上。

开发团队和贡献者社区

github社区活跃开发者

姓名	社区职位	就职公司
Alexander Smorkalov	活跃commitor，合入PR数量众多	Xperience.AI
Vadim Pisarevsky	OpenCV技术负责人	华为
Alexander Alekhin	活跃开发者，reviewer，commitor	Intel
Ilya Lavrenov	活跃开发者	Itseez CTO
Dmitry Kurtaev	活跃开发者，reviewer，commitor	YADRO
... ...

小粒度特性和bug fix可以用issue跟踪，大粒度特性需要有进化提案跟踪。贡献社区需要参考OpenCV社区的贡献指导，所提交的代码需要符合社区编码规范。

领域主席

领域	姓名	单位
RISC-V	Mingjie Xing	中国科学院软件研究所
人脸识别与分析	Weihong Deng	北京邮电大学
人体检测	Andrea Pennisi	University of Antwerp
图像增强	Zhangyang "Atlas" Wang	The University of Texas at Austin
形状检测	Qi Jia	大连理工大学
文档	Dr. Vikas Ramachandra	Columbia University in the City of New York
辅助技术	Jagadish Mahendran	Bovi.ag

官方论坛

社区交流可以在官方论坛上互动。

OpenCV合作伙伴

Intell, OpenCV 白金会员
黄金会员：Microsoft Azure, Google summer of Code, FUTUREWEI, 华为（成为黄金会员方式：捐献十万美金，开发者或者其他资源）
发展合作伙伴：KHADAS, ORBBEC, RunPod

合作联系方式：contact@opencv.ai（美国），admin@opencv.org.cn （中国）

OpenCV中国团队

OpenCV中国团队于2019年9月成立，非营利目的，致力于OpenCV的开发、维护和推广工作。OpenCV中国团队由OpenCV项目发起人Gary Bradski担任团队顾问，OpenCV技术负责人Vadim Pisarevsky领导技术开发，OpenCV中文社区创始人于仕琪博士担任团队负责人。

国内负责人和核心开发成员均为于仕琪博士团队，并且是于仕琪的研究助理，主要社区提交为DNN相关内容。

姓名	职位	github id	OpenCV贡献	社区职位	备注
吴佳	研究助理	kaingwade	1PR 38 ++ 0--	无
母自豪	研究助理，2018级研究生	zihaomu	79PR 42,381 ++ 22,986 --	reviewer
冯远滔	研究助理，2018级研究生	fengyuentau	51PR 7,863 ++ 2,053 --	reviewer	DNN支持CANN后端作者
钟万里	研究助理	WanliZhong	18PR 561 ++ 147 --	reviewer

OpenCV欢迎外部公司合作，可由OpenCV中国团队指导，外部公司软件工程师开发，提交patch到OpenCV项目。可联系中国团队（admin@opencv.org.cn）洽谈。其中提到了在不同硬件平台上的OpenCV的加速，契合昇腾使能的诉求。

社区运作方式

代码仓库和版本控制：OpenCV代码托管在github上，使用git做版本管理，社区成员在这些仓库中提交代码和PR，有reviewer进行代码review，并最终由commitor合入代码。OpenCV有三个主要代码库：
- opencv：opencv主要代码库，包含关键数据结构和成熟算法，HAL方式在此库以3rd_party的方式贡献
- opencv_contrib：opencv扩展模块库，依赖opencv_core，社区提交要求先进入此库，成熟后合入主库。cuda等算法均在此库，命名空间方式以独立模块方式在此库贡献
- opencv_extra：opencv扩展数据库，存放测试数据，供测试使用
讨论和沟通：OpenCV社区使用（邮件列表？未找到）github，论坛进行技术讨论和沟通
问题追踪和解决：OpenCV使用github issue进行问题追踪：
- opencv - 库和稳定模块的一般问题，与构建相关的问题
- opencv_contrib -实验模块及其依赖项的问题
- opencv.org - 官方网站的问题
文档和教程：最新的版本使用Doxygen来展示文档，其中包含了使用教程
社区活动和会议：开发者可以参与谷歌代码之夏活动来提交新的想法和代码，社区领导团队每周三太平洋时间8:00 am在环聊上开展会议，并归档到wiki上。
教育和培训：社区联合Bigision提供了许多在线课程（收费），完成学习后可以获得毕业证书以及优秀证书，针对企业和组织也提供了培训计划。

昇腾接入方式

贡献开源方式

HAL：OpenCV在core，imgproc，feature2d这三个模块都提供了HAL机制，通过include hal_replacement.hpp替换相关宏定义，决定真正的执行后端。
命名空间：使用独立模块，实现与cv相同的函数接口。使用命名空间的方式进行调用。

两种接入方式比较

比较项	HAL	命名空间
实现成本	低，进需要实现定义好的函数接口即可，不涉及Matrix对象，入参出参均为常见数据类型。	较高，需要自定义实现NPU上的Mat结构，代码生成等相关工作。
用户使用成本	对用户完全透明，无缝替换。	用户需要替换函数调用接口，有可能涉及Mat对象转换NPU Mat对象操作。
性能	每个算子执行前后均涉及内存数据搬迁，无法异步执行，效率低。	计算过程中算子无需搬迁，可使用异步执行，效率高。
现有实现	carotene，Nvdia实现的SIMD加速库。	CUDA加速。

Pytorch DDP原理

发表于 2023-06-29

DDP=分布式数据并行(DISTRIBUTED DATA PARALLEL)，和DP一样也是一种数据并行的方法，详细文档可以参考官方手册

DDP使用

以下是使用NPU进行2个节点并行的示例代码。

import os
import torch
import torch_npu
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def example(rank, world_size):
    torch.npu.set_device(rank)
    device = torch.device('npu')
    # create default process group
    dist.init_process_group("hccl", rank=rank, world_size=world_size)
    # create local model
    model = nn.Linear(10, 10).to(device)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    # forward pass
    outputs = ddp_model(torch.randn(20, 10).to(device))
    labels = torch.randn(20, 10).to(device)

    # backward pass
    loss = loss_fn(outputs, labels)
    loss.backward()

    # update parameters
    optimizer.step()

def main():
    world_size = 2
    mp.spawn(example,
        args=(world_size,),
        nprocs=world_size,
        join=True)

if __name__=="__main__":
    # Environment variables which need to be
    # set when using c10d's default "env"
    # initialization mode.
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    main()

DDP原理

文件	功能
torch/nn/parallel/distributed.py	DistributedDataParallel类实现
torch/csrc/distributed/c10d/reducer.cpp	DDP reduce类实现文件
torch/csrc/distributed/c10d/reducer.hpp	DDP reduce接口文件
torch/csrc/distributed/c10d/init.cpp	python C插件注册文件

进程交互

主线程通过torch.multiprocessing，fork出新的进程，满足word_size的要求，然后各个线程独立运行，主进程将自己的host和port放到环境变量中共子进程使用。
各个进程初始化process group，这些进程属于同一个进程组。
每个进程分别创建模型，DDP对象，定义loss函数和优化器。
创建DDP对象时，主进程会同步模型参数，这一步完成后，各个进程的模型一致。
每个模型输入部分数据，这些数据需要调用者自行分割，可以使用DistributedSampler。
然后执行模型的训练过程，每次backward执行完后，调用注册的reducer hook函数，做梯度聚合。

Reducer

普通模型的DDP，除了每次反向执行后的梯度同步之外，进程之间没有其他通信的需求。DDP构造时，会在每个模型参数上配置回调函数(autograd_hook)，当参数的梯度计算完成后，会调用这个回调函数。为了能够一边计算，一边同步梯度数据，DDP将相邻参数分组，每个组叫一个bucket。

这是参数分组和allreduce的官方示意图，回调函数的执行流程如下：

每个参数梯度计算完成后，均会调用回调函数，该回调函数会将参数标记为ready。
然后检查bucket中的pending是否为0，也就是bucket的参数是否全部完成梯度计算，如果完成，则准备进行一步all reduce。
all reduce是一个异步方法，任务执行后不阻塞，继续执行。（如果某个进程的当前bucket没有执行完成，dist backend会等待，执行完成后异步返回）。
当所有的bucket都发起all reduce后，等待每个bucket的all reduce结果，所有结果都返回后，完成本次backward。

参考

DISTRIBUTED DATA PARALLEL

原创深度 PyTorch DDP系列第二篇：实现原理与源代码解析

Pytorch DP原理

发表于 2023-06-21 更新于 2023-06-26

DP(DataParallel)=数据并行，DP框架会将数据切片，然后模型拷贝，并在不同的backend上并行执行。官方手册说明

DP使用

model = LinearModel()
model = torch.nn.DataParallel(model,device_ids=[0,1,2])
criterion = torch.nn.MSELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10):
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

从代码中看到，DP仅支持cuda，xpu或者昇腾，并不支持cpu。

DP原理

文件	功能
torch/nn/parallel/data_parallel.py	DP类实现
torch/nn/parallel/_functions.py	DP中使用的自定义autograd Function
torch/nn/parallel/comm.py	线程间数据传递接口，例如，gather,scatter等
torch/nn/parallel/replicate.py	模型拷贝相关
torch/nn/parallel/parallel_apply.py	多线程执行模型
torch/nn/parallel/scatter_gather.py	scatter和gather的封装
torch/csrc/cuda/python_comm.cpp	C语言实现的数据传递函数extension注册
torch/csrc/cuda/comm.cpp	C语言的数据传递实现

基本原理

DP基于单机多卡，所有的卡都参与训练。相对于非数据并行训练来说，DP会将一个batch数据切分为更小的batch，将数据复制到每一张计算的卡上，然后复制模型到所有的卡上，进行多线程执行，Forward计算完成后，会在device[0]上收集到一组预测值。device[0]计算loss，然后执行Backward计算梯度。（计算Backward没有看到创建线程的过程，单线程执行？？）

DP wapper

回顾DP使用方法，参与数据并行的模型进需要使用DataParalle包一层即可。

1	model = torch.nn.DataParallel(model,device_ids=[0,1,2])

可以理解，下图就是上述代码的实现，DP就是在原本的模型执行之前增加了数据切分(Scatter)，模型复制(Broadcast)，然后将模型的并行执行结果进行合并(Gather)。

数据传递

DP的Forward函数核心代码如下：

def forward(self, *inputs: Any, **kwargs: Any) -> Any:
	    ...
      # 训练数据切分，并拷贝到每一个卡上
      inputs, module_kwargs = self.scatter(inputs, kwargs, self.device_ids)
      
      ...
      # 模型数据拷贝到每一个卡上
      replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
      # 每张卡启动一个线程执行，每个线程执行模型的forward函数
      outputs = self.parallel_apply(replicas, inputs, module_kwargs)
      # 集合所有线程的预测结果
      return self.gather(outputs, self.output_device)

由于是单机单进程，所以数据的拷贝过程相对简单，以上述代码的scatter和broadcast为例，数据切分，然后将切分后的数据移动到指定设备上。但是Tensor还是在同一个数组中。

自定义自动梯度方法

Pytorch实现了自动梯度计算，除了官方实现的梯度计算之外，还能够自定义梯度计算方法，具体描述参考官方说明。以下是代码中对Function类的使用举例说明：

Examples::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_AUTOGRAD)
    >>> class Exp(Function):
    >>>     @staticmethod
    >>>     def forward(ctx, i):
    >>>         result = i.exp()
    >>>         ctx.save_for_backward(result)
    >>>         return result
    >>>
    >>>     @staticmethod
    >>>     def backward(ctx, grad_output):
    >>>         result, = ctx.saved_tensors
    >>>         return grad_output * result
    >>>
    >>> # Use it by calling the apply method:
    >>> # xdoctest: +SKIP
    >>> output = Exp.apply(input)
"""

实现自定义自动梯度方法，需要继承Function类，并实现forward和backward方法，并且这两个方法均是静态的。使用自定义自动梯度方法的时候，不应该调用forward，而是调用Exp.apply，这样会返回forward的结果。相当于自己定义了算子的前向和后向的处理方法。当在执行反向传播的时候，会按反顺序调用自定义自动梯度方法的backward方法。

DP实现了三个自定义自动梯度方法，分别是Broadcast，Gather和Scatter，其反向传播方法内分别调用了Reduce_add，Scatter和Gather，即forward和backward中的动作是成对的。

训练流程

正向

①：将输入的一个batch切分成三个更小的batch
②：将模型的参数复制三份，分别发送到三个device上
③：启动三个线程，每个线程在不同的device上进行原始模型的前向计算
④：将三个线程的输出结果拼接到一起

Loss

在device[0]上计算Loss

反向

④：将Loss切分，分别发送给对应的模型
③：执行三个模型的backward方法，更新梯度（Backward Engine多线程处理）
②：将三个模型上的梯度相加，并储存到Original Module中（直接相加？）
①：将三个模型的梯度信息拼接到一起

官方建议

不建议使用DP来进行数据并行，因为这不是一个真正的并发执行（python GIL），并且，由于数据的转移，反而会减慢模型的执行，建议使用DDP。
所有拷贝来的数据都是一次性的，每次迭代后都会销毁重建，所以这些模型上如果有数据修改，将不会持久化。（Input最后做的gather是为了下一层网络的输入）

参考

Pytorch DATAPARALLEL

Extending torch.autograd

PyTorch 源码解读之 DP & DDP：模型并行和分布式训练解析

[原创][深度][PyTorch] DDP系列第二篇：实现原理与源代码解析

mpi入门

发表于 2023-06-13 更新于 2023-06-21

MPI全称叫做消息传递接口（Message passing interface）

安装mpi4py

Python可以使用mpi4py库，使用pip安装的话需要提前安装好openmpi库：

1	sudo apt-get install libopenmpi-dev

由于Anaconda存在的缘故，pip安装编译会报找不到一些so库。解决方法见issue

不过既然Anaconda存在，可以直接使用conda安装：

1	conda install -c conda-forge mpi4py openmpi

名词介绍

world: 一个进程组，聚集通信在这个进程组中进行
rank: 当前进程（自己）的编号，0是master进程
world_size: 进程组中进程的个数

点对点通信

举例

send(isend 非阻塞)
recv(irecv 非阻塞)

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

recv_data = None

if rank == 0:
    matrix = np.random.randint(-9, 9, size=(10, 10))
    comm.send(matrix, 1)
    print("process {} send matrix {} to other processes".format(rank, matrix))
else:
    matrix_recv = comm.recv()
    print("process {} recv matrix {} from master".format(rank, matrix_recv))

mpiexec -np 2 python mpi_test.py 

process 1 recv matrix 
[[ 0  1 -7  3 -7 -1 -2  2 -3  7]
 [ 2 -6  7  4 -5 -1  0 -4  2  4]
 [ 0  1  3  5  8 -3 -8 -4 -3 -7]
 [ 1 -9 -8  5 -9 -7 -7 -6 -6 -2]
 [-2  3 -3 -4  3  7  0  2  7  8]
 [-3 -6 -4 -5 -5 -1  2 -3 -9  3]
 [-6  6  5  4 -5  8 -2 -2  3  6]
 [ 1  7 -6 -7  0 -2  8  4  7 -6]
 [-2 -7 -5 -4 -9  0 -3  8 -2  7]
 [-3  2  2  6  0 -8 -5  0 -4  5]] from master
 
process 0 send matrix 
[[ 0  1 -7  3 -7 -1 -2  2 -3  7]
 [ 2 -6  7  4 -5 -1  0 -4  2  4]
 [ 0  1  3  5  8 -3 -8 -4 -3 -7]
 [ 1 -9 -8  5 -9 -7 -7 -6 -6 -2]
 [-2  3 -3 -4  3  7  0  2  7  8]
 [-3 -6 -4 -5 -5 -1  2 -3 -9  3]
 [-6  6  5  4 -5  8 -2 -2  3  6]
 [ 1  7 -6 -7  0 -2  8  4  7 -6]
 [-2 -7 -5 -4 -9  0 -3  8 -2  7]
 [-3  2  2  6  0 -8 -5  0 -4  5]] to other processes

聚集通信

举例

bcast
scatter
gather(all_gather所有的进程均收到gather结果)
reduce(all_reduce所有的进程均收到reduce结果)

bcast 广播消息到所有进程组内进程，调用者如果是master，则是发送；其他进程是接收

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

recv_data = None

if rank == 0:
    matrix = np.random.randint(-9, 9, size=(10, 10))
    print("process {} prepare random matrix {}".format(rank, matrix))
else:
    matrix = np.zeros((10, 10), dtype=int)
    print("process {} use empty matrix {}".format(rank, matrix))
matrix = comm.bcast(matrix)
print("process {} bcast matrix {}".format(rank, matrix))

process 1 use empty matrix [[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]
 
process 0 prepare random matrix [[ 5 -8  8  4  1 -7 -8 -6  1  3]
 [-6 -5  6 -1 -8  5  7  2  3  7]
 [-2 -9 -9  7  0  8 -4  2 -8 -1]
 [ 3 -1  3  3 -7  7 -5 -5 -9 -7]
 [ 0 -4 -7 -8 -8 -8  7  2  2  1]
 [-2  0  1  7  6  3  8 -1  1  8]
 [ 6 -4  7 -7  6 -1 -4  6 -6 -9]
 [-3  2  8 -7  8 -8  8  3 -1 -3]
 [ 6  7  2 -2  1 -5  1  2  1 -2]
 [ 2 -3 -2 -3 -7  6  7 -8 -9  4]]
 
process 0 bcast matrix [[ 5 -8  8  4  1 -7 -8 -6  1  3]
 [-6 -5  6 -1 -8  5  7  2  3  7]
 [-2 -9 -9  7  0  8 -4  2 -8 -1]
 [ 3 -1  3  3 -7  7 -5 -5 -9 -7]
 [ 0 -4 -7 -8 -8 -8  7  2  2  1]
 [-2  0  1  7  6  3  8 -1  1  8]
 [ 6 -4  7 -7  6 -1 -4  6 -6 -9]
 [-3  2  8 -7  8 -8  8  3 -1 -3]
 [ 6  7  2 -2  1 -5  1  2  1 -2]
 [ 2 -3 -2 -3 -7  6  7 -8 -9  4]]
 
process 1 bcast matrix [[ 5 -8  8  4  1 -7 -8 -6  1  3]
 [-6 -5  6 -1 -8  5  7  2  3  7]
 [-2 -9 -9  7  0  8 -4  2 -8 -1]
 [ 3 -1  3  3 -7  7 -5 -5 -9 -7]
 [ 0 -4 -7 -8 -8 -8  7  2  2  1]
 [-2  0  1  7  6  3  8 -1  1  8]
 [ 6 -4  7 -7  6 -1 -4  6 -6 -9]
 [-3  2  8 -7  8 -8  8  3 -1 -3]
 [ 6  7  2 -2  1 -5  1  2  1 -2]
 [ 2 -3 -2 -3 -7  6  7 -8 -9  4]]

scatter会将发送的数据拆分，然后分给进程组中的进程

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

recv_data = None

if rank == 0:
    matrix = np.random.randint(-9, 9, size=(10, 10))
    line = np.array_split(matrix, 10)
    print("process {} prepare random matrix {}".format(rank, matrix))
else:
    line = None
one_piece = comm.scatter(line)
print("process {} bcast matrix {}".format(rank, one_piece))

mpiexec --oversubscribe -np 10 python mpi_test.py
# 数据的分组数量和进程数要对应。 默认mpi最大进程数量和cpu相同，可以添加--oversubscribe解除限制

process 0 prepare random matrix [[-9 -2 -6  4  1  7  5  5 -1 -3]
 [-4  6  3  5 -9 -6  7  3  0 -1]
 [-9 -6  2  5 -1  2 -2 -5 -8 -7]
 [ 3 -4 -1 -9 -2  5  4 -8  0  1]
 [ 3  1 -2  6 -4  0 -2 -3  6  8]
 [ 6  5 -8 -7 -3 -7 -5 -8  5  8]
 [-3 -9  2 -2  0 -7  6 -5  5 -2]
 [-6 -4  8  5 -6  6  2 -3 -2  4]
 [-5  7  3 -2  4 -8  3 -5  7 -5]
 [ 0  4 -9 -1 -5 -2 -1 -5  7  7]]
process 0 bcast matrix [[-9 -2 -6  4  1  7  5  5 -1 -3]]
process 2 bcast matrix [[-9 -6  2  5 -1  2 -2 -5 -8 -7]]
process 4 bcast matrix [[ 3  1 -2  6 -4  0 -2 -3  6  8]]
process 6 bcast matrix [[-3 -9  2 -2  0 -7  6 -5  5 -2]]
process 1 bcast matrix [[-4  6  3  5 -9 -6  7  3  0 -1]]
process 3 bcast matrix [[ 3 -4 -1 -9 -2  5  4 -8  0  1]]
process 5 bcast matrix [[ 6  5 -8 -7 -3 -7 -5 -8  5  8]]
process 8 bcast matrix [[-5  7  3 -2  4 -8  3 -5  7 -5]]
process 9 bcast matrix [[ 0  4 -9 -1 -5 -2 -1 -5  7  7]]
process 7 bcast matrix [[-6 -4  8  5 -6  6  2 -3 -2  4]]

gather收集所有分发后的数据，比如，我们需要对scatter分发的所有数据+1，然后再收集回来

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

recv_data = None

if rank == 0:
    matrix = np.random.randint(-9, 9, size=(10, 10))
    line = np.array_split(matrix, 10)
    print("process {} prepare random matrix {}".format(rank, matrix))
else:
    line = None
line = comm.scatter(line)

for item in line:
    item += 1

recv_result = comm.gather(line)

if rank == 0:
    matrix = np.vstack(recv_result)
    print("process {} get calculated matrix {}".format(rank, matrix))

process 0 prepare random matrix 
[[-2  4  0  4 -2 -1  7  3 -1  4]
 [-4 -8 -1 -9  8  7  8 -3 -6  1]
 [ 1 -8  0  6 -6 -8 -7  2 -9 -1]
 [ 7 -7  0 -4 -4  0 -2  4 -7  7]
 [-7  3  0 -8  5 -8 -2  0  7  2]
 [-6 -9 -3 -8  0  0  0 -7 -4 -5]
 [ 2 -6  3 -8  8 -8 -2  0 -2 -9]
 [ 8  8 -8 -8  7  8  4 -3 -9  3]
 [ 8 -4  4  2  8 -2  2  7  0 -6]
 [ 3  1 -3  2 -8 -6 -4 -5  3  0]]
 
process 0 get calculated matrix 
[[-1  5  1  5 -1  0  8  4  0  5]
 [-3 -7  0 -8  9  8  9 -2 -5  2]
 [ 2 -7  1  7 -5 -7 -6  3 -8  0]
 [ 8 -6  1 -3 -3  1 -1  5 -6  8]
 [-6  4  1 -7  6 -7 -1  1  8  3]
 [-5 -8 -2 -7  1  1  1 -6 -3 -4]
 [ 3 -5  4 -7  9 -7 -1  1 -1 -8]
 [ 9  9 -7 -7  8  9  5 -2 -8  4]
 [ 9 -3  5  3  9 -1  3  8  1 -5]
 [ 4  2 -2  3 -7 -5 -3 -4  4  1]]

reduce 将收集到的数据进行规约，例如SUM，MAX等

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

recv_data = None

if rank == 0:
    matrix = np.random.randint(-9, 9, size=(10, 10))
    line = np.array_split(matrix, 10)
    print("process {} prepare random matrix {}".format(rank, matrix))
else:
    line = None
line = comm.scatter(line)
sum_result = comm.reduce(line, op=MPI.SUM)

if rank == 0:
    print("process {} get calculated sum result {}".format(rank, sum_result))

process 0 prepare random matrix 
[[ 2 -9 -9 -8 -9 -8 -7  8 -6  5]
 [ 0  6  6 -2 -6  4 -5 -7  3 -4]
 [-9  2  6 -5 -1 -6  5 -2  3  4]
 [ 7  4 -6 -5  1  4 -5  5 -1  1]
 [ 1  2 -8 -7  5  2 -6 -4  5 -8]
 [-8  0  7  1  0 -8  3 -8 -2  3]
 [ 6  5  2 -7 -6 -3 -7  4  2 -1]
 [ 2 -8  8 -1 -3 -9  0  4 -9 -9]
 [-4 -7 -1 -8  4  7  1  6  4  5]
 [-2  6  3  3  7  7 -4  1 -7 -3]]
 
process 0 get calculated sum result [[ -5   1   8 -39  -8 -10 -25   7  -8  -7]]

DeepSpeed+Ubuntu+CPU

目前CPU支持很有限，仅支持部分推理。环境配置可参考CI配置

安装intel_extension_for_pytorch

1 2	python -m pip install intel_extension_for_pytorch python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu

安装oneCCL

git clone https://github.com/oneapi-src/oneCCL
cd oneCCL
mkdir build
cd build
cmake ..
make
make install
source ./_install/env/setvars.sh

安装Transformers(用于跑用例)

git clone https://github.com/huggingface/transformers
cd transformers
git rev-parse --short HEAD
pip install .

安装DeepSpeed

1	pip install DeepSpeed

ds_report

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
deepspeed_not_implemented  [NO] ....... [OKAY]
deepspeed_ccl_comm ....... [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/hua/anaconda3/lib/python3.10/site-packages/torch']
torch version .................... 2.0.1+cu117
deepspeed install path ........... ['/home/hua/anaconda3/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.9.4+e5fe5f65, e5fe5f65, master
deepspeed wheel compiled w. ...... torch 0.0

DeepSpeed+Ubuntu+GPU

操作系统

华为云上gpu的镜像是16.04的，很多依赖的软件版本过低，建议升级到20.04，DeepSpeedExample有些需要高版本的glibc，可以直接升级到22.04。

sudo apt-get update
sudo apt-get upgrade
sudo apt-get dist-upgrade
sudo do-release-upgrade

GPU驱动和cuda

驱动下载页面选择相应的型号，选择cuda 11.7
CUDA toolkit下载页面选择系统版本，然后下载runfile（使用apt总是会升级驱动和cuda到最新版本，所以直接下载二进制安装）。

pip源

1	pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

安装pytorch

DeepSpeed 0.9.2（pip版本）目前依赖的是pytorch1.13.1，安装对应的版本。

1	pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu117

triton==1.0.0

DeepSpeed依赖triton版本是1.0.0，pip仓库无法直接安装，git上下载源码编译安装。

# 编译依赖
sudo apt-get install llvm-11 llvm-11-*
# 源码安装
wget https://github.com/openai/triton/archive/refs/tags/v1.0.zip
unzip v1.0.zip
cd triton/python
pip install cmake
pip install .

安装DeepSpeed

# 预编译算子(编译不通过：https://github.com/microsoft/DeepSpeed/issues/425)
# workground: Setting NVCC_PREPEND_FLAGS="--forward-unknown-opts"
DS_BUILD_OPS=1 pip install deepspeed --global-option="build_ext" --global-option="-j8"

# JIT_load
pip install deepspeed

ds_report

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
async_io ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
sparse_attn ............ [NO] ....... [OKAY]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/hua/anaconda3/lib/python3.10/site-packages/torch']
torch version .................... 1.13.1+cu117
deepspeed install path ........... ['/home/hua/anaconda3/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.9.2, unknown, unknown
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 11.7
deepspeed wheel compiled w. ...... torch 1.13, cuda 11.7

运行例子

1
2
3

https://github.com/microsoft/DeepSpeedExamples.git
cd DeepSpeedExamples/training/cifar
./run_ds.py

python报错，修改方法：

diff --git a/training/cifar/cifar10_deepspeed.py b/training/cifar/cifar10_deepspeed.py
index 33ea569..d1117c3 100755
--- a/training/cifar/cifar10_deepspeed.py
+++ b/training/cifar/cifar10_deepspeed.py
@@ -159,7 +159,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))
@@ -309,7 +309,7 @@ print('Finished Training')
 # Okay, first step. Let us display an image from the test set to get familiar.

 dataiter = iter(testloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # print images
 imshow(torchvision.utils.make_grid(images))
diff --git a/training/cifar/cifar10_tutorial.py b/training/cifar/cifar10_tutorial.py
index 2154e36..114e8c5 100644
--- a/training/cifar/cifar10_tutorial.py
+++ b/training/cifar/cifar10_tutorial.py
@@ -110,7 +110,7 @@ def imshow(img):

 # get some random training images
 dataiter = iter(trainloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # show images
 imshow(torchvision.utils.make_grid(images))
@@ -219,7 +219,7 @@ torch.save(net.state_dict(), PATH)
 # Okay, first step. Let us display an image from the test set to get familiar.

 dataiter = iter(testloader)
-images, labels = dataiter.next()
+images, labels = next(dataiter)

 # print images
 imshow(torchvision.utils.make_grid(images))

简单模型改写为DeepSpeed

import torch

with_ds = True

if with_ds:
    import deepspeed

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x_data = torch.tensor([[1.0], [2.0], [3.0]]).to(device)
y_data = torch.tensor([[2.0], [4.0], [6.0]]).to(device)


class LinearModel(torch.nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.linear = torch.nn.Linear(1, 1).to(device)

    def forward(self, x):
        y_pred = self.linear(x)
        return y_pred


model = LinearModel()
criterion = torch.nn.MSELoss(size_average=False)

if not with_ds:
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
else:
    ds_config = {
        "train_micro_batch_size_per_gpu": 2,
        "optimizer": {
            "type": "SGD",
            "params": {
                "lr": 1e-2
            }
        },
    }

    model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config)

for epoch in range(10):
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    print(epoch, loss.item())

    if not with_ds:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    else:
        model.backward(loss)
        model.step()

print('w = ', model.linear.weight.item())
print('b = ', model.linear.bias.item())

x_test = torch.tensor([[4.0]]).to(device)
y_test = model(x_test)
print('y_pred = ', y_test.data)

DeepSpeed基本执行流程

这个简单的模型实际上还是完全调用的pytorch的函数，ds包的wapper其实啥都没干，完全透传

首先获取Accelerator（加速器），判断能不能import intel_extension_for_deepspeed，如果能就用XPU，如果不能就用CUDA（Accelerator将设备管理，内存管理，Tensor等等进行了抽象，不同的后端设备继承实现），高版本这部分逻辑有改变，可以通过环境变量控制。切入点1，这里需要判断是否能够使用npu
选择并行计算后端，如果Torch.distributed已经初始化，则使用直接使用，检查是否在Aure或者aws机器上，针对这些机器做环境变量配置，否则尝试寻找mpi，然后根据Accelerator的comm backend类型初始化TorchBackend.切入点2，这里需要针对华为云机器做专门的环境变量配置，以及支持昇腾并行后端
解析ds的配置文件
创建ds引擎
1. 检查环境变量，配置dist相关配置，包括rank，world size 等等
2. 用dist分发模型参数，所有进程同步模型参数
3. 根据配置创建optimizer，例如，上例中，就会生成torch自带的SGD优化器，也可以指定ds提供的Adam，lamb等优化器
4. 配置checkpoint
5. 编译Utils，就是flateen_unflateen.cpp
执行forward，计算损失 切入点3，以下基本调用的都是pytorch的optimizer，这些需要pytorch有昇腾支持，另外，DS提供的优化器也需要昇腾支持
反向传播
1. 梯度累加
2. 根据是否使用zero优化，自动精度，混合精度等调用optimizer其他wapper的backward，并传入合适的参数
3. 多进程计算梯度并收集结果
更新参数
1. 如果到了梯度累加的预制，根据不同的配置，最终调用optimizer.step更新参数，并清空梯度信息

神经网络

通俗理解，神经网络就是给一组输入，经过神经网络的运算后，得到一个在误差范围内的结果。模型的选择和对模型中各个神经元的系数的确定决定了模型的输出结果，为了得到期望的结果，需要有多组输入，和对应的正确结果，用这些数据来训练模型，确定合适的参数组合。参数确定后，即可预测新的输入对应的输出。

举一个简单的例子，两个神经元组成一个一层的神经网络，给定的两个输入，能够得到一个结果。

前向传播

通俗理解前向传播，就是根据输入计算输出的过程。按上述例子来说，就是 \[ net_o = i1×w1+b1 + i2×w2+b2 \] 假定输入数据为：

1
2
3

i1 = 0.05, i2 = 0.10;
w1 = 0.15, w2 = 0.20;
b1 = 0.35, b2 = 0.60.

那么： \[ net_o = i1×w1+b1 + i2×w2+b2 = 0.05×0.15+0.35+0.10*0.20+0.60=0.9775 \] 假定选择sigmoid函数作为激活函数，那么 \[ out_o = \frac{1}{1+\epsilon^{net_o}}=\frac{1}{1+\epsilon^{0.9775}}=0.2734 \]

如果此时我们期望的输出为：

o=0.01

误差为 \[ o-out_o=0.01-0.2734=-0.2634 \]

梯度下降法

考虑一个抛物线函数： \[ y=f(x)^2 \]

X轴为权值，Y轴为误差，那么我们能计算得到曲线斜率为0的位置为最低位置（x=0），如果是一个复杂的多维函数，不容易直接通过计算得出曲线或者平面的最低位置，所以可以使用梯度下降法来通过迭代计算最低位置。相当于放一个小球，小球会在重力的影响下，沿最陡的方向下降。

假设目前在x=10的点上(x0)，梯度为： \[ \nabla f(x_0)=f'(x0)=2x=20 \] 那梯度的反方向就是下降率最快的方向，如果步长为η=0.2，那么新的位置x1为： \[ x_1 = x_0 - \eta×\nabla f(x_0)=10-0.2*20=6 \] 然后继续迭代，直到梯度曲线趋近于0.

	x0	x1	x2	x3	x4	x5	x6	X7	x8	x9	X10
梯度	20	12	7,2	4.32	2.59	1.56	0.93	0.56	0.34	0.2	0.12

注意步长的选择，如果步长太小，那么经过多次迭代仍然无法达到曲线或者平面的最低点，如果步长过长，那么或跳过最低点，从而不收敛。

同理，可以从二维（一元函数，曲线）推广到三维（二元函数，平面）甚至多维，目标就是找到最陡的反向，然后按步长走到下一个位置，然后一直迭代下去。

参考文档

反向传播

假设使用均方误差（MSE）来作为误差评估方法，那么误差为(这个是MSE么???)： \[ E=\Sigma\frac{1}{2}(o-out_o)^2 \] 由于这个例子中只有一个输出，那么误差为： \[ E=\frac{1}{2}(0.01-0.2734)^2=0.03468978 \] 反向传播就是通过误差来计算当前参数的梯度，然后沿梯度下降的方向走一个步长η，迭代此过程。

根据正向传播的函数，能够得到： \[ out_o = \frac{1}{1+\epsilon^{net_o}} = \frac{1}{1+\epsilon^{(i1×w1+b1 + i2×w2+b2)}} \]

\[ E=\Sigma\frac{1}{2}(o-out_o)^2=\Sigma\frac{1}{2}(o-\frac{1}{1+\epsilon^{(i1×w1+b1 + i2×w2+b2)}})^2 \]

我们要计算梯度，也就是要计算总的误差E在w1和w2的偏导值。

这个公式比较复杂，所以要使用到求导的链式法则。

链式法则

\[ \frac{\partial E}{\partial w1}=\frac{\partial E}{\partial out_o}×\frac{\partial out_o}{\partial net_o}×\frac{\partial net_o}{\partial w1} \]

那么，上述复杂函数的求导就可以转换成几个简单函数求导的乘积。 \[ \frac{\partial E}{\partial out_0}=2×\frac{1}{2}(o-out_o)^1*-1+0=(0.01-0.2734)*-1=0.2634 \]

\[ \frac{\partial out_o}{\partial net_o}=out_o(1-out_o)=0.2734(1-0.2734)=0.19865244 \]

\[ \frac{\partial net_o}{\partial w1}=i1=0.05 \]

相乘可得： \[ \frac{\partial E}{\partial w1}=0.2634×0.19865244×0.05=0.0002616 \] 同理： \[ \frac{\partial E}{\partial w2}=0.2634×0.19865244×0.10=0.0052325 \] 如果步长η=0.2，那么： \[ w1' = w1 - \eta×\frac{\partial E}{\partial w1} =0.004994768 \]

\[ w2' = w2 - \eta×\frac{\partial E}{\partial w2} =0.0989535 \]

然后，使用更新后的权值做下一步迭代。

参考文档

Python C API && pybind11

发表于 2023-06-06

Python C API

什么是Python的C语言扩展

为什么要使用C API ？

C语言实现的复杂计算效率高；
已经有成熟的C语言编写的函数库；
和操作系统相关的操作只能使用C语言实现。

为什么Python能够使用C模块 ？

cpython虚拟机是由C语言编写；
使用动态链接库的方式可以直接调用模块函数。

Python调用C模块都有哪些方法 ？

C api；
pybind11；
ctypes；
SWIG；
Cython。

我理解的C模块调用原理

模块加载

python interpreter

  Module
    |
   \/                       +-----------+
Load lib(libsum.so) ------> |PyModuleDef|
                            +-----------+          +-----------+
                            |PyMethodDef|  ------> |PyMethodDef|
                            +-----------+          +-----------+
                                                   |C function | ------> sum(c function)
                                                   +-----------+

模块内函数调用

python --   call python function
              |
              \/
       --  PyArg_ParseTuple
       |      |
       |      \/
 C     |  call c module's function
       |      |
       |      \/
       --  Py_BuildValue
              |
              \/
 python -- python function return

简单示例

libsum.cc

#include <Python.h>

// 实际业务函数体
int sum(int a, int b) {
    return a + b;
}

/**
 *   业务函数的包装函数，该函数注册到PyMethodDef中，供Python虚拟机调用
 *   传入两个参数，self和args，args中的参数需要从对象中解析出来。
 *   PyArg_ParseTuple接收一个格式串，根据格式传将参数从PyObject中解析出来
 *   返回一个整数，需要包装成PyObject类型。
 **/
static PyObject* sum_wrapper(PyObject* self, PyObject* args) {
    int a, b;
    if (!PyArg_ParseTuple(args, "ii", &a, &b)) {
        return NULL;
    }
    int result = sum(a, b);
    return Py_BuildValue("i", result);
}

/**
 *   描述需要注册到模块中的函数，每个函数定义一行，以{NULL, NULL, 0, NULL}结束
 *   每行结构有4个参数，分别为：python调用的函数名，实际调用的包装函数；传入参数的类型；
 *                          函数的描述。
 **/
static PyMethodDef SumMethods[] = {
    {"sum", sum_wrapper, METH_VARARGS, "Calculate the sum of two integers."},
    {NULL, NULL, 0, NULL}
};

/**
 *   描述模块属性，参数分别为：固定参数；模块名，就是import后的名字；模块描述；？？？；
 *                         函数数组。
 **/
static struct PyModuleDef summodule = {
    PyModuleDef_HEAD_INIT,
    "libsum",
    "A module that adds two numbers",
    -1,
    SumMethods
};

// 初始化模块，以PyInit_模块名 命名
PyMODINIT_FUNC PyInit_libsum(void) {
    return PyModule_Create(&summodule);
}

py_sum.py

import libsum

a = 1
b = 2
c = libsum.sum(a, b)

print(f"{a} + {b} = {c}")

setup.py

from distutils.core import setup, Extension

libsum_module = Extension('libsum',
                    sources = ['libsum.cc'],
                    extra_compile_args=['-g'])

setup (name = 'libsum',
       version = '1.0',
       description = 'This is a libsum module',
       ext_modules = [libsum_module])

python setup.py build_ext --inplace

生成
libsum.cpython-310-x86_64-linux-gnu.so

可以直接被python import进来

数组和自定义类型

数组

double avg(double *arr, size_t len) {
    double sum = 0;
    for (size_t i = 0; i < len; i++) {
        sum += arr[i];
    }
    return (sum/len);
}

static PyObject *avg_wrapper(PyObject *self, PyObject *args) {
  PyObject *bufobj;
  Py_buffer view;
  double result;
  if (!PyArg_ParseTuple(args, "O", &bufobj)) {
    return NULL;
  }

  // 数组内存是连续的
  if (PyObject_GetBuffer(bufobj, &view,
      PyBUF_ANY_CONTIGUOUS | PyBUF_FORMAT) == -1) {
    return NULL;
  }
  
  // 判断是一维数组
  if (view.ndim != 1) {
    PyErr_SetString(PyExc_TypeError, "Expected a 1-dimensional array");
    PyBuffer_Release(&view);
    return NULL;
  }

  // 检查是否是double类型的数组
  if (strcmp(view.format,"d") != 0) {
    PyErr_SetString(PyExc_TypeError, "Expected an array of doubles");
    PyBuffer_Release(&view);
    return NULL;
  }

  // View.buf是double数组的首指针，view.shape[0]是这一维度的长度
  result = avg((double*)view.buf, view.shape[0]);

  // 需要显式释放Py_buffer
  PyBuffer_Release(&view);
  return Py_BuildValue("d", result);
}

自定义类型

// 定义C++类型
class DoubleWarpper {
 private:
  double m_value;

 public:
  DoubleWarpper(double value) : m_value(value) {}
  virtual ~DoubleWarpper() { m_value = 0; }

  double get() { return m_value; }
  void set(double value) { m_value = value; }

  DoubleWarpper operator+(const DoubleWarpper &other) const {
    return DoubleWarpper(m_value + other.m_value);
  }
  DoubleWarpper operator-(const DoubleWarpper &other) const {
    return DoubleWarpper(m_value - other.m_value);
  }
  DoubleWarpper operator*(const DoubleWarpper &other) const {
    return DoubleWarpper(m_value * other.m_value);
  }
  DoubleWarpper operator/(const DoubleWarpper &other) const {
    return DoubleWarpper(m_value / other.m_value);
  }
};

// 包装成Python C API需要的结构
typedef struct {
  PyObject_HEAD;
  DoubleWarpper *warpper = nullptr;
} PyDoubleWarpper;

// Python对象构造时调用，需要返回一个PyDoubleWarpper类型的对象。 kwds？？？
static PyObject *pycal_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
  PyDoubleWarpper *self;
  self = (PyDoubleWarpper *)type->tp_alloc(type, 0);
  char *kwlist[] = {"value", 0};
  double value = 0;
  if (!PyArg_ParseTupleAndKeywords(args, kwds, "d", kwlist, &value)) {
    Py_DECREF(self);
    return nullptr;
  }
  self->warpper = new DoubleWarpper(value);
  return (PyObject *)self;
}

// Python对象销毁时调用
static void *pycal_dealloc(PyObject *py_cal) {
  delete ((PyDoubleWarpper *)py_cal)->warpper;
  Py_TYPE(py_cal)->tp_free(py_cal);
  return (void *)0;
}

// 从PyObject中获取Calculator指针
static DoubleWarpper *get_cal(PyObject *obj) {
  return ((PyDoubleWarpper *)obj)->warpper;
}

// 通过Calculator指针构造PyObject
static PyObject *return_cal(DoubleWarpper *cal, PyTypeObject *type) {
  PyDoubleWarpper *obj = PyObject_NEW(PyDoubleWarpper, type);
  obj->warpper = cal;
  return (PyObject *)obj;
}

// Calculator类提供的函数的包装
static PyObject *pycal_set(PyObject *self, PyObject *args) {
  DoubleWarpper *cal = get_cal(self);
  double value = 0;
  if (!PyArg_ParseTuple(args, "d", &value)) {
    return nullptr;
  }

  cal->set(value);
  return Py_BuildValue("i", 0);
}

// 打印对象
static PyObject *pycal_str(PyObject *self) {
  DoubleWarpper *cal = get_cal(self);
  std::stringstream ss;
  ss<<cal->get();
  return Py_BuildValue("s", ss.str().c_str());
}

// 实现加减乘除
static PyObject *pycal_add(PyObject *a, PyObject *b) {
  DoubleWarpper *cal_a = get_cal(a);
  DoubleWarpper *cal_b = get_cal(b);
  DoubleWarpper *ret = new DoubleWarpper(*cal_a + *cal_b);
  return return_cal(ret, a->ob_type);
}

static PyObject *pycal_minus(PyObject *a, PyObject *b) {
  DoubleWarpper *cal_a = get_cal(a);
  DoubleWarpper *cal_b = get_cal(b);
  DoubleWarpper *ret = new DoubleWarpper(*cal_a - *cal_b);
  return return_cal(ret, a->ob_type);
}

static PyObject *pycal_multipy(PyObject *a, PyObject *b) {
  DoubleWarpper *cal_a = get_cal(a);
  DoubleWarpper *cal_b = get_cal(b);
  DoubleWarpper *ret = new DoubleWarpper(*cal_a * *cal_b);
  return return_cal(ret, a->ob_type);
}

static PyObject *pycal_divide(PyObject *a, PyObject *b) {
  DoubleWarpper *cal_a = get_cal(a);
  DoubleWarpper *cal_b = get_cal(b);
  DoubleWarpper *ret = new DoubleWarpper(*cal_a / *cal_b);
  return return_cal(ret, a->ob_type);
}

// 对象的数字属性，这里仅实现了加减乘除
static PyNumberMethods numberMethods = {
    pycal_add,      // nb_add
    pycal_minus,    // nb_subtract;
    pycal_multipy,  // nb_multiply
    nullptr,        // nb_remainder;
    nullptr,        // nb_divmod;
    nullptr,        // nb_power;
    nullptr,        // nb_negative;
    nullptr,        // nb_positive;
    nullptr,        // nb_absolute;
    nullptr,        // nb_bool;
    nullptr,        // nb_invert;
    nullptr,        // nb_lshift;
    nullptr,        // nb_rshift;
    nullptr,        // nb_and;
    nullptr,        // nb_xor;
    nullptr,        // nb_or;
    nullptr,        // nb_int;
    nullptr,        // nb_reserved;
    nullptr,        // nb_float;
    nullptr,        // nb_inplace_add;
    nullptr,        // nb_inplace_subtract;
    nullptr,        // nb_inplace_multiply;
    nullptr,        // nb_inplace_remainder;
    nullptr,        // nb_inplace_power;
    nullptr,        // nb_inplace_lshift;
    nullptr,        // nb_inplace_rshift;
    nullptr,        // nb_inplace_and;
    nullptr,        // nb_inplace_xor;
    nullptr,        // nb_inplace_or;
    nullptr,        // nb_floor_divide;
    pycal_divide,   // nb_true_divide;
    nullptr,        // nb_inplace_floor_divide;
    nullptr,        // nb_inplace_true_divide;
    nullptr,        // nb_index;
    nullptr,        // nb_matrix_multiply;
    nullptr         // nb_inplace_matrix_multiply;

};

// PyCalculator对象的成员函数
static PyMethodDef pycal_methods[] = {
    {"set", (PyCFunction)pycal_set, METH_VARARGS, "set DoubleWarpper value."},
    {nullptr}};

// PyCalculator对象内容
static PyTypeObject DoubleWarpperType = {
    PyVarObject_HEAD_INIT(nullptr, 0) "libsum.DoubleWarpper", /* tp_name */
    sizeof(PyDoubleWarpper),                                  /* tp_basicsize */
    0,                                                        /* tp_itemsize */
    (destructor)pycal_dealloc,                                /* tp_dealloc */
    0,                                        /* tp_vectorcall_offset */
    nullptr,                                  /* tp_getattr */
    nullptr,                                  /* tp_setattr */
    nullptr,                                  /* tp_reserved */
    nullptr,                                  /* tp_repr */
    &numberMethods,                           /* tp_as_number */
    nullptr,                                  /* tp_as_sequence */
    nullptr,                                  /* tp_as_mapping */
    nullptr,                                  /* tp_hash  */
    nullptr,                                  /* tp_call */
    pycal_str,                                /* tp_str */
    nullptr,                                  /* tp_getattro */
    nullptr,                                  /* tp_setattro */
    nullptr,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    "Coustom DoubleWarpper class.",           /* tp_doc */
    nullptr,                                  /* tp_traverse */
    nullptr,                                  /* tp_clear */
    nullptr,                                  /* tp_richcompare */
    0,                                        /* tp_weaklistoffset */
    nullptr,                                  /* tp_iter */
    nullptr,                                  /* tp_iternext */
    pycal_methods,                            /* tp_methods */
    nullptr,                                  /* tp_members */
    nullptr,                                  /* tp_getset */
    nullptr,                                  /* tp_base */
    nullptr,                                  /* tp_dict */
    nullptr,                                  /* tp_descr_get */
    nullptr,                                  /* tp_descr_set */
    0,                                        /* tp_dictoffset */
    nullptr,                                  /* tp_init */
    nullptr,                                  /* tp_alloc */
    pycal_new                                 /* tp_new */
};


PyMODINIT_FUNC PyInit_libsum(void) {
  if (PyType_Ready(&DoubleWarpperType) < 0) {
    return nullptr;
  }

  PyObject *module = PyModule_Create(&summodule);
  if (module == nullptr) {
    return nullptr;
  }

  // 注册对象，这里为啥要引用+1？？
  Py_INCREF(&DoubleWarpperType);
  if (PyModule_AddObject(module, "DoubleWarpper",
                         (PyObject *)&DoubleWarpperType) < 0) {
    Py_DECREF(&DoubleWarpperType);
    Py_DECREF(module);
    return nullptr;
  }

  return module;
}

执行结果

import array
import libsum

print ("Simple case:")
a = 1
b = 2
c = libsum.sum(a, b)
print(f"{a} + {b} = {c}\n")

print("Pass array:")
d = libsum.avg(array.array('d',[1.0,2.0,3.0,4.0,5.0]))
print(f"avg = {d}\n")

print ("Define new struct:")
e = libsum.DoubleWarpper(1);
f = libsum.DoubleWarpper(2);
print (f"e = {e}, f = {f}")

e.set(20);
f.set(10);
print (f"after set: e = {e}, f = {f}")

g = e+f;
h = e-f;
i = e*f;
j = e/f;
print (f"{e} + {f} = {g}")
print (f"{e} - {f} = {h}")
print (f"{e} * {f} = {i}")
print (f"{e} / {f} = {j}\n")

(base) hua@hfc-ascend:~/code/share$ python py_sum.py
Simple case:
1 + 2 = 3

Pass array:
avg = 3.0

Define new struct:
e = 1, f = 2
after set: e = 20, f = 10
20 + 10 = 30
20 - 10 = 10
20 * 10 = 200
20 / 10 = 2

pybind 11

简单例子

#include <Python.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <vector>

namespace py = pybind11;

int sum(int a, int b) { return a + b; }

double avg(std::vector<double> &arr) {
  double sum = 0;
  for (auto it = arr.begin(); it != arr.end(); it++) {
    sum += (*it);
  }
  return (sum / arr.size());
}

PYBIND11_MODULE(libsum, m) {
    m.doc() = "Py module example.";
    m.def("sum", &sum, "Calculate the sum of two integers.");
    m.def("avg", &avg, "alculate the avg of a double array.");
}

自定义类型

class DoubleWarpper {
 private:
  double m_value;

 public:
  DoubleWarpper(double value) : m_value(value) {}
  virtual ~DoubleWarpper() { m_value = 0; }

  double get() { return m_value; }
  void set(double value) { m_value = value; }
  DoubleWarpper operator+(const DoubleWarpper &other) const{
    return DoubleWarpper(m_value + other.m_value);
  }
  DoubleWarpper operator-(const DoubleWarpper &other) const{
    return DoubleWarpper(m_value - other.m_value);
  }
  DoubleWarpper operator*(const DoubleWarpper &other) const{
    return DoubleWarpper(m_value * other.m_value);
  }
  DoubleWarpper operator/(const DoubleWarpper &other) const{
    return DoubleWarpper(m_value / other.m_value);
  }

};

PYBIND11_MODULE(libsum, m) {
    m.doc() = "Py module example.";
    m.def("sum", &sum, "Calculate the sum of two integers.");
    m.def("avg", &avg, "alculate the avg of a double array.");

    py::class_<DoubleWarpper>(m, "DoubleWarpper")
      .def(py::init<double>())
      .def("set", &DoubleWarpper::set)
      .def("get", &DoubleWarpper::get)
      //运算符重载
      .def(py::self + py::self)
      .def(py::self - py::self)
      .def(py::self * py::self)
      .def(py::self / py::self)
      .def("__repr__",
          [](DoubleWarpper &warpper) {
            std::stringstream ss;
            ss<<warpper.get();
            return ss.str().c_str();
          }
      );
}

toml

pyproject.toml

# pyproject.toml
[build-system]
requires = ["setuptools>=61.0", "cython"]
build-backend = "setuptools.build_meta"

[project]
name        = "libsum"
description = "This is a libsum module"
version     = "0.0.1"
readme      = "README.md"
requires-python = ">=3.10"
authors = [
  { name="huafengchun", email="huafengchun@huawei.com" },
]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
keywords = ["sum", "avg"]

[project.urls]
"Homepage" = "w3.huawei.com"

[tool.setuptools]
py-modules = ["_custom_build"]

[tool.setuptools.cmdclass]
build_py = "_custom_build.build_py"

# _custom_build.py

from setuptools import Extension
from setuptools.command.build_py import build_py as _build_py

class build_py(_build_py):
    def run(self):
        self.run_command("build_ext")
        return super().run()

    def initialize_options(self):
        super().initialize_options()
        if self.distribution.ext_modules == None:
            self.distribution.ext_modules = []

        self.distribution.ext_modules.append(

            Extension(
                "libsum",
                sources=["libsum.c"],
                extra_compile_args=['-g', '-I/home/hua/anaconda3/include/python3.10', '-I/home/hua/anaconda3/lib/python3.10/site-packages/pybind11/include'],
            )
        )

python -m build

base) hua@hfc-ascend:~/code/share$ ll
total 2488
drwxrwxr-x 7 hua hua    4096 Jun  5 09:20 ./
drwxrwxr-x 5 hua hua    4096 May 30 11:08 ../
drwxrwxr-x 4 hua hua    4096 Jun  5 09:17 build/
-rw-rw-r-- 1 hua hua     716 Jun  5 09:16 _custom_build.py
drwxrwxr-x 2 hua hua    4096 Jun  5 09:16 dist/
-rw-rw-r-- 1 hua hua    9015 May 29 19:14 libsum.cc
-rwxrwxr-x 1 hua hua 2485776 Jun  5 09:17 libsum.cpython-310-x86_64-linux-gnu.so*
drwxrwxr-x 2 hua hua    4096 Jun  5 09:16 libsum.egg-info/
-rw-rw-r-- 1 hua hua    1715 May 29 19:33 libsum_pybind11.cc
drwxrwxr-x 2 hua hua    4096 Jun  5 09:16 __pycache__/
-rw-rw-r-- 1 hua hua     660 Jun  5 09:16 pyproject.toml
-rw-rw-r-- 1 hua hua     535 May 29 19:17 py_sum.py
-rw-rw-r-- 1 hua hua     477 May 29 19:16 setup.py
drwxrwxr-x 2 hua hua    4096 May 30 11:32 .vscode/

Hello World