
Commit 2ae5902

Merge branch 'dev' into dev-hardware

2 parents aff3d2b + 817cc8f
24 files changed: +1,100 −47 lines
src/04kernel/cuda/include/kernel/cuda/pad.cuh

+22

@@ -0,0 +1,22 @@
#ifndef KERNEL_CUDA_PAD_CUH
#define KERNEL_CUDA_PAD_CUH

#include "threads_distributer.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

    struct PadDimInfo {
        unsigned int strideI, strideO, padS, dimI;
    };

    void launchPad(
        KernelLaunchParameters const &,
        uint8_t const *src, uint8_t const *src_const,
        PadDimInfo const *dims, void *output,
        unsigned int rank,
        unsigned int blockSize);

}// namespace refactor::kernel::cuda

#endif// KERNEL_CUDA_PAD_CUH

src/04kernel/cuda/include/kernel/cuda/slice.cuh

+2-2
@@ -5,14 +5,14 @@

 namespace refactor::kernel::cuda {

-    struct DimInfo {
+    struct SliceDimInfo {
         unsigned int strideO, skip;
         int strideI;
     };

     void launchSlice(
         KernelLaunchParameters const &,
-        void const *src, DimInfo const *dims, void *output,
+        void const *src, SliceDimInfo const *dims, void *output,
         unsigned int rank,
         unsigned int blockSize);


src/04kernel/cuda/src/pad.cu

+63
@@ -0,0 +1,63 @@
#include "kernel/cuda/pad.cuh"
#include "macro.cuh"
#include <cstdint>

namespace refactor::kernel::cuda {

    __global__ static void padKernel(
        unsigned long long n,
        uint8_t const *__restrict__ src,
        uint8_t const *__restrict__ src_const,
        PadDimInfo const *__restrict__ dims,
        uint8_t *__restrict__ dst,
        unsigned int rank,
        unsigned int blockSize) {
        // grid-stride loop: each thread handles one output block per iteration
        for (auto tid = blockIdx.x * blockDim.x + threadIdx.x,
                  step = blockDim.x * gridDim.x;
             tid < n;
             tid += step) {
            long rem = tid, j = 0;
            bool flag = false;
            // decompose the output block index into per-axis coordinates;
            // a coordinate falling outside the input extent means this block is padding
            for (auto i = 0; i < rank; ++i) {
                auto strideO = __ldg(&(dims[i].strideO));
                auto strideI = __ldg(&(dims[i].strideI));
                auto padS = __ldg(&(dims[i].padS));
                auto dimI = __ldg(&(dims[i].dimI));
                auto pos = rem / strideO - padS;
                if (pos < 0 || pos >= dimI) {
                    flag = true;
                    break;
                }
                j += pos * strideI;
                rem %= strideO;
            }
            if (flag) {
                optimizedMemcpy(dst + tid * blockSize, src_const, blockSize);
            } else {
                optimizedMemcpy(dst + tid * blockSize, src + j * blockSize, blockSize);
            }
        }
    }

    void launchPad(
        KernelLaunchParameters const &params,
        uint8_t const *src, uint8_t const *src_const,
        PadDimInfo const *dims, void *output,
        unsigned int rank,
        unsigned int blockSize) {

        padKernel<<<
            params.gridSize,
            params.blockSize,
            0,
            reinterpret_cast<cudaStream_t>(params.stream)>>>(
            params.n,
            src,
            src_const,
            dims,
            reinterpret_cast<uint8_t *>(output),
            rank,
            blockSize);
    }

}// namespace refactor::kernel::cuda
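
The index arithmetic in padKernel is easiest to see on a concrete case. The following standalone host-side sketch (illustrative only, not part of this commit; Dim mirrors PadDimInfo) replays the same loop for a 2x2 input padded by one column on each side of the last axis, giving a 2x4 output:

#include <cstdio>

struct Dim { unsigned strideI, strideO, padS, dimI; };// mirrors PadDimInfo

int main() {
    // input 2x2, one pad column on each side of the last axis -> output 2x4
    Dim const dims[]{{2, 4, 0, 2}, {1, 1, 1, 2}};
    for (long tid = 0; tid < 8; ++tid) {// 8 output blocks
        long rem = tid, j = 0;
        bool pad = false;
        for (auto const &d : dims) {
            long pos = rem / d.strideO - d.padS;
            if (pos < 0 || pos >= (long) d.dimI) {
                pad = true;
                break;
            }
            j += pos * d.strideI;
            rem %= d.strideO;
        }
        pad ? std::printf("dst[%ld] <- constant\n", tid)
            : std::printf("dst[%ld] <- src[%ld]\n", tid, j);
    }
}

This prints constant for output columns 0 and 3 of each row and src[0..3] for the interior, i.e. every block whose coordinate falls outside [0, dimI) on some axis is filled from the constant, all others copy from input offset j.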

src/04kernel/cuda/src/slice.cu

+2-2
@@ -7,7 +7,7 @@ namespace refactor::kernel::cuda {
     __global__ static void sliceKernel(
         unsigned long long n,
         uint8_t const *__restrict__ src,
-        DimInfo const *__restrict__ dims,
+        SliceDimInfo const *__restrict__ dims,
         uint8_t *__restrict__ dst,
         unsigned int rank,
         unsigned int blockSize) {
@@ -29,7 +29,7 @@ namespace refactor::kernel::cuda {

     void launchSlice(
         KernelLaunchParameters const &params,
-        void const *src, DimInfo const *dims, void *output,
+        void const *src, SliceDimInfo const *dims, void *output,
         unsigned int rank,
         unsigned int blockSize) {
         sliceKernel<<<
src/04kernel/include/kernel/attributes/pad_info.h

+62

@@ -0,0 +1,62 @@
#ifndef KERNEL_PAD_ATTRIBUTES_H
#define KERNEL_PAD_ATTRIBUTES_H

#include "../tensor.h"
#include "common.h"

namespace refactor::kernel {

    struct PadType {
        enum : uint8_t {
            Constant,
            Reflect,
            Edge,
            Wrap,
        } type;

        constexpr PadType() noexcept
            : type(Constant) {}
        constexpr PadType(decltype(type) type_) noexcept
            : type(type_) {}
        constexpr operator decltype(type)() const noexcept {
            return type;
        }
        constexpr std::string_view toString() const noexcept {
            switch (type) {
                case Constant:
                    return "Constant";
                case Reflect:
                    return "Reflect";
                case Edge:
                    return "Edge";
                case Wrap:
                    return "Wrap";
                default:
                    UNREACHABLE();
            }
        }
    };

    namespace pad {
        struct Dim {
            int64_t dimI, dimO, pads;
        };
    }// namespace pad

    using PadDimension = std::vector<pad::Dim>;

    struct PadInfo {
        struct Dim {
            dim_t strideI, strideO, padS, dimI;
        };
        std::vector<Dim> dims;
        dim_t blockCount, blockSize;

        PadInfo(decltype(dims), dim_t, dim_t) noexcept;
        PadInfo(PadDimension, Tensor const &);
        void reform(dim_t) noexcept;
    };

}// namespace refactor::kernel

#endif// KERNEL_PAD_ATTRIBUTES_H
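
For a concrete reading of pad::Dim (assuming, per the kernel's pos = rem / strideO - padS, that pads counts the leading pad elements), here is a standalone sketch, with illustrative values, that mirrors the type and describes padding a 2x3 tensor to 2x5:

#include <cstdint>
#include <vector>

struct Dim { int64_t dimI, dimO, pads; };// mirrors refactor::kernel::pad::Dim
using PadDimension = std::vector<Dim>;

int main() {
    // pad one element before and after the last axis: 2x3 -> 2x5
    PadDimension dims{
        {2, 2, 0},// axis 0: unpadded, dimI == dimO
        {3, 5, 1},// axis 1: dimI = 3, dimO = 5, 1 leading pad element
    };
    return dims.size() == 2 ? 0 : 1;
}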

src/04kernel/include/kernel/collectors/pad.h

+21

@@ -0,0 +1,21 @@
#ifndef KERNEL_PAD_H
#define KERNEL_PAD_H

#include "../attributes/pad_info.h"
#include "../collector.h"

namespace refactor::kernel {

    struct PadCollector final : public InfoCollector {
        PadDimension dims;
        PadType mode;

        explicit PadCollector(decltype(_target) target, PadDimension const &dims_, PadType mode_) noexcept
            : InfoCollector(target), dims(std::move(dims_)), mode(mode_) {}

        std::vector<KernelBox>
        filter(TensorRefs inputs, TensorRefs outputs) const final;
    };
}// namespace refactor::kernel

#endif// KERNEL_PAD_H

src/04kernel/src/attributes/pad_info.cc

+74

@@ -0,0 +1,74 @@
#include "kernel/attributes/pad_info.h"
#include <numeric>

namespace refactor::kernel {
    using PI = PadInfo;

    PI::PadInfo(decltype(dims) dims_, dim_t blockCount_, dim_t blockSize_) noexcept
        : dims(std::move(dims_)), blockCount(blockCount_), blockSize(blockSize_) {}

    PI::PadInfo(PadDimension dims_, Tensor const &input) : dims{}, blockCount(1),
                                                           blockSize(input.dataType.size()) {
        size_t rank = input.rank();
        ASSERT(dims_.size() == rank, "Invalid to get PadInfo.");

        // drop axes that are neither padded nor larger than 1
        size_t j = 0;
        for (auto i : range0_(rank)) {
            if (dims_[i].dimI != dims_[i].dimO || dims_[i].dimI != 1) {
                if (j < i) { dims_[j] = dims_[i]; }
                j++;
            }
        }
        dims_.resize(rank = j);

        // fold trailing contiguous (unpadded) axes into the block
        for (auto i : range0_(rank).rev()) {
            if (auto d = dims_[i].dimI; d == dims_[i].dimO) {
                blockSize *= d;
                dims_.pop_back();
            } else {
                auto &dim = dims_[i];
                if (auto times = std::gcd(std::gcd(dim.dimI, dim.pads), dim.dimO); times > 1) {
                    blockSize *= times;
                    dim.dimI /= times;
                    dim.dimO /= times;
                    dim.pads /= times;
                }
                break;
            }
        }
        dims.reserve(rank = dims_.size());

        // accumulate strides from the innermost axis outwards
        dim_t strideI = 1, strideO = 1;
        for (auto i : range0_(rank).rev()) {
            auto const &dim = dims_[i];
            dims.push_back({
                strideI,
                strideO,
                static_cast<dim_t>(dim.pads),
                static_cast<dim_t>(dim.dimI),
            });
            strideI *= dim.dimI;
            strideO *= dim.dimO;
        }
        std::reverse(dims.begin(), dims.end());
        blockCount = strideO;
    }

    void PI::reform(dim_t maxblockSize) noexcept {
        auto blockSize_ = std::gcd(blockSize, maxblockSize);
        if (blockSize_ == blockSize) { return; }
        // split each block into t smaller blocks and append a new innermost axis
        auto t = blockSize / blockSize_;
        blockCount *= t;
        blockSize = blockSize_;
        for (auto &d : dims) {
            d.strideI *= t;
            d.strideO *= t;
            d.padS *= t;
            d.dimI *= t;
        }
        dims.resize(dims.size() + 1);
        dims.back() = {1, 1, 0, t};
    }

}// namespace refactor::kernel
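
A worked case for the gcd fold in the second constructor (numbers are illustrative, not from the commit): padding an f32 tensor of shape 2x4 to 2x8 with two leading pad elements gives dims_ = {{2,2,0},{4,8,2}}. The trailing axis is padded, but gcd(4, 2, 8) = 2, so a factor of 2 folds into the block: blockSize grows from 4 to 8 bytes and the axis shrinks to {2,4,1}. The strides pass then yields dims = {{2,4,0,2},{1,1,1,2}} with blockCount = 8. A self-contained check of just the fold step:

#include <cassert>
#include <numeric>

int main() {
    long dimI = 4, dimO = 8, pads = 2, blockSize = 4;// one f32 per block
    // same fold as the constructor: divide out the common factor, grow the block
    if (auto times = std::gcd(std::gcd(dimI, pads), dimO); times > 1) {
        blockSize *= times;
        dimI /= times;
        dimO /= times;
        pads /= times;
    }
    assert(blockSize == 8 && dimI == 2 && dimO == 4 && pads == 1);
}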

src/04kernel/src/attributes/slice_info.cc

+1-1
@@ -46,7 +46,6 @@ namespace refactor::kernel {
                 shape.pop_back();
                 dims_.pop_back();
             } else {
-                dims.resize(rank = shape.size());
                 if (auto &dim = dims_[i]; dim.step == 1) {
                     if (auto times = std::gcd(std::gcd(dim.start, dim.length), shape[i]); times > 1) {
                         blockSize *= times;
@@ -58,6 +57,7 @@ namespace refactor::kernel {
                 break;
             }
         }
+        dims.resize(rank = shape.size());
         dim_t strideI = 1;
         for (auto i : range0_(rank).rev()) {
             auto const &dim = dims_[i];

src/04kernel/src/collectors/pad.cc

+32
@@ -0,0 +1,32 @@
#include "kernel/collectors/pad.h"
#include "../kernels/pad/cpu_kernel.hh"
#include "../kernels/pad/cuda_kernel.hh"

namespace refactor::kernel {

    std::vector<KernelBox>
    PadCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
        auto const &input = inputs[0];
        PadInfo info(dims, input);
        auto const_value = inputs.size() >= 3 ? std::make_optional(inputs[2]) : std::nullopt;

        std::vector<KernelBox> ans;
        switch (_target) {
            case decltype(_target)::Cpu:
                if (auto ptr = PadCpu::build(std::move(info), mode, const_value); ptr) {
                    ans.emplace_back(std::move(ptr));
                }
                break;
            case decltype(_target)::Nvidia:
                if (auto ptr = PadCuda::build(std::move(info), mode, const_value); ptr) {
                    ans.emplace_back(std::move(ptr));
                }
                break;
            default:
                UNREACHABLEX(void, "Unknown target");
        }
        return ans;
    }

}// namespace refactor::kernel

src/04kernel/src/kernels/pad/cpu_kernel.cc

+66

@@ -0,0 +1,66 @@
#include "cpu_kernel.hh"
#include <cstring>
#include <execution>

namespace refactor::kernel {
    using K = PadCpu;

    K::PadCpu(PadInfo info_, PadType mode_, size_t value_) noexcept
        : Kernel(), info(std::move(info_)), mode(mode_), valueLength(value_) {}

    auto K::build(PadInfo info, PadType mode, std::optional<std::reference_wrapper<Tensor const>> value_) noexcept -> KernelBox {
        if (mode != PadType::Constant) {
            return nullptr;
        }
        size_t value = value_ ? value_->get().dataType.size() : 0;
        return std::make_unique<K>(std::move(info), mode, value);
    }
    auto K::typeId() noexcept -> size_t {
        static uint8_t ID = 1;
        return reinterpret_cast<size_t>(&ID);
    }
    auto K::kernelTypeId() const noexcept -> size_t {
        return typeId();
    }
    auto K::description() const noexcept -> std::string_view {
        return "Performing pad operation on generic cpu";
    }

    auto K::lower(Resources &) const noexcept -> RoutineWorkspace {
        using namespace runtime;

        return [info = this->info, value = this->valueLength](Resources &, void *workspace, void const *const *inputs, void *const *outputs) {
            auto src = reinterpret_cast<uint8_t const *>(inputs[0]);
            auto dst = reinterpret_cast<uint8_t *>(outputs[0]);
            // tile the constant pad value across one block so a pad block is a single memcpy
            std::vector<uint8_t> defaultValue(info.blockSize, 0);
            if (value != 0) {
                auto constValue = reinterpret_cast<uint8_t const *>(inputs[2]);
                for (auto i : range0_(info.blockSize / value)) {
                    std::memcpy(defaultValue.data() + i * value, constValue, value);
                }
            }
            std::for_each_n(std::execution::par_unseq,
                            natural_t(0), info.blockCount,
                            [=, &info](auto i) {
                                // map output block i back to an input block, or detect a pad block
                                long rem = i, j = 0;
                                bool flag = false;
                                for (auto const &dim : info.dims) {
                                    auto pos = rem / dim.strideO - dim.padS;
                                    if (pos < 0 || pos >= dim.dimI) {
                                        flag = true;
                                        break;
                                    }
                                    j += pos * dim.strideI;
                                    rem %= dim.strideO;
                                }
                                if (flag) {
                                    std::memcpy(dst + i * info.blockSize, defaultValue.data(), info.blockSize);
                                } else {
                                    std::memcpy(dst + i * info.blockSize, src + j * info.blockSize, info.blockSize);
                                }
                            });
        };
    }

}// namespace refactor::kernel
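
The defaultValue buffer above replicates the scalar pad constant across one block, so every pad block can then be written with a single memcpy. A minimal standalone sketch of the same tiling (assuming a 4-byte f32 constant and a 16-byte block; the names are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    float fill = 1.5f;                // the pad constant (inputs[2] in the routine)
    std::size_t const value = sizeof(fill);// 4 bytes per copy
    std::vector<uint8_t> block(16, 0);// blockSize = 16 -> four copies
    for (std::size_t i = 0; i < block.size() / value; ++i) {
        std::memcpy(block.data() + i * value, &fill, value);
    }
    float check[4];
    std::memcpy(check, block.data(), sizeof(check));
    std::printf("%g %g %g %g\n", check[0], check[1], check[2], check[3]);// 1.5 x4
}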
