-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathgelu.cc
More file actions
71 lines (52 loc) · 2.15 KB
/
gelu.cc
File metadata and controls
71 lines (52 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
#include "../aie_kernel_utils.h"
#include "lut_based_ops.h"
#include <aie_api/aie.hpp>
#include <stdint.h>
using namespace aie;
// Element-wise GELU over a bfloat16 buffer using the tanh approximation:
//
//   gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
//
// input_vector  : source elements, consumed 16 lanes per iteration.
// output_vector : destination elements, produced 16 lanes per iteration.
// vector_size   : number of elements; the loop advances in steps of 16, so a
//                 count that is not a multiple of 16 still processes one whole
//                 final 16-lane vector.
//
// NOTE(review): AIE_LOOP_MIN_ITERATION_COUNT(64) looks like a promise to the
// compiler that the loop runs at least 64 times (vector_size >= 1024) --
// confirm against the callers.
void gelu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict output_vector, const int32_t vector_size)
{
    constexpr int kLanes = 16;
    using vec_t = aie::vector<bfloat16, kLanes>;

    // NOTE(review): event0()/event1() presumably bracket the kernel for
    // tracing/profiling -- confirm against the AIE kernel utilities.
    event0();

    auto src = aie::begin_restrict_vector<kLanes>(static_cast<bfloat16 *>(input_vector));
    auto dst = aie::begin_restrict_vector<kLanes>(static_cast<bfloat16 *>(output_vector));

    // Scalar coefficients of the approximation, stored as bfloat16.
    const bfloat16 c_half = 0.5f;
    const bfloat16 c_one = 1.0f;
    const bfloat16 c_sqrt_2_over_pi = 0.79788456f; // approx. sqrt(2/pi)
    const bfloat16 c_cubic = 0.044715f;

    // Loop-invariant 16-lane splats of the coefficients.
    const auto half_v = aie::broadcast<bfloat16, kLanes>(c_half);
    const auto one_v = aie::broadcast<bfloat16, kLanes>(c_one);
    const auto scale_v = aie::broadcast<bfloat16, kLanes>(c_sqrt_2_over_pi);
    const auto cubic_v = aie::broadcast<bfloat16, kLanes>(c_cubic);

    AIE_PREPARE_FOR_PIPELINING
    AIE_LOOP_MIN_ITERATION_COUNT(64)
    for (int offset = 0; offset < vector_size; offset += kLanes) {
        const vec_t x = *src++;

        // Build x^3 in two steps; each product is stored back as a bfloat16 vector.
        vec_t x_squared = aie::mul(x, x);
        vec_t x_cubed = aie::mul(x, x_squared);

        // tanh argument: sqrt(2/pi) * (x + 0.044715 * x^3)
        vec_t cubic_term = aie::mul(x_cubed, cubic_v);
        vec_t poly = aie::add(x, cubic_term);
        auto tanh_arg = aie::mul(poly, scale_v);

        // getTanhBf16 comes from lut_based_ops.h and takes/returns a bfloat16 vector.
        vec_t tanh_val = getTanhBf16(tanh_arg.to_vector<bfloat16>());

        // gate = 0.5 * (1 + tanh(...)); the output is x * gate.
        vec_t shifted = aie::add(tanh_val, one_v);
        vec_t gate = aie::mul(half_v, shifted);
        auto gelu = aie::mul(x, gate);

        *dst++ = gelu.to_vector<bfloat16>();
    }

    event1();
}
// C-linkage entry point for the GELU kernel: forwards the buffers and the
// element count unchanged to gelu_tanh_approx_bf16.
extern "C" void gelu_bf16(bfloat16 *restrict input, bfloat16 *restrict output, int input_size)
{
    gelu_tanh_approx_bf16(input, output, input_size);
}