Skip to content

Commit

Permalink
sve - initial SVE backend framework
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Sep 23, 2024
1 parent 3156c35 commit 3475cec
Show file tree
Hide file tree
Showing 8 changed files with 421 additions and 0 deletions.
13 changes: 13 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
opt.c := $(sort $(wildcard backends/opt/*.c))
avx.c := $(sort $(wildcard backends/avx/*.c))
sve.c := $(sort $(wildcard backends/sve/*.c))
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
cuda.c := $(sort $(wildcard backends/cuda/*.c))
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
Expand Down Expand Up @@ -338,6 +339,7 @@ info:
$(info ------------------------------------)
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
Expand Down Expand Up @@ -391,6 +393,17 @@ ifneq ($(AVX),)
BACKENDS_MAKE += $(AVX_BACKENDS)
endif

# SVE Backends
SVE_STATUS = Disabled
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
SVE := $(filter $(SVE_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
ifneq ($(SVE),)
SVE_STATUS = Enabled
libceed.c += $(sve.c)
BACKENDS_MAKE += $(SVE_BACKENDS)
endif

# Collect list of libraries and paths for use in linking and pkg-config
PKG_LIBS =
# Stubs that will not be RPATH'd
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ There are multiple supported backends, which can be selected at runtime in the e
| `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
| `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
| `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
||
| **CPU Valgrind** |
| `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
Expand Down Expand Up @@ -200,6 +202,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t

The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.

The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.

The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](https://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.
Expand Down
2 changes: 2 additions & 0 deletions backends/ceed-backend-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,7 @@ CEED_BACKEND(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
CEED_BACKEND(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
CEED_BACKEND(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
CEED_BACKEND(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
CEED_BACKEND(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
CEED_BACKEND(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
CEED_BACKEND(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
CEED_BACKEND(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
41 changes: 41 additions & 0 deletions backends/sve/ceed-sve-blocked.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
Ceed ceed_ref;

CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked"), ceed,
CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched through unless overridden
CeedCallBackend(CeedInit("/cpu/self/opt/blocked", &ceed_ref));
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
//------------------------------------------------------------------------------
42 changes: 42 additions & 0 deletions backends/sve/ceed-sve-serial.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
Ceed ceed_ref;

CeedCheck(!strcmp(resource, "/cpu/self") || !strcmp(resource, "/cpu/self/sve/serial"), ceed, CEED_ERROR_BACKEND,
"SVE backend cannot use resource: %s", resource);
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched through unless overridden
CeedCallBackend(CeedInit("/cpu/self/opt/serial", &ceed_ref));
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }

//------------------------------------------------------------------------------
153 changes: 153 additions & 0 deletions backends/sve/ceed-sve-tensor-f32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <ceed/backend.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#include <stdbool.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;

if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];

// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}

// Remainder of rows
const CeedInt j = (J / JJ) * JJ;

if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];

// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;

if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}

const CeedInt j = (J / JJ) * JJ;

if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract - Common Sizes
//------------------------------------------------------------------------------
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}

//------------------------------------------------------------------------------
// Tensor Contract Apply
//------------------------------------------------------------------------------
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
if (!add) {
for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
}

if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract Create
//------------------------------------------------------------------------------
int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
Ceed ceed;

CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));
CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));
return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
Loading

0 comments on commit 3475cec

Please sign in to comment.