diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index a1ac12acd..02ee913a6 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -18,16 +18,17 @@ jobs: uses: actions/checkout@v2 with: path: PLSSVM - + # install dependencies - name: Dependancies run: sudo apt-get install -y doxygen graphviz - + # configure project via CMake - name: Configure run: | cd PLSSVM mkdir build cd build - cmake -DPLSSVM_TARGET_PLATFORMS=cpu -DPLSSVM_ENABLE_DOCUMENTATION=ON -DPLSSVM_GENERATE_TEST_FILE=OFF PLSSVM_ENABLE_TESTING=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=OFF -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF .. + cmake -DPLSSVM_TARGET_PLATFORMS=cpu -DPLSSVM_ENABLE_DOCUMENTATION=ON -DPLSSVM_ENABLE_TESTING=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=OFF -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF .. + # build project - name: Generate run: | cd PLSSVM/build diff --git a/.github/workflows/msvc_windows.yml b/.github/workflows/msvc_windows.yml new file mode 100644 index 000000000..9c108298f --- /dev/null +++ b/.github/workflows/msvc_windows.yml @@ -0,0 +1,29 @@ +name: Windows CPU +on: push +jobs: + MSVC-Test: + runs-on: windows-latest + steps: + - name: "Install MSVC 14.29" + uses: ilammy/msvc-dev-cmd@v1.9.0 + with: + toolset: 14.29 + - name: "Install cmake 3.20.1" + uses: lukka/get-cmake@v3.20.1 + - name: "Clone Step" + uses: actions/checkout@v2 + with: + path: PLSSVM + - name: "CMake Step" + run: | + mkdir PLSSVM/build + cd PLSSVM/build + cmake -DCMAKE_BUILD_TYPE=Debug -DPLSSVM_TARGET_PLATFORMS="cpu" -DPLSSVM_ENABLE_TESTING=ON -DPLSSVM_GENERATE_TEST_FILE=OFF -DPLSSVM_ENABLE_LTO=OFF -DPLSSVM_ENABLE_ASSERTS=ON .. + - name: "Build Step" + run: | + cd PLSSVM/build + cmake --build . + - name: "Test Step" + run: | + cd PLSSVM/build + ctest -C Debug -V \ No newline at end of file diff --git a/.jenkins/Jenkinsfile-multigpu-tests b/.jenkins/Jenkinsfile-multigpu-tests new file mode 100644 index 000000000..b961898ba --- /dev/null +++ b/.jenkins/Jenkinsfile-multigpu-tests @@ -0,0 +1,307 @@ +#!groovy + +def buildbadge = addEmbeddableBadgeConfiguration(id: "Jenkins", subject: "Jenkins Tests", status: "skipped") + +if (currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) { + print "INFO: Build on ${env.BRANCH_NAME}/${env.BUILD_NUMBER} triggered by branch indexing..." + if (env.BRANCH_NAME != "master") { + if (env.BUILD_NUMBER != "1") { // Always execute first build to load this configuration and thus the triggers + print "INFO: Build on ${env.BRANCH_NAME}/${env.BUILD_NUMBER} skipped due to being triggered by Branch Indexing instead of SCM change!" + buildbadge.setStatus('skipped') + currentBuild.result = 'ABORTED' + return // early exit to avoid redundant builds + } + } +} else { + print "INFO: Build on ${env.BRANCH_NAME}/${env.BUILD_NUMBER} triggered by SCM change..." + print "Proceeding!"
+} + + +pipeline { + agent { label 'argon-fs'} + + options { + buildDiscarder( + logRotator( + daysToKeepStr: "21", + numToKeepStr: "50", + artifactDaysToKeepStr: "21", + artifactNumToKeepStr: "50" + ) + ) + } + + triggers { + githubPush() // Trigger by push to respective github branch + pollSCM 'H/15 * * * *' // Fallback polling solution as some pushes are somehow lost + } + + environment { + GITHUB_TOKEN = credentials('GITHUB_TOKEN') + BRANCH_NAME = "${env.BRANCH_NAME}" + } + + stages { + stage('init') { + steps { + sh ''' + gitlab_token=$(echo ${GITHUB_TOKEN} | cut -f2 -d':') + curl --verbose\ + --request POST \ + --url "https://api.github.com/repos/SC-SGS/PLSSVM/statuses/$GIT_COMMIT" \ + --header "Content-Type: application/json" \ + --header "authorization: Bearer ${gitlab_token}" \ + --data "{ + \\"state\\": \\"pending\\", + \\"context\\": \\"jenkins-ctest-multigpu\\", + \\"description\\": \\"Jenkins CI Job: jenkins-ctest-multigpu\\", + \\"target_url\\": \\"https://simsgs.informatik.uni-stuttgart.de/jenkins/view/PLSSVM/job/PLSSVM/job/Github-Multigpu/job/${BRANCH_NAME}/$BUILD_NUMBER\\" + }" + ''' + } + } + stage('checkout') { + steps { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 03:00:00 -D /scratch bash -c "\ + cd /scratch && \ + mkdir -p jenkins && cd jenkins; \ + if [[ ! -d spack ]]; then git clone --depth 1 https://github.com/spack/spack.git;fi; \ + source spack/share/spack/setup-env.sh && \ + spack compiler find && \ + spack install cmake@3.20.2 && \ + spack load cmake@3.20.2 && echo 'Successfully installed/loaded spack cmake'" + ''' + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 0:05:00 -D /scratch bash -c "\ + cd /scratch && \ + mkdir -p jenkins && cd jenkins; \ + mkdir -p plssvm/${BUILD_TAG} && cd plssvm/${BUILD_TAG} && \ + rm -rf PLSSVM && \ + git clone git@github.com:SC-SGS/PLSSVM.git PLSSVM && \ + cd PLSSVM && \ + pwd && \ + git checkout ${GIT_COMMIT}" + ''' + sh ''' + mkdir ${BUILD_TAG} + ''' + } + } + stage('setup python'){ + steps{ + sh ''' + /usr/bin/python3.8 -m pip install --user arff + /usr/bin/python3.8 -m pip install --user pandas + /usr/bin/python3.8 -m pip install --user sklearn + /usr/bin/python3.8 -m pip install --user argparse + ''' + } + } + stage('build and test'){ + parallel { + stage('OpenMP, OpenCL, CUDA'){ + stages{ + stage('build plssvm Release') { + steps { + dir('plssvm') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 00:05:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM bash -c "\ + source /scratch/jenkins/spack/share/spack/setup-env.sh && spack load cmake@3.20.2 &&\ + module load cuda &&\ + mkdir -p build/Release &&\ + cd build/Release &&\ + rm -rf * &&\ + cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS='cpu;nvidia:61' -DPLSSVM_ENABLE_ASSERTS=ON -S ../../ &&\ + make -j4" + ''' + } + } + } + stage('run tests Release') { + steps { + dir('plssvm') { + warnError('Release tests failed!') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 01:00:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM --gres=gpu:2 bash -c "\ + module load cuda &&\ + cd build/Release &&\ + ctest -j4 --no-compress-output -T Test --timeout 1200; \ + returncode=$? 
&& \ + cp -r Testing /data/argon-fs/sgs/jenkins/workspace/$(basename ${WORKSPACE})/${BUILD_TAG}/Testing &&\ + exit $returncode" + ''' + } + } + } + } + } + } + stage('hipSYCL'){ + stages{ + stage('build plssvm hipSYCL Release') { + steps { + dir('plssvm') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 00:05:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM bash -c "\ + source /scratch/jenkins/spack/share/spack/setup-env.sh && spack load cmake@3.20.2 &&\ + module load cuda &&\ + module use /home/breyerml/.modulefiles/ &&\ + module load pcsgs05/hipsycl &&\ + mkdir -p build/Release_hip &&\ + cd build/Release_hip &&\ + rm -rf * &&\ + cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS='cpu;nvidia:sm_61' -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=OFF -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON -S ../../ &&\ + make -j4 " + ''' + } + } + } + stage('run tests hipSYCL Release') { + steps { + dir('plssvm') { + warnError('hipSYCL Release tests failed!') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 01:00:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM --gres=gpu:2 bash -c "\ + module load cuda &&\ + module use /home/breyerml/.modulefiles/ &&\ + module load pcsgs05/hipsycl &&\ + cd build/Release_hip &&\ + ctest -j4 --no-compress-output -T Test --timeout 1200; \ + returncode=$? && \ + cp -r Testing /data/argon-fs/sgs/jenkins/workspace/$(basename ${WORKSPACE})/${BUILD_TAG}/Testing_hip && \ + exit $returncode" + ''' + } + } + } + } + } + } + stage('DPC++'){ + stages{ + stage('build plssvm DPC++ Release') { + steps { + dir('plssvm') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 00:05:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM bash -c "\ + source /scratch/jenkins/spack/share/spack/setup-env.sh && spack load cmake@3.20.2 &&\ + module load cuda &&\ + module use /home/breyerml/.modulefiles/ &&\ + module load pcsgs05/dpcpp_rt &&\ + mkdir -p build/Release_dpcpp &&\ + cd build/Release_dpcpp &&\ + rm -rf * &&\ + cmake -DCMAKE_BUILD_TYPE=Release -DPLSSVM_TARGET_PLATFORMS='cpu;nvidia:sm_61' -DCMAKE_CXX_COMPILER=clang++ -DPLSSVM_ENABLE_OPENMP_BACKEND=OFF -DPLSSVM_ENABLE_CUDA_BACKEND=ON -DPLSSVM_ENABLE_OPENCL_BACKEND=OFF -DPLSSVM_ENABLE_SYCL_BACKEND=ON -DPLSSVM_ENABLE_ASSERTS=ON -DPLSSVM_ENABLE_LTO=OFF -S ../../ &&\ + make -j4 " + ''' + } + } + } + stage('run tests DPC++ Release') { + steps { + dir('plssvm') { + warnError('DPC++ Release tests failed!') { + sh ''' + srun -w argon-gtx -N 1 -n 1 -t 01:00:00 -D /scratch/jenkins/plssvm/${BUILD_TAG}/PLSSVM --gres=gpu:2 bash -c "\ + module load cuda &&\ + module use /home/breyerml/.modulefiles/ &&\ + module load pcsgs05/dpcpp_rt &&\ + cd build/Release_dpcpp &&\ + ctest -j4 --no-compress-output -T Test --timeout 1200; \ + returncode=$? 
&& \ + cp -r Testing /data/argon-fs/sgs/jenkins/workspace/$(basename ${WORKSPACE})/${BUILD_TAG}/Testing_dpcpp &&\ + exit $returncode" + ''' + } + } + } + } + } + } + } + } + } + post { + always { + // Process the CTest xml output with the xUnit plugin + xunit ( + testTimeMargin: '3000', + thresholdMode: 1, + thresholds: [ + skipped(failureThreshold: '0'), + failed(failureThreshold: '0') + ], + tools: [CTest( + pattern: '${BUILD_TAG}/Testing*/**/*.xml', + deleteOutputFiles: true, + failIfNotNew: false, + skipNoTestFiles: true, + stopProcessingIfError: true + )] + ) + sh ''' + srun -w argon-gtx -n 1 -t 00:05:00 bash -c "rm -rf /data/scratch/jenkins/plssvm/${BUILD_TAG}" + rm -rf ${BUILD_TAG} + ''' + } + success { + script { + buildbadge.setStatus('success') + } + sh ''' + gitlab_token=$(echo ${GITHUB_TOKEN} | cut -f2 -d':') + curl --verbose\ + --request POST \ + --url "https://api.github.com/repos/SC-SGS/PLSSVM/statuses/$GIT_COMMIT" \ + --header "Content-Type: application/json" \ + --header "authorization: Bearer ${gitlab_token}" \ + --data "{ + \\"state\\": \\"success\\", + \\"context\\": \\"jenkins-ctest-multigpu\\", + \\"description\\": \\"Jenkins CI Job: jenkins-ctest-multigpu\\", + \\"target_url\\": \\"https://simsgs.informatik.uni-stuttgart.de/jenkins/view/PLSSVM/job/PLSSVM/job/Github-Multigpu/job/${BRANCH_NAME}/$BUILD_NUMBER\\" + }" + ''' + } + failure { + script { + buildbadge.setStatus('failing') + } + sh ''' + gitlab_token=$(echo ${GITHUB_TOKEN} | cut -f2 -d':') + curl --verbose\ + --request POST \ + --url "https://api.github.com/repos/SC-SGS/PLSSVM/statuses/$GIT_COMMIT" \ + --header "Content-Type: application/json" \ + --header "authorization: Bearer ${gitlab_token}" \ + --data "{ + \\"state\\": \\"failure\\", + \\"context\\": \\"jenkins-ctest-multigpu\\", + \\"description\\": \\"Jenkins CI Job: jenkins-ctest-multigpu\\", + \\"target_url\\": \\"https://simsgs.informatik.uni-stuttgart.de/jenkins/view/PLSSVM/job/PLSSVM/job/Github-Multigpu/job/${BRANCH_NAME}/$BUILD_NUMBER\\" + }" + ''' + } + aborted { + script { + buildbadge.setStatus('aborted') + } + sh ''' + gitlab_token=$(echo ${GITHUB_TOKEN} | cut -f2 -d':') + curl --verbose\ + --request POST \ + --url "https://api.github.com/repos/SC-SGS/PLSSVM/statuses/$GIT_COMMIT" \ + --header "Content-Type: application/json" \ + --header "authorization: Bearer ${gitlab_token}" \ + --data "{ + \\"state\\": \\"error\\", + \\"context\\": \\"jenkins-ctest-multigpu\\", + \\"description\\": \\"Jenkins CI Job: jenkins-ctest-multigpu\\", + \\"target_url\\": \\"https://simsgs.informatik.uni-stuttgart.de/jenkins/view/PLSSVM/job/PLSSVM/job/Github-Multigpu/job/${BRANCH_NAME}/$BUILD_NUMBER\\" + }" + ''' + } + } +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 10c686451..5efc626e2 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.18) -project(ParallelLeastSquaresSupportVectorMachine +project("PLSSVM - Parallel Least-Squares Support Vector Machine" VERSION 1.0.0 LANGUAGES CXX DESCRIPTION "A Support Vector Machine implementation using different backends.") @@ -27,11 +27,18 @@ configure_file( ######################################################################################################################## ## set base sources set(PLSSVM_BASE_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/execution_range.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/file_reader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/detail/string_utility.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/exceptions/exceptions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/exceptions/source_location.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/backend_types.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/kernel_types.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/csvm.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/csvm_IO.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_predict.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/parameter_train.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/plssvm/target_platforms.cpp ) ## create base library: linked against all backend libraries @@ -50,24 +57,34 @@ target_include_directories(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include> ) -# set library cxx standard +## set library cxx standard target_compile_features(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC cxx_std_17) ## additional base library compile options -target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC -Wall -Wextra) -#target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC -Wdouble-promotion -fno-common -Wconversion -Wshadow) # TODO: make warning free +target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC + $<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang>:-Wall -Wextra -Wdouble-promotion -fno-common -Wshadow -Wcast-qual -Wnull-dereference -Wctor-dtor-privacy -Wnon-virtual-dtor -Wsuggest-override -Wextra-semi -Wunreachable-code -Wuninitialized> + $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wstrict-null-sentinel -Wlogical-op -Wduplicated-branches -Wimplicit-fallthrough=5> + $<$<COMPILE_LANG_AND_ID:CXX,Clang>:-Wdocumentation -Wmost> + $<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/W4> + ) ## nvcc doesn't recognize -Werror=??? option so only set it when using a CXX compiler -target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Werror=switch -fstrict-enums>) +target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC + $<$<COMPILE_LANGUAGE:CXX>:-Werror=switch -fstrict-enums> + $<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/we4061> + ) ## enable additional optimization flags only in RELEASE mode -target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC $<$<CONFIG:Release>:-ffast-math -march=native>) +target_compile_options(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC + $<$<AND:$<CONFIG:Release>,$<COMPILE_LANGUAGE:CXX>>:-ffast-math -march=native> + $<$<AND:$<CONFIG:Release>,$<COMPILE_LANG_AND_ID:CXX,MSVC>>:/fp:fast> + ) ######################################################################################################################## ## check for optional and necessary dependencies ## ######################################################################################################################## # check for OpenMP (not for the backend!)
-find_package(OpenMP QUIET) -if(OPENMP_FOUND) - message(STATUS "Found OpenMP to speed up file parsing.") +find_package(OpenMP 4.0 QUIET) +if(OpenMP_FOUND) + message(STATUS "Found OpenMP ${OpenMP_CXX_VERSION} to speed up file parsing.") target_link_libraries(${PLSSVM_BASE_LIBRARY_NAME} PUBLIC OpenMP::OpenMP_CXX) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler '-fopenmp'") endif() @@ -328,7 +345,7 @@ if(DEFINED PLSSVM_THREAD_BLOCK_SIZE) if (PLSSVM_THREAD_BLOCK_SIZE MATCHES "^[0-9]+$" AND PLSSVM_THREAD_BLOCK_SIZE GREATER 0) message(STATUS "Set THREAD_BLOCK_SIZE to ${PLSSVM_THREAD_BLOCK_SIZE}.") # add target definition - target_compile_definitions(${PLSSVM_EXECUTABLE_NAME} PUBLIC PLSSVM_THREAD_BLOCK_SIZE=${PLSSVM_THREAD_BLOCK_SIZE}) + target_compile_definitions(${PLSSVM_ALL_LIBRARY_NAME} PUBLIC PLSSVM_THREAD_BLOCK_SIZE=${PLSSVM_THREAD_BLOCK_SIZE}) else() message(FATAL_ERROR "PLSSVM_THREAD_BLOCK_SIZE must be an integer greater than 0 but is \"${PLSSVM_THREAD_BLOCK_SIZE}\"!") endif() @@ -342,12 +359,33 @@ if(DEFINED PLSSVM_INTERNAL_BLOCK_SIZE) if (PLSSVM_INTERNAL_BLOCK_SIZE MATCHES "^[0-9]+$" AND PLSSVM_INTERNAL_BLOCK_SIZE GREATER 0) message(STATUS "Set INTERNAL_BLOCK_SIZE to ${PLSSVM_INTERNAL_BLOCK_SIZE}.") # add target definition - target_compile_definitions(${PLSSVM_EXECUTABLE_NAME} PUBLIC PLSSVM_INTERNAL_BLOCK_SIZE=${PLSSVM_INTERNAL_BLOCK_SIZE}) + target_compile_definitions(${PLSSVM_ALL_LIBRARY_NAME} PUBLIC PLSSVM_INTERNAL_BLOCK_SIZE=${PLSSVM_INTERNAL_BLOCK_SIZE}) else() message(FATAL_ERROR "PLSSVM_INTERNAL_BLOCK_SIZE must be an integer greater than 0 but is \"${PLSSVM_INTERNAL_BLOCK_SIZE}\"!") endif() endif() +## set specific internal block sizes if requested +if(DEFINED ENV{PLSSVM_OPENMP_BLOCK_SIZE}) + set(PLSSVM_OPENMP_BLOCK_SIZE $ENV{PLSSVM_OPENMP_BLOCK_SIZE} CACHE STRING "The used block size for the OpenMP kernel." FORCE) +endif() +if(DEFINED PLSSVM_OPENMP_BLOCK_SIZE) + if (PLSSVM_OPENMP_BLOCK_SIZE MATCHES "^[0-9]+$" AND PLSSVM_OPENMP_BLOCK_SIZE GREATER 0) + message(STATUS "Set PLSSVM_OPENMP_BLOCK_SIZE to ${PLSSVM_OPENMP_BLOCK_SIZE}.") + # add target definition + target_compile_definitions(${PLSSVM_ALL_LIBRARY_NAME} PUBLIC PLSSVM_OPENMP_BLOCK_SIZE=${PLSSVM_OPENMP_BLOCK_SIZE}) + else() + message(FATAL_ERROR "PLSSVM_OPENMP_BLOCK_SIZE must be an integer greater than 0 but is \"${PLSSVM_OPENMP_BLOCK_SIZE}\"!") + endif() +endif() + +## change executable floating point precision from double to single +option(PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION "Build the svm-train and svm-predict executables with single precision instead of double precision."
OFF) +if(PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION) + message(STATUS "Using single precision floating point numbers for svm-train and svm-predict.") + target_compile_definitions(${PLSSVM_EXECUTABLE_TRAIN_NAME} PRIVATE PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION) + target_compile_definitions(${PLSSVM_EXECUTABLE_PREDICT_NAME} PRIVATE PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION) +endif() ######################################################################################################################## ## check for Link Time Optimization ## @@ -386,6 +424,20 @@ if(PLSSVM_ENABLE_TESTING) endif() + +######################################################################################################################## +## enable timing generation via a script ## +######################################################################################################################## +option(PLSSVM_GENERATE_TIMING_SCRIPT "Generate a timing script used for performance measurement." OFF) +if(PLSSVM_GENERATE_TIMING_SCRIPT) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/time.sh.in + ${CMAKE_BINARY_DIR}/time.sh + @ONLY + ) +endif() + + ######################################################################################################################## ## print short (backend) summary ## ######################################################################################################################## @@ -453,19 +505,3 @@ install(FILES "${PROJECT_BINARY_DIR}/plssvmConfig.cmake" "${PROJECT_BINARY_DIR}/plssvmConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/plssvm/cmake ) - - - - - -# TODO: separate repo? -## generate scripts -option(PLSSVM_GENERATE_TIMINGSCRIPT "Generate script for timings." ON) -if(PLSSVM_GENERATE_TIMINGSCRIPT) - configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/time.sh.in - ${CMAKE_BINARY_DIR}/time.sh - @ONLY - ) -endif() - diff --git a/README.md b/README.md index 33ff323f9..8140bcfef 100644 --- a/README.md +++ b/README.md @@ -2,44 +2,39 @@ Implementation of a parallel [least-squares support-vector machine](https://en.wikipedia.org/wiki/Least-squares_support-vector_machine) using multiple different backends. The currently available backends are: - -- [OpenMP](https://www.openmp.org/) -- [CUDA](https://developer.nvidia.com/cuda-zone) -- [OpenCL](https://www.khronos.org/opencl/) -- [SYCL](https://www.khronos.org/sycl/) + - [OpenMP](https://www.openmp.org/) + - [CUDA](https://developer.nvidia.com/cuda-zone) + - [OpenCL](https://www.khronos.org/opencl/) + - [SYCL](https://www.khronos.org/sycl/) ## Getting Started ### Dependencies -General dependencies: - -- a C++17 capable compiler (e.g. [`gcc`](https://gcc.gnu.org/) or [`clang`](https://clang.llvm.org/)) -- [CMake](https://cmake.org/) 3.18 or newer -- [cxxopts](https://github.com/jarro2783/cxxopts), [fast_float](https://github.com/fastfloat/fast_float) and [{fmt}](https://github.com/fmtlib/fmt) (all three are automatically build during the CMake configuration if they couldn't be found using the respective `find_package` call) -- [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically build during the CMake configuration if `find_package(GTest)` wasn't successful) -- [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled +General dependencies: + - a C++17 capable compiler (e.g. 
[`gcc`](https://gcc.gnu.org/) or [`clang`](https://clang.llvm.org/)) + - [CMake](https://cmake.org/) 3.18 or newer + - [cxxopts](https://github.com/jarro2783/cxxopts), [fast_float](https://github.com/fastfloat/fast_float) and [{fmt}](https://github.com/fmtlib/fmt) (all three are automatically built during the CMake configuration if they couldn't be found using the respective `find_package` call) + - [GoogleTest](https://github.com/google/googletest) if testing is enabled (automatically built during the CMake configuration if `find_package(GTest)` wasn't successful) + - [doxygen](https://www.doxygen.nl/index.html) if documentation generation is enabled + - [OpenMP](https://www.openmp.org/) 4.0 or newer (optional) to speed up file parsing Additional dependencies for the OpenMP backend: - -- compiler with OpenMP support + - compiler with OpenMP support Additional dependencies for the CUDA backend: - -- CUDA SDK -- either NVIDIA [`nvcc`](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) or [`clang` with CUDA support enabled](https://llvm.org/docs/CompileCudaWithLLVM.html) + - CUDA SDK + - either NVIDIA [`nvcc`](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) or [`clang` with CUDA support enabled](https://llvm.org/docs/CompileCudaWithLLVM.html) Additional dependencies for the OpenCL backend: - -- OpenCL runtime and header files + - OpenCL runtime and header files Additional dependencies for the SYCL backend: - -- the code must be compiled with a SYCL capable compiler; currently tested with [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL) + - the code must be compiled with a SYCL capable compiler; currently tested with [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL) Additional dependencies if `PLSSVM_ENABLE_TESTING` and `PLSSVM_GENERATE_TEST_FILE` are both set to `ON`: + - [Python3](https://www.python.org/) with the [`argparse`](https://docs.python.org/3/library/argparse.html) and [`sklearn`](https://scikit-learn.org/stable/) modules -- [Python3](https://www.python.org/) with the [`argparse`](https://docs.python.org/3/library/argparse.html) and [`sklearn`](https://scikit-learn.org/stable/) modules ### Building @@ -57,11 +52,10 @@ Building the library can be done using the normal CMake approach: The **required** CMake option `PLSSVM_TARGET_PLATFORMS` is used to determine for which targets the backends should be compiled. Valid targets are: - -- `cpu`: compile for the CPU; **no** architectural specifications is allowed -- `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g. `nvidia:sm_86,sm_70` -- `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g. `amd:gfx906` -- `intel`: compile for Intel GPUs; **no** architectural specification is allowed + - `cpu`: compile for the CPU; **no** architectural specification is allowed + - `nvidia`: compile for NVIDIA GPUs; **at least one** architectural specification is necessary, e.g. `nvidia:sm_86,sm_70` + - `amd`: compile for AMD GPUs; **at least one** architectural specification is necessary, e.g. `amd:gfx906` + - `intel`: compile for Intel GPUs; **no** architectural specification is allowed At least one of the above targets must be present.
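For example, a configuration targeting both the CPU and an NVIDIA GPU could look as follows (a minimal sketch following the build steps above; the compute capability `sm_86` is only an illustrative value and must match the actual GPU):

```bash
# configure and build for the CPU and an NVIDIA GPU with compute capability 8.6
> mkdir build && cd build
> cmake -DPLSSVM_TARGET_PLATFORMS="cpu;nvidia:sm_86" ..
> cmake --build .
```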
@@ -80,9 +74,9 @@ optional arguments: Example invocations: ```bash -> python3 utility/gpu_name_to_arch.py --name "GeForce RTX 3080" +> python3 utility_scripts/gpu_name_to_arch.py --name "GeForce RTX 3080" sm_86 -> python3 utility/gpu_name_to_arch.py --name "Radeon VII" +> python3 utility_scripts/gpu_name_to_arch.py --name "Radeon VII" gfx906 ``` @@ -90,49 +84,49 @@ If no GPU name is provided, the script tries to automatically detect any NVIDIA (requires the Python3 dependencies [`GPUtil`](https://pypi.org/project/GPUtil/) and [`pyamdgpuinfo`](https://pypi.org/project/pyamdgpuinfo/)). If the architectural information for the requested GPU could not be retrieved, one option would be to have a look at: + - for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) + - for AMD GPUs: [ROCm Documentation](https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst) -- for NVIDIA GPUs: [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) -- for AMD GPUs: [ROCm Documentation](https://github.com/RadeonOpenCompute/ROCm_Documentation/blob/master/ROCm_Compiler_SDK/ROCm-Native-ISA.rst) #### Optional CMake Options The `[optional_options]` can be one or multiple of: -- `PLSSVM_ENABLE_OPENMP_BACKEND=ON|OFF|AUTO` (default: `AUTO`): - - `ON`: check for the OpenMP backend and fail if not available - - `AUTO`: check for the OpenMP backend but **do not** fail if not available - - `OFF`: do not check for the OpenMP backend -- `PLSSVM_ENABLE_CUDA_BACKEND=ON|OFF|AUTO` (default: `AUTO`): - - `ON`: check for the CUDA backend and fail if not available - - `AUTO`: check for the CUDA backend but **do not** fail if not available - - `OFF`: do not check for the CUDA backend - -- `PLSSVM_ENABLE_OPENCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`): - - `ON`: check for the OpenCL backend and fail if not available - - `AUTO`: check for the OpenCL backend but **do not** fail if not available - - `OFF`: do not check for the OpenCL backend -- `PLSSVM_ENABLE_SYCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`): - - `ON`: check for the SYCL backend and fail if not available - - `AUTO`: check for the SYCL backend but **do not** fail if not available - - `OFF`: do not check for the SYCL backend + - `PLSSVM_ENABLE_OPENMP_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the OpenMP backend and fail if not available + - `AUTO`: check for the OpenMP backend but **do not** fail if not available + - `OFF`: do not check for the OpenMP backend + - `PLSSVM_ENABLE_CUDA_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the CUDA backend and fail if not available + - `AUTO`: check for the CUDA backend but **do not** fail if not available + - `OFF`: do not check for the CUDA backend + - `PLSSVM_ENABLE_OPENCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the OpenCL backend and fail if not available + - `AUTO`: check for the OpenCL backend but **do not** fail if not available + - `OFF`: do not check for the OpenCL backend + - `PLSSVM_ENABLE_SYCL_BACKEND=ON|OFF|AUTO` (default: `AUTO`): + - `ON`: check for the SYCL backend and fail if not available + - `AUTO`: check for the SYCL backend but **do not** fail if not available + - `OFF`: do not check for the SYCL backend **Attention:** at least one backend must be enabled and available! 
-- `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions regardless whether the `DEBUG` macro is defined or not -- `PLSSVM_THREAD_BLOCK_SIZE` (default: `16`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations) -- `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `6`: set a specific internal block size used in the GPU kernels (for fine-tuning optimizations) -- `PLSSVM_ENABLE_LTO=ON|OFF` (default: `ON`): enable interprocedural optimization (IPO/LTO) if supported by the compiler -- `PLSSVM_ENABLE_DOCUMENTATION=ON|OFF` (default: `OFF`): enable the `doc` target using doxygen -- `PLSSVM_ENABLE_TESTING=ON|OFF` (default: ON): enable testing using GoogleTest and ctest + - `PLSSVM_ENABLE_ASSERTS=ON|OFF` (default: `OFF`): enables custom assertions regardless of whether the `DEBUG` macro is defined or not + - `PLSSVM_THREAD_BLOCK_SIZE` (default: `16`): set a specific thread block size used in the GPU kernels (for fine-tuning optimizations) + - `PLSSVM_INTERNAL_BLOCK_SIZE` (default: `6`): set a specific internal block size used in the GPU kernels (for fine-tuning optimizations) + - `PLSSVM_EXECUTABLES_USE_SINGLE_PRECISION` (default: `OFF`): enables single precision calculations instead of double precision for the `svm-train` and `svm-predict` executables + - `PLSSVM_ENABLE_LTO=ON|OFF` (default: `ON`): enable interprocedural optimization (IPO/LTO) if supported by the compiler + - `PLSSVM_ENABLE_DOCUMENTATION=ON|OFF` (default: `OFF`): enable the `doc` target using doxygen + - `PLSSVM_ENABLE_TESTING=ON|OFF` (default: `ON`): enable testing using GoogleTest and ctest + - `PLSSVM_GENERATE_TIMING_SCRIPT=ON|OFF` (default: `OFF`): configure a timing script usable for performance measurement If `PLSSVM_ENABLE_TESTING` is set to `ON`, the following options can also be set: - -- `PLSSVM_GENERATE_TEST_FILE=ON|OFF` (default: `ON`): automatically generate test files - - `PLSSVM_TEST_FILE_NUM_DATA_POINTS` (default: `5000`): the number of data points in the test file - - `PLSSVM_TEST_FILE_NUM_FEATURES` (default: `2000`): the number of features per data point + - `PLSSVM_GENERATE_TEST_FILE=ON|OFF` (default: `ON`): automatically generate test files + - `PLSSVM_TEST_FILE_NUM_DATA_POINTS` (default: `5000`): the number of data points in the test file If the SYCL backend is available and DPC++ is used, the option `PLSSVM_SYCL_DPCPP_USE_LEVEL_ZERO` can be used to select Level-Zero as the DPC++ backend instead of OpenCL. +To use DPC++ as the compiler, simply set `CMAKE_CXX_COMPILER` to the respective DPC++ clang path during the CMake invocation. ### Running the tests @@ -156,6 +150,14 @@ Additionally, it's advisable to use smaller test files to shorten the `ctest` st The resulting `html` coverage report is located in the `coverage` folder in the build directory. +### Creating the documentation + +If doxygen is installed and `PLSSVM_ENABLE_DOCUMENTATION` is set to `ON`, the documentation can be built using +```bash +> make doc +``` +The documentation of the current state of the main branch can be found [here](https://vancraar.github.io/PLSSVM/). + ## Installing The library supports the `install` target: @@ -333,3 +335,8 @@ add_executable(prog main.cpp) target_compile_features(prog PUBLIC cxx_std_17) target_link_libraries(prog PUBLIC plssvm::svm-all) ``` + + +## License + +The PLSSVM library is distributed under the MIT [license](https://github.com/SC-SGS/PLSSVM/blob/main/LICENSE.md).
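As a quick usage sketch once the build has finished: the timing script in this PR invokes the training executable as `svm-train --backend <backend> -e <eps> <training_file> <model_file>`, so a run with the CUDA backend could look like the following (`train.libsvm` and the model file name are placeholders):

```bash
# train a model on train.libsvm using the CUDA backend with termination criterion e = 0.001
> ./svm-train --backend cuda -e 0.001 train.libsvm train.libsvm.model
```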
diff --git a/cmake/compile_tests/test_sycl_dpcpp.cpp b/cmake/compile_tests/test_sycl_dpcpp.cpp index 511c06867..d4ca23f21 100644 --- a/cmake/compile_tests/test_sycl_dpcpp.cpp +++ b/cmake/compile_tests/test_sycl_dpcpp.cpp @@ -11,6 +11,6 @@ #include "sycl/sycl.hpp" int main() { - const auto version = __SYCL_COMPILER_VERSION; + [[maybe_unused]] const auto version = __SYCL_COMPILER_VERSION; return 0; } \ No newline at end of file diff --git a/cmake/time.sh.in b/cmake/time.sh.in index 3fa91c06c..81ffbf697 100644 --- a/cmake/time.sh.in +++ b/cmake/time.sh.in @@ -6,29 +6,31 @@ # See the LICENSE.md file in the project root for full license information. -DATAGENERATOR="@PROJECT_SOURCE_DIR@/data/generate_data.py" -PLSSSVM_TRAIN="@CMAKE_BINARY_DIR@/svm-train" -COLLECT_RUNTIME="@PROJECT_SOURCE_DIR@/utility/collect_runtimes.py" -MANIPULATE_CSV="python3 @PROJECT_SOURCE_DIR@/utility/manipulate_csv.py" +# set paths to executables and scripts +PLSSVM_TRAIN="@CMAKE_BINARY_DIR@/svm-train" +PLSSVM_PREDICT="@CMAKE_BINARY_DIR@/svm-predict" +DATA_GENERATOR="@PROJECT_SOURCE_DIR@/data/generate_data.py" +COLLECT_RUNTIME="@PROJECT_SOURCE_DIR@/utility_scripts/collect_runtimes.py" +MANIPULATE_CSV="python3 @PROJECT_SOURCE_DIR@/utility_scripts/manipulate_csv.py" GIT_HASH=$(git log -n1 --format=format:%H) +# TODO: remove hardcoded paths THUNDER_SVM_TRAIN="/import/sgs.scratch/vancraar/thundersvm/build_pcsgs05/bin/thundersvm-train" THUNDER_SVM_PREDICT="/import/sgs.scratch/vancraar/thundersvm/build_pcsgs05/bin/thundersvm-predict" LIBSVM_TRAIN="/import/sgs.scratch/vancraar/libsvm/svm-train" LIBSVM_PREDICT="/import/sgs.scratch/vancraar/libsvm/svm-predict" - - run_backend (){ echo "start PLSSVM with backend ${1} with ${num_points} points and ${num_features} features with e=${e} ..." rm -f runtime_backend.csv T="$(date +%s%N)" - ${PLSSSVM_TRAIN} --backend $1 -e ${e} ${num_points}x${num_features}.libsvm ${num_points}x${num_features}.libsvm.${1}.model | python3 ${COLLECT_RUNTIME} --output runtime_backend.csv + ${PLSSVM_TRAIN} --backend $1 -e ${e} ${num_points}x${num_features}.libsvm ${num_points}x${num_features}.libsvm.${1}.model | python3 ${COLLECT_RUNTIME} --output runtime_backend.csv T="$(($(date +%s%N)-T))" T="$((T/1000000))" + # TODO: use own predict if [ -f "$THUNDER_SVM_PREDICT" ] then ACCURACY=$($THUNDER_SVM_PREDICT ${num_points}x${num_features}.libsvm ${num_points}x${num_features}.libsvm.${1}.model predict | grep "Accuracy" | grep -oP "=.*" | cut -c 3-) @@ -43,7 +45,7 @@ run_backend (){ rm ${num_points}x${num_features}.libsvm.$1.model ${MANIPULATE_CSV} --file runtime_backend.csv --col "git_tag" --row 0 --val "${GIT_HASH}" - ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATENGENERATOR_PARAMETER}" + ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATA_GENERATOR_PARAMETER}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "e" --row 0 --val "${e}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "build_type" --row 0 --val "@CMAKE_BUILD_TYPE@" ${MANIPULATE_CSV} --file runtime_backend.csv --col "training_file_hash" --row 0 --val "${md5}" @@ -54,6 +56,8 @@ run_backend (){ then ${MANIPULATE_CSV} --file runtime_backend.csv --col "accuracy" --row 0 --val "${ACCURACY}" fi + + # TODO: remove upload curl -f -s -X GET https://ipvs.informatik.uni-stuttgart.de/cloud/s/pYcggBo9bAJjb9B/download -o runtime.csv python3 -c "import pandas as pd; data = pd.read_csv('runtime.csv'); data = pd.concat([pd.read_csv('runtime_backend.csv'),data]); 
data.to_csv('runtime.csv', index=False)" curl -f -s -u 'ds2WBHSkws43yLy:jN9fYeRPyt' -T runtime.csv -H 'X-Requested-With: XMLHttpRequest' -X PUT 'https://ipvs.informatik.uni-stuttgart.de/cloud/public.php/webdav/' @@ -61,7 +65,7 @@ run_backend (){ } -# run datengenerator +# run data generator for problem in "planes" "blobs" do for num_points in 100 1000 10000 100000 @@ -70,8 +74,8 @@ do do if (( $num_points >= $num_features)) then - DATENGENERATOR_PARAMETER="--output ${num_points}x${num_features} --format libsvm --problem planes --samples ${num_points} --features ${num_features}" - python3 ${DATAGENERATOR} ${DATENGENERATOR_PARAMETER} + DATA_GENERATOR_PARAMETER="--output ${num_points}x${num_features} --format libsvm --problem planes --samples ${num_points} --features ${num_features}" + python3 ${DATA_GENERATOR} ${DATA_GENERATOR_PARAMETER} md5=`md5sum ${num_points}x${num_features}.libsvm | awk '{ print $1 }'` for e in "0.001" "0.0001" "0.00001" "0.000001" "0.0000001" "0.00000001" "0.000000001" "0.0000000001" "0.00000000001" "0.000000000001" @@ -102,7 +106,7 @@ do T="$((T/1000000))" rm -f runtime_backend.csv - ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATENGENERATOR_PARAMETER}" + ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATA_GENERATOR_PARAMETER}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "e" --row 0 --val "${e}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "training_file_hash" --row 0 --val "${md5}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "svm" --row 0 --val "thundersvm" @@ -123,6 +127,7 @@ do fi rm ${num_points}x${num_features}.libsvm.thundersvm.model + # TODO: remove upload curl -f -s -X GET https://ipvs.informatik.uni-stuttgart.de/cloud/s/pYcggBo9bAJjb9B/download -o runtime.csv python3 -c "import pandas as pd; data = pd.read_csv('runtime.csv'); data = pd.concat([pd.read_csv('runtime_backend.csv'),data]); data.to_csv('runtime.csv', index=False)" curl -f -s -u 'ds2WBHSkws43yLy:jN9fYeRPyt' -T runtime.csv -H 'X-Requested-With: XMLHttpRequest' -X PUT 'https://ipvs.informatik.uni-stuttgart.de/cloud/public.php/webdav/' @@ -140,7 +145,7 @@ do T="$((T/1000000))" echo " done in $((T/1000))s" rm -f runtime_backend.csv - ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATENGENERATOR_PARAMETER}" + ${MANIPULATE_CSV} --file runtime_backend.csv --col "datagenerator_parameter" --row 0 --val "${DATA_GENERATOR_PARAMETER}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "e" --row 0 --val "${e}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "training_file_hash" --row 0 --val "${md5}" ${MANIPULATE_CSV} --file runtime_backend.csv --col "svm" --row 0 --val "libsvm" @@ -161,6 +166,7 @@ do rm ${num_points}x${num_features}.libsvm.libsvm.model + # TODO: remove upload curl -f -s -X GET https://ipvs.informatik.uni-stuttgart.de/cloud/s/pYcggBo9bAJjb9B/download -o runtime.csv python3 -c "import pandas as pd; data = pd.read_csv('runtime.csv'); data = pd.concat([pd.read_csv('runtime_backend.csv'),data]); data.to_csv('runtime.csv', index=False)" curl -f -s -u 'ds2WBHSkws43yLy:jN9fYeRPyt' -T runtime.csv -H 'X-Requested-With: XMLHttpRequest' -X PUT 'https://ipvs.informatik.uni-stuttgart.de/cloud/public.php/webdav/' diff --git a/data/generate_data.py b/data/generate_data.py index c4521336c..06df410e7 100644 --- a/data/generate_data.py +++ b/data/generate_data.py @@ -1,10 +1,12 @@ -""" -@author Alexander Van Craen -@author Marcel Breyer -@copyright 
2018-today The PLSSVM project - All Rights Reserved -@license This file is part of the PLSSVM project which is released under the MIT license. - See the LICENSE.md file in the project root for full license information. -""" +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +######################################################################################################################## +# Authors: Alexander Van Craen, Marcel Breyer # +# Copyright (C): 2018-today The PLSSVM project - All Rights Reserved # +# License: This file is part of the PLSSVM project which is released under the MIT license. # +# See the LICENSE.md file in the project root for full license information. # +######################################################################################################################## import argparse @@ -12,18 +14,23 @@ from sklearn.datasets import make_classification from sklearn.datasets import make_blobs from sklearn.datasets import make_gaussian_quantiles - +from sklearn.preprocessing import minmax_scale # parse command line arguments parser = argparse.ArgumentParser() -parser.add_argument("--output", help="the output file to write the samples to (without extension)") -parser.add_argument("--format", help="the file format; either arff or libsvm", required=True) +parser.add_argument( + "--output", help="the output file to write the samples to (without extension)") +parser.add_argument( + "--format", help="the file format; either arff or libsvm", required=True) parser.add_argument("--problem", help="the problem to solve; one of: blobs, blobs_merged, planes, planes_merged, ball", default="blobs") -parser.add_argument("--samples", help="the number of training samples to generate", required=True, type=int) -parser.add_argument("--test_samples", help="the number of test samples to generate; default: 0", type=int, default=0) -parser.add_argument("--features", help="the number of features per data point", required=True, type=int) +parser.add_argument( + "--samples", help="the number of training samples to generate", required=True, type=int) +parser.add_argument( + "--test_samples", help="the number of test samples to generate; default: 0", type=int, default=0) +parser.add_argument( + "--features", help="the number of features per data point", required=True, type=int) parser.add_argument("--plot", help="plot training samples; only possible if 0 < samples <= 2000 and 1 < features <= 3", action="store_true") @@ -31,18 +38,22 @@ # check for valid command line arguments if args.samples <= 0 or args.test_samples < 0 or args.features <= 0: - raise RuntimeError("Number of samples and/or features cannot be 0 or negative!") + raise RuntimeError( + "Number of samples and/or features cannot be 0 or negative!") if args.plot and (args.samples > 2000 and (args.features != 2 or args.features != 3)): - raise RuntimeError("Invalid number of samples and/or features for plotting!") + raise RuntimeError( + "Invalid number of samples and/or features for plotting!") # set total number of samples num_samples = args.samples + args.test_samples # create labeled data set if args.problem == "blobs": - samples, labels = make_blobs(n_samples=num_samples, n_features=args.features, centers=2) + samples, labels = make_blobs( + n_samples=num_samples, n_features=args.features, centers=2) elif args.problem == "blobs_merged": - samples, labels = make_blobs(n_samples=num_samples, n_features=args.features, centers=2, cluster_std=4.0) + samples, labels = make_blobs( + n_samples=num_samples, 
n_features=args.features, centers=2, cluster_std=4.0) elif args.problem == "planes": samples, labels = make_classification(n_samples=num_samples, n_features=args.features, n_redundant=0, n_informative=2, n_clusters_per_class=1) @@ -50,16 +61,19 @@ samples, labels = make_classification(n_samples=num_samples, n_features=args.features, n_redundant=0, n_informative=args.features) elif args.problem == "ball": - samples, labels = make_gaussian_quantiles(n_samples=num_samples, n_features=args.features, n_classes=2) + samples, labels = make_gaussian_quantiles( + n_samples=num_samples, n_features=args.features, n_classes=2) else: raise RuntimeError("Invalid problem!") # map labels to -1 and 1 labels = labels * 2 - 1 +minmax_scale(samples, feature_range=[-1, 1], copy=False) # set file names -rawfile = args.output if args.output is not None else "{}x{}".format(args.sampls, args.features) +rawfile = args.output if args.output is not None else "{}x{}".format( + args.samples, args.features) if rawfile.endswith(args.format): rawfile = rawfile[:-(len(args.format)+1)] file = rawfile + "." + args.format @@ -69,9 +83,11 @@ if args.format == "libsvm": from sklearn.datasets import dump_svmlight_file # dump data in libsvm format - dump_svmlight_file(samples[:args.samples, :], labels[:args.samples], file) + dump_svmlight_file(samples[:args.samples, :], + labels[:args.samples], file) if args.test_samples > 0: - dump_svmlight_file(samples[args.samples:, :], labels[args.samples:], test_file) + dump_svmlight_file(samples[args.samples:, :], + labels[args.samples:], test_file) elif args.format == "arff": import numpy import arff @@ -87,7 +103,8 @@ def dump_arff_file(out_data, out_file, relation): # dump dataframe as arff file pd_data = pandas.DataFrame(data=out_data, columns=col_names) - arff.dump(out_file, pd_data.values, relation=relation, names=pd_data.columns) + arff.dump(out_file, pd_data.values, + relation=relation, names=pd_data.columns) # replace 'real' with 'numeric' in arff file with open(file) as f: @@ -104,7 +121,8 @@ def dump_arff_file(out_data, out_file, relation): # output info -print("Created training data set '{}' with {} data points and {} features.".format(file, args.samples, args.features)) +print("Created training data set '{}' with {} data points and {} features.".format( + file, args.samples, args.features)) if args.test_samples > 0: print("Created test data set '{}' with {} data points and {} features." 
.format(test_file, args.test_samples, args.features)) @@ -116,9 +134,11 @@ def dump_arff_file(out_data, out_file, relation): from mpl_toolkits.mplot3d import Axes3D if args.features == 2: - plt.scatter(samples[:args.samples, 0], samples[:args.samples, 1], c=labels) + plt.scatter(samples[:args.samples, 0], + samples[:args.samples, 1], c=labels) elif args.features == 3: fig = plt.figure() ax = Axes3D(fig) - ax.scatter(samples[:args.samples, 0], samples[:args.samples, 1], samples[:args.samples, 2], c=labels) + ax.scatter(samples[:args.samples, 0], samples[:args.samples, + 1], samples[:args.samples, 2], c=labels) plt.show() diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 3c1bc84e7..e756648f9 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -14,10 +14,10 @@ find_package(Doxygen REQUIRED OPTIONAL_COMPONENTS dot) set(DOXYGEN_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/docs") set(DOXYGEN_IMAGE_PATH "${PROJECT_SOURCE_DIR}/docs/resources") set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md") -set(DOXYGEN_FILE_PATTERNS "*.hpp;*.cuh") -set(DOXYGEN_EXTENSION_MAPPING "cu=c++;cuh=c++") +set(DOXYGEN_FILE_PATTERNS "*.hpp;*.cuh;*.cl;*.dox") +set(DOXYGEN_EXTENSION_MAPPING "cu=c++;cuh=c++;cl=c++") set(DOXYGEN_STRIP_FROM_PATH "${PROJECT_SOURCE_DIR}") -set(DOXYGEN_EXCLUDE "${PROJECT_SOURCE_DIR}/src/main_train.cpp") +set(DOXYGEN_EXCLUDE "${PROJECT_SOURCE_DIR}/src/main_train.cpp;${PROJECT_SOURCE_DIR}/src/main_predict.cpp") set(DOXYGEN_ABBREVIATE_BRIEF "") set(DOXYGEN_QUIET "YES") set(DOXYGEN_HTML_TIMESTAMP "YES") @@ -25,6 +25,12 @@ set(DOXYGEN_NUM_PROC_THREADS 0) set(DOXYGEN_WARN_NO_PARAMDOC "YES") set(DOXYGEN_SORT_MEMBER_DOCS "NO") +## enable processing of specific attributes and macros +set(DOXYGEN_ENABLE_PREPROCESSING "YES") +set(DOXYGEN_MACRO_EXPANSION "YES") +set(DOXYGEN_EXPAND_ONLY_PREDEF "YES") +set(DOXYGEN_PREDEFINED "__attribute__((x))=;__CUDA_ARCH__=0") + set(DOXYGEN_VERBATIM_VARS DOXYGEN_ALIASES) set(DOXYGEN_ALIASES [[license="\par License^^\parblock^^" ]] @@ -33,7 +39,7 @@ set(DOXYGEN_ALIASES ## add doxygen as target doxygen_add_docs( doc - "${PROJECT_SOURCE_DIR}/include;${PROJECT_SOURCE_DIR}/README.md" + "${PROJECT_SOURCE_DIR}/include;${PROJECT_SOURCE_DIR}/docs/resources;${PROJECT_SOURCE_DIR}/README.md" WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}" COMMENT "Generating API documentation with Doxygen" ) diff --git a/docs/resources/dirs.dox b/docs/resources/dirs.dox new file mode 100644 index 000000000..52a7904f1 --- /dev/null +++ b/docs/resources/dirs.dox @@ -0,0 +1,133 @@ +/** + * @dir include/plssvm + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation of the PLSSVM library. + */ + + +/** + * @dir include/plssvm/backends + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation of all four available backends: OpenMP, CUDA, OpenCL, and SYCL. 
+ */ + +/** + * @dir include/plssvm/backends/CUDA + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation for the CUDA backend. + */ + +/** + * @dir include/plssvm/backends/CUDA/detail + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing implementation details for the CUDA backend. + */ + +/** + * @dir include/plssvm/backends/OpenCL + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation for the OpenCL backend. + */ + +/** + * @dir include/plssvm/backends/OpenCL/detail + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing implementation details for the OpenCL backend. + */ + +/** + * @dir include/plssvm/backends/OpenMP + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation for the OpenMP backend. + */ + +/** + * @dir include/plssvm/backends/SYCL + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing the implementation for the SYCL backend. + */ + +/** + * @dir include/plssvm/backends/SYCL/detail + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing implementation details for the SYCL backend. + */ + + +/** + * @dir include/plssvm/detail + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing implementation details which **should not** be used by users. 
+ */ + +/** + * @dir include/plssvm/exceptions + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing custom exception types used to better distinguish errors. + */ + +/** + * @dir include/plssvm/version + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Directory containing compile-time constant version information. + */ \ No newline at end of file diff --git a/include/plssvm/backend_types.hpp b/include/plssvm/backend_types.hpp index 0554435bb..a8e923657 100644 --- a/include/plssvm/backend_types.hpp +++ b/include/plssvm/backend_types.hpp @@ -6,79 +6,43 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Defines all possible backends. Can also include backends not available on the target platform. + * @brief Defines all possible backends. Can also include backends not available on the current target platform. */ #pragma once -#include "plssvm/detail/string_utility.hpp" // plssvm::detail::to_lower_case - -#include "fmt/ostream.h" // use operator<< to enable fmt::format with custom type - -#include <ios> // std::ios::failbit -#include <istream> // std::istream -#include <ostream> // std::ostream -#include <string> // std::string +#include <iosfwd> // forward declare std::ostream and std::istream namespace plssvm { /** - * @brief Enum class for the different backend types. + * @brief Enum class for all possible backend types. */ enum class backend_type { - /** [OpenMP](https://www.openmp.org/) */ - openmp = 0, - /** [CUDA](https://developer.nvidia.com/cuda-zone) */ - cuda = 1, - /** [OpenCL](https://www.khronos.org/opencl/) */ - opencl = 2, - /** [SYCL](https://www.khronos.org/sycl/) */ - sycl = 3 + /** [OpenMP](https://www.openmp.org/) to target CPUs only. */ + openmp, + /** [CUDA](https://developer.nvidia.com/cuda-zone) to target NVIDIA GPUs only. */ + cuda, + /** [OpenCL](https://www.khronos.org/opencl/) to target GPUs from different vendors and CPUs. */ + opencl, + /** [SYCL](https://www.khronos.org/sycl/) to target GPUs from different vendors and CPUs. Currently tested SYCL implementations are [DPC++](https://github.com/intel/llvm) and [hipSYCL](https://github.com/illuhad/hipSYCL). */ + sycl }; /** - * @brief Stream-insertion-operator overload for convenient printing of the backend type @p backend. + * @brief Output the @p backend to the given output-stream @p out.
* @param[in,out] out the output-stream to write the backend type to * @param[in] backend the backend type * @return the output-stream */ -inline std::ostream &operator<<(std::ostream &out, const backend_type backend) { - switch (backend) { - case backend_type::openmp: - return out << "openmp"; - case backend_type::cuda: - return out << "cuda"; - case backend_type::opencl: - return out << "opencl"; - case backend_type::sycl: - return out << "sycl"; - } - return out << "unknown"; -} +std::ostream &operator<<(std::ostream &out, backend_type backend); /** - * @brief Stream-extraction-operator overload for convenient converting a string to a backend type. + * @brief Use the input-stream @p in to initialize the @p backend type. * @param[in,out] in input-stream to extract the backend type from * @param[in] backend the backend type * @return the input-stream */ -inline std::istream &operator>>(std::istream &in, backend_type &backend) { - std::string str; - in >> str; - detail::to_lower_case(str); - - if (str == "openmp") { - backend = backend_type::openmp; - } else if (str == "cuda") { - backend = backend_type::cuda; - } else if (str == "opencl") { - backend = backend_type::opencl; - } else if (str == "sycl") { - backend = backend_type::sycl; - } else { - in.setstate(std::ios::failbit); - } - return in; -} +std::istream &operator>>(std::istream &in, backend_type &backend); } // namespace plssvm diff --git a/include/plssvm/backends/CUDA/csvm.hpp b/include/plssvm/backends/CUDA/csvm.hpp index 0ff0fc473..b35a3c054 100644 --- a/include/plssvm/backends/CUDA/csvm.hpp +++ b/include/plssvm/backends/CUDA/csvm.hpp @@ -12,30 +12,38 @@ #pragma once #include "plssvm/backends/CUDA/detail/device_ptr.cuh" // plssvm::cuda::detail::device_ptr -#include "plssvm/csvm.hpp" // plssvm::csvm -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/target_platform.hpp" // plssvm::target_platform +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm -#include <vector> // std::vector +#include <cstddef> // std::size_t -namespace plssvm::cuda { +namespace plssvm { + +// forward declare parameter class +template <typename T> +class parameter; + +namespace detail { + +// forward declare execution_range class +class execution_range; + +} // namespace detail + +namespace cuda { /** - * @brief The C-SVM class using the CUDA backend. + * @brief A C-SVM implementation using CUDA as the backend. * @tparam T the type of the data */ template <typename T> -class csvm : public ::plssvm::csvm<T> { +class csvm : public ::plssvm::detail::gpu_csvm<T, ::plssvm::cuda::detail::device_ptr<T>, int> { protected: // protected for the test mock class /// The template base type of the CUDA C-SVM class. - using base_type = ::plssvm::csvm<T>; - using base_type::alpha_ptr_; - using base_type::bias_; + using base_type = ::plssvm::detail::gpu_csvm<T, ::plssvm::cuda::detail::device_ptr<T>, int>; + using base_type::coef0_; using base_type::cost_; - using base_type::data_ptr_; using base_type::degree_; using base_type::gamma_; using base_type::kernel_; @@ -44,78 +52,67 @@ class csvm : public ::plssvm::csvm<T> { using base_type::print_info_; using base_type::QA_cost_; using base_type::target_; - using base_type::w_; + + using base_type::data_d_; + using base_type::data_last_d_; + using base_type::devices_; + using base_type::num_cols_; + using base_type::num_rows_; + + using base_type::boundary_size_; + using base_type::dept_; public: /// The type of the data. Must be either `float` or `double`. using real_type = typename base_type::real_type; /// Unsigned integer type.
-    using size_type = typename base_type::size_type;
+
+    /// The type of the CUDA device pointer.
+    using device_ptr_type = ::plssvm::cuda::detail::device_ptr<real_type>;
+    /// The type of the CUDA device queue.
+    using queue_type = int;

     /**
      * @brief Construct a new C-SVM using the CUDA backend with the parameters given through @p params.
      * @param[in] params struct encapsulating all possible parameters
+     * @throws plssvm::csvm::csvm() exceptions
+     * @throws plssvm::cuda::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::gpu_nvidia
+     * @throws plssvm::cuda::backend_exception if the plssvm::target_platform::gpu_nvidia target isn't available
+     * @throws plssvm::cuda::backend_exception if no CUDA devices could be found
      */
     explicit csvm(const parameter<T> &params);

     /**
-     * @brief Wait for all operations on all devices to finish.
-     * @details Terminates the program, if any exceptions are thrown.
+     * @brief Wait for all operations on all CUDA devices to finish.
+     * @details Terminates the program, if any exception is thrown.
      */
     ~csvm() override;

+  protected:
     /**
-     * @brief Uses the already learned model to predict the class of multiple (new) data points.
-     * @param[in] points the data points to predict
-     * @return a `std::vector` filled with negative values for each prediction for a data point with the negative class and positive values otherwise ([[nodiscard]])
+     * @copydoc plssvm::detail::gpu_csvm::device_synchronize
      */
-    [[nodiscard]] virtual std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) override;
+    void device_synchronize(queue_type &queue) final;

-  protected:
-    void setup_data_on_device() override;
-    std::vector<real_type> generate_q() override;
-    std::vector<real_type> solver_CG(const std::vector<real_type> &b, size_type imax, real_type eps, const std::vector<real_type> &q) override;
     /**
-     * @brief updates the `w_` vector to the current data and alpha values.
+     * @copydoc plssvm::detail::gpu_csvm::run_q_kernel
      */
-    virtual void update_w() override;
-
+    void run_q_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, std::size_t num_features) final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_svm_kernel
+     */
+    void run_svm_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, real_type add, std::size_t num_features) final;
     /**
-     * @brief Select the correct kernel based on the value of @p kernel_ and run it on the CUDA @p device.
-     * @param[in] device the CUDA device to run the kernel on
-     * @param[in] q_d subvector of the least-squares matrix equation
-     * @param[in,out] r_d the result vector
-     * @param[in] x_d the `x` vector
-     * @param[in] data_d the data
-     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @copydoc plssvm::detail::gpu_csvm::run_w_kernel
      */
-    void run_device_kernel(int device, const detail::device_ptr<real_type> &q_d, detail::device_ptr<real_type> &r_d, const detail::device_ptr<real_type> &x_d, const detail::device_ptr<real_type> &data_d, real_type add);
+    void run_w_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, std::size_t num_features) final;
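The CUDA class now only fills in backend-specific hooks; the CG solver itself lives in the shared base. A minimal, self-contained sketch of this template-method split (simplified stand-in names, not the actual plssvm::detail::gpu_csvm interface):

```cpp
#include <cstddef>   // std::size_t
#include <iostream>  // std::cout

// simplified stand-in for plssvm::detail::gpu_csvm: the solver drives the
// algorithm, derived backends only provide kernel launches and synchronization
template <typename queue_t>
class gpu_solver_base {
  public:
    virtual ~gpu_solver_base() = default;

    void solve() {
        run_q_kernel(0);             // backend-specific kernel launch
        device_synchronize(queue_);  // backend-specific synchronization
    }

  protected:
    virtual void run_q_kernel(std::size_t device) = 0;
    virtual void device_synchronize(queue_t &queue) = 0;
    queue_t queue_{};
};

// hypothetical CUDA-like backend: the queue type is a plain device ID,
// mirroring `using queue_type = int;` above
class cuda_like_backend final : public gpu_solver_base<int> {
  protected:
    void run_q_kernel(std::size_t device) override {
        std::cout << "launching q kernel on device " << device << '\n';
    }
    void device_synchronize(int &queue) override {
        std::cout << "synchronizing device " << queue << '\n';
    }
};

int main() {
    cuda_like_backend svm{};
    svm.solve();
}
```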
     /**
-     * @brief Combines the data in @p buffer_d from all devices into @p buffer and distributes them back to each devices.
-     * @param[in,out] buffer_d the data to gather
-     * @param[in,out] buffer the reduces data
+     * @copydoc plssvm::detail::gpu_csvm::run_predict_kernel
      */
-    void device_reduction(std::vector<detail::device_ptr<real_type>> &buffer_d, std::vector<real_type> &buffer);
-
-    /// The number of available/used CUDA devices.
-    int num_devices_{};
-    /// The number of data points excluding the last data point.
-    size_type dept_{};
-    /// The boundary size used to remove boundary condition checks inside the kernels.
-    size_type boundary_size_{};
-    /// The number of rows to calculate including the boundary values.
-    int num_rows_{};
-    /// The number of columns in the data matrix (= the number of features per data point).
-    int num_cols_{};
-    /// The data saved across all devices.
-    std::vector<detail::device_ptr<real_type>> data_d_{};
-    /// The last row of the data matrix.
-    std::vector<detail::device_ptr<real_type>> data_last_d_{};
-    /// The normal vector used for speeding up the prediction in case of the linear kernel function saved on the first device.
-    detail::device_ptr<real_type> w_d_{};
+    void run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, std::size_t num_predict_points) final;
 };

 extern template class csvm<float>;
 extern template class csvm<double>;

-}  // namespace plssvm::cuda
+}  // namespace cuda
+}  // namespace plssvm
diff --git a/include/plssvm/backends/CUDA/detail/atomics.cuh b/include/plssvm/backends/CUDA/detail/atomics.cuh
index 3b0f13bab..2fc83f3bf 100644
--- a/include/plssvm/backends/CUDA/detail/atomics.cuh
+++ b/include/plssvm/backends/CUDA/detail/atomics.cuh
@@ -11,19 +11,18 @@

 #pragma once

-// TODO: include necessary?
-
 namespace plssvm::cuda::detail {

-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
-#else
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
 /**
- * @brief Atomically add the double precision value @p val to the value denoted by @p address.
- * @param[in,out] address the value to increment
+ * @brief Atomically add the double precision @p val to the value denoted by @p addr.
+ * @details Needed since devices with compute capability below 6.0 don't natively support double-precision floating-point atomics.
+ * @param[in,out] addr the value to increment
  * @param[in] val the value to add
+ * @return the old value before atomically adding @p val
  */
-__device__ __forceinline__ double atomicAdd(double *address, const double val) {
-    unsigned long long int *address_as_ull = (unsigned long long int *) address;
+__device__ __forceinline__ double atomicAdd(double *addr, const double val) {
+    unsigned long long int *address_as_ull = (unsigned long long int *) addr;
     unsigned long long int old = *address_as_ull, assumed;
     do {
         assumed = old;
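The hunk context ends inside the CAS loop; for reference, the canonical completion of this pattern as published in NVIDIA's CUDA C++ Programming Guide looks as follows (not part of the diff itself):

```cuda
// reference completion of the truncated CAS loop above
__device__ __forceinline__ double atomicAdd_reference(double *addr, const double val) {
    unsigned long long int *address_as_ull = (unsigned long long int *) addr;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        // reinterpret the bits, add val, and attempt to publish the new value;
        // atomicCAS returns the previous value, so the loop retries on contention
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);
    return __longlong_as_double(old);  // old value, matching built-in atomicAdd semantics
}
```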
diff --git a/include/plssvm/backends/CUDA/detail/device_ptr.cuh b/include/plssvm/backends/CUDA/detail/device_ptr.cuh
index 6c73fb945..0cab2c8b6 100644
--- a/include/plssvm/backends/CUDA/detail/device_ptr.cuh
+++ b/include/plssvm/backends/CUDA/detail/device_ptr.cuh
@@ -6,90 +6,67 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
  *
- * @brief Small wrapper around a CUDA device pointer and functions.
+ * @brief Small wrapper around a CUDA device pointer.
  */

 #pragma once

-#include <cstddef>  // std::size_t
-#include <vector>   // std::vector
+#include <cstddef>      // std::size_t
+#include <type_traits>  // std::is_same_v
+#include <vector>       // std::vector

 namespace plssvm::cuda::detail {

-/**
- * @brief Returns the number of available devices.
- * @return the number of devices (`[[nodiscard]]`)
- */
-[[nodiscard]] int get_device_count();
-/**
- * @brief Set the device @p device to the active CUDA device.
- * @param[in] device the now active device
- */
-void set_device(int device);
-
-/**
- * @brief Returns the last error from a runtime call.
- */
-void peek_at_last_error();
-/**
- * @brief Wait for the current compute device to finish.
- * @details Calls `peek_at_last_error()` before synchronizing.
- */
-void device_synchronize();
-/**
- * @brief Wait for the compute device @p device to finish.
- * @details Calls `peek_at_last_error()` before synchronizing.
- * @param[in] device the CUDA device to synchronize
- * @throws plssvm::cuda::backend_exception if the given device ID is smaller than `0` or greater or equal than the available number of devices
- */
-void device_synchronize(int device);
-
 /**
  * @brief Small wrapper class around a CUDA device pointer together with commonly used device functions.
  * @tparam T the type of the kernel pointer to wrap
  */
 template <typename T>
 class device_ptr {
+    // only float and doubles are allowed
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>, "The template type can only be 'float' or 'double'!");
+
   public:
-    /// The type of the values used in the CUDA device pointer.
+    /// The type of the values used in the device_ptr.
     using value_type = T;
-    /// The type of the wrapped CUDA device pointer.
+    /// The type of the wrapped device_ptr.
     using pointer = value_type *;
-    /// The const type of the wrapped CUDA device pointer.
+    /// The const type of the wrapped device_ptr.
     using const_pointer = const value_type *;
     /// The used size type.
     using size_type = std::size_t;

     /**
-     * @brief Default construct a `device_ptr` with a size of `0`.
-     * @details Always associated with device `0`.
+     * @brief Default construct a device_ptr with a size of 0.
+     * @details Always associated with device 0.
      */
     device_ptr() = default;
     /**
      * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
-     * @param[in] size the number of elements represented by the device pointer
+     * @param[in] size the number of elements represented by the device_ptr
      * @param[in] device the associated CUDA device
-     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than `0` or greater or equal than the available number of devices
+     * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
      */
     explicit device_ptr(size_type size, int device = 0);
     /**
-     * @brief Move only type, therefore deleted copy-constructor.
+     * @brief Delete copy-constructor to make device_ptr a move only type.
      */
     device_ptr(const device_ptr &) = delete;
     /**
-     * @brief Move-constructor.
-     * @param[in,out] other the `device_ptr` to move-construct from
+     * @brief Move-constructor as device_ptr is a move-only type.
+     * @param[in,out] other the device_ptr to move-construct from
      */
     device_ptr(device_ptr &&other) noexcept;
     /**
-     * @brief Move only type, therefore deleted copy-assignment operator.
+     * @brief Delete copy-assignment-operator to make device_ptr a move only type.
      */
     device_ptr &operator=(const device_ptr &) = delete;
     /**
-     * @brief Move-assignment operator. Uses the copy-and-swap idiom.
-     * @param[in] other the `device_ptr` to move-assign from
+     * @brief Move-assignment-operator as device_ptr is a move-only type.
+     * @details Uses the copy-and-swap idiom.
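As a reminder of what the copy-and-swap idiom referenced here boils down to, a minimal illustrative definition (a sketch, not the project's actual one):

```cpp
#include <utility>  // std::move

template <typename T>
device_ptr<T> &device_ptr<T>::operator=(device_ptr &&other) noexcept {
    // move-construct a temporary from other, then swap it into *this;
    // the temporary's destructor frees the previously held device memory
    device_ptr tmp{ std::move(other) };
    this->swap(tmp);
    return *this;
}
```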
+ * @param[in,out] other the device_ptr to move-assign from * @return `*this` */ device_ptr &operator=(device_ptr &&other) noexcept; @@ -101,26 +78,27 @@ class device_ptr { /** * @brief Swap the contents of `*this` with the contents of @p other. - * @param[in,out] other the other `device_ptr` + * @param[in,out] other the other device_ptr */ void swap(device_ptr &other) noexcept; /** * @brief Swap the contents of @p lhs and @p rhs. - * @param[in,out] lhs a `device_ptr` - * @param[in,out] rhs a `device_ptr` + * @param[in,out] lhs a device_ptr + * @param[in,out] rhs a device_ptr */ friend void swap(device_ptr &lhs, device_ptr &rhs) noexcept { lhs.swap(rhs); } /** * @brief Checks whether `*this` currently wraps a CUDA device pointer. - * @return `true` if `*this` wraps a device pointer, `false` otherwise (`[[nodiscard]]`) + * @details Same as `device_ptr::get() != nullptr`. + * @return `true` if `*this` wraps a CUDA device pointer, `false` otherwise (`[[nodiscard]]`) */ [[nodiscard]] explicit operator bool() const noexcept { return data_ != nullptr; } /** * @brief Access the underlying CUDA device pointer. - * @return the device pointer (`[[nodiscard]]`) + * @return the device_ptr (`[[nodiscard]]`) */ [[nodiscard]] pointer get() noexcept { return data_; @@ -132,14 +110,15 @@ class device_ptr { return data_; } /** - * @brief Get the number of elements in the wrapped CUDA device pointer. + * @brief Get the number of elements in the wrapped device_ptr. * @return the size (`[[nodiscard]]`) */ [[nodiscard]] size_type size() const noexcept { return size_; } /** - * @brief Check whether no elements are currently associated to the CUDA device pointer. + * @brief Check whether the device_ptr currently maps zero elements. + * @details Same as `device_ptr::size() == 0`. * @return `true` if no elements are wrapped, `false` otherwise (`[[nodiscard]]`) */ [[nodiscard]] bool empty() const noexcept { @@ -149,15 +128,15 @@ class device_ptr { * @brief Return the device associated with the wrapped CUDA device pointer. * @return the device ID (`[[nodiscard]]`) */ - [[nodiscard]] int device() const noexcept { + [[nodiscard]] int queue() const noexcept { return device_; } /** * @brief Memset all values to @p value starting at position @p pos. * @param[in] value the memset value - * @param[in] pos the position to start the memset (default: `0`) - * @throws plssvm::cuda::backend_exception if @p pos is greater or equal than `device_ptr::size()` + * @param[in] pos the position to start the memset (default: 0) + * @throws plssvm::cuda::backend_exception if @p pos is greater or equal than device_ptr::size() */ void memset(int value, size_type pos = 0); /** @@ -166,12 +145,12 @@ class device_ptr { * @param[in] value the memset value * @param[in] pos the position to start the memset * @param[in] count the number of values to set - * @throws plssvm::cuda::backend_exception if @p pos is greater or equal than `device_ptr::size()` + * @throws plssvm::cuda::backend_exception if @p pos is greater or equal than device_ptr::size() */ void memset(int value, size_type pos, size_type count); /** - * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device. + * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device. 
      * @param[in] data_to_copy the data to copy onto the device
      * @throws plssvm::cuda::backend_exception if @p data_to_copy is too small to satisfy the memcpy
      */
@@ -186,7 +165,7 @@
      */
     void memcpy_to_device(const std::vector<value_type> &data_to_copy, size_type pos, size_type count);
     /**
-     * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device.
+     * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device.
      * @param[in] data_to_copy the data to copy onto the device
      */
     void memcpy_to_device(const_pointer data_to_copy);
@@ -200,38 +179,38 @@
     void memcpy_to_device(const_pointer data_to_copy, size_type pos, size_type count);

     /**
-     * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer.
-     * @param[in] buffer the buffer to copy the data to
+     * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer.
+     * @param[out] buffer the buffer to copy the data to
      * @throws plssvm::cuda::backend_exception if @p buffer is too small
      */
     void memcpy_to_host(std::vector<value_type> &buffer) const;
     /**
      * @brief Memcpy up-to @p count many values from the device starting at CUDA device pointer position @p pos to the host buffer @p buffer.
      * @details Copies `[p, rcount)` values where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`.
-     * @param[in] buffer the buffer to copy the data to
+     * @param[out] buffer the buffer to copy the data to
      * @param[in] pos the starting position for the copying in the CUDA device pointer
      * @param[in] count the number of elements to copy
      * @throws plssvm::cuda::backend_exception if @p data_to_copy is too small
      */
     void memcpy_to_host(std::vector<value_type> &buffer, size_type pos, size_type count) const;
     /**
-     * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer.
-     * @param[in] buffer the buffer to copy the data to
+     * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer.
+     * @param[out] buffer the buffer to copy the data to
      */
     void memcpy_to_host(pointer buffer) const;
     /**
      * @brief Memcpy up-to @p count many values from the device starting at CUDA device pointer position @p pos to the host buffer @p buffer.
      * @details Copies `[p, rcount)` values where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`.
-     * @param[in] buffer the buffer to copy the data to
+     * @param[out] buffer the buffer to copy the data to
      * @param[in] pos the starting position for the copying in the CUDA device pointer
      * @param[in] count the number of elements to copy
      */
     void memcpy_to_host(pointer buffer, size_type pos, size_type count) const;

   private:
-    int device_ = 0;
-    pointer data_ = nullptr;
-    size_type size_ = 0;
+    int device_{ 0 };
+    pointer data_{ nullptr };
+    size_type size_{ 0 };
 };

 extern template class device_ptr<float>;
diff --git a/include/plssvm/backends/CUDA/detail/utility.cuh b/include/plssvm/backends/CUDA/detail/utility.cuh
new file mode 100644
index 000000000..76a0262dc
--- /dev/null
+++ b/include/plssvm/backends/CUDA/detail/utility.cuh
@@ -0,0 +1,55 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Utility functions for the CUDA backend.
+ */
+
+#pragma once
+
+/**
+ * @def PLSSVM_CUDA_ERROR_CHECK
+ * @brief Macro used for error checking CUDA runtime functions.
+ */
+#define PLSSVM_CUDA_ERROR_CHECK(err) plssvm::cuda::detail::gpu_assert((err))
+
+namespace plssvm::cuda::detail {
+
+/**
+ * @brief Check the CUDA error @p code. If @p code signals an error, throw a plssvm::cuda::backend_exception.
+ * @details The exception contains the error name and error string.
+ * @param[in] code the CUDA error code to check
+ * @throws plssvm::cuda::backend_exception if the error code signals a failure
+ */
+void gpu_assert(cudaError_t code);
+
+/**
+ * @brief Returns the number of available CUDA devices.
+ * @return the number of devices (`[[nodiscard]]`)
+ */
+[[nodiscard]] int get_device_count();
+
+/**
+ * @brief Set the device @p device to the active CUDA device.
+ * @param[in] device the now active device
+ */
+void set_device(int device);
+
+/**
+ * @brief Returns the last error from a CUDA runtime call.
+ */
+void peek_at_last_error();
+
+/**
+ * @brief Wait for the compute @p device to finish.
+ * @details Calls plssvm::cuda::detail::peek_at_last_error() before synchronizing.
+ * @param[in] device the CUDA device to synchronize
+ * @throws plssvm::cuda::backend_exception if the given device ID is smaller than 0 or greater or equal than the available number of devices
+ */
+void device_synchronize(int device);
+
+}  // namespace plssvm::cuda::detail
\ No newline at end of file
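How these helpers are intended to compose, sketched against the plain CUDA runtime API (the wrapping function and its body are illustrative assumptions, not part of the diff):

```cpp
#include "plssvm/backends/CUDA/detail/utility.cuh"

#include <cuda_runtime.h>  // cudaSetDevice, cudaDeviceSynchronize

void example(const int device) {
    // every runtime call is wrapped; a failing call throws
    // plssvm::cuda::backend_exception instead of being silently ignored
    PLSSVM_CUDA_ERROR_CHECK(cudaSetDevice(device));

    // ... launch kernels here ...

    // surface asynchronous launch errors, then wait for the device
    plssvm::cuda::detail::peek_at_last_error();
    PLSSVM_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
}
```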
diff --git a/include/plssvm/backends/CUDA/exceptions.hpp b/include/plssvm/backends/CUDA/exceptions.hpp
index 2dc7f9f66..510dafdfe 100644
--- a/include/plssvm/backends/CUDA/exceptions.hpp
+++ b/include/plssvm/backends/CUDA/exceptions.hpp
@@ -24,12 +24,11 @@ namespace plssvm::cuda {
 class backend_exception : public exception {
   public:
     /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
      * @param[in] msg the exception's `what()` message
      * @param[in] loc the exception's call site information
      */
-    explicit backend_exception(const std::string &msg, source_location loc = source_location::current()) :
-        ::plssvm::exception{ msg, "cuda::backend_exception", loc } {}
+    explicit backend_exception(const std::string &msg, source_location loc = source_location::current());
 };

-};  // namespace plssvm::cuda
\ No newline at end of file
+}  // namespace plssvm::cuda
\ No newline at end of file
diff --git a/include/plssvm/backends/CUDA/predict.cuh b/include/plssvm/backends/CUDA/predict.cuh
deleted file mode 100644
index ad6c5344a..000000000
--- a/include/plssvm/backends/CUDA/predict.cuh
+++ /dev/null
@@ -1,24 +0,0 @@
-/**
- * @file
- * @author Alexander Van Craen
- * @author Marcel Breyer
- * @copyright 2018-today The PLSSVM project - All Rights Reserved
- * @license This file is part of the PLSSVM project which is released under the MIT license.
- *          See the LICENSE.md file in the project root for full license information.
- *
- * @brief TODO: brief description
- */
-
-#pragma once
-
-namespace plssvm::cuda {
-template <typename real_type>
-__global__ void kernel_w(real_type *w_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const int num_data_points, const int num_features);
-
-template <typename real_type>
-__global__ void predict_points_poly(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const int num_data_points, const real_type *points, const int num_predict_points, const int num_features, const int degree, const real_type gamma, const real_type coef0);
-
-template <typename real_type>
-__global__ void predict_points_rbf(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const int num_data_points, const real_type *points, const int num_predict_points, const int num_features, const real_type gamma);
-
-}  // namespace plssvm::cuda
\ No newline at end of file
diff --git a/include/plssvm/backends/CUDA/predict_kernel.cuh b/include/plssvm/backends/CUDA/predict_kernel.cuh
new file mode 100644
index 000000000..894000d88
--- /dev/null
+++ b/include/plssvm/backends/CUDA/predict_kernel.cuh
@@ -0,0 +1,68 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Defines the functions used for prediction for the C-SVM using the CUDA backend.
+ */
+
+#pragma once
+
+#include "plssvm/constants.hpp"  // plssvm::kernel_index_type
+
+namespace plssvm::cuda {
+
+/**
+ * @brief Calculate the `w` vector to speed up the prediction of the labels for data points using the linear kernel function.
+ * @details Supports multi-GPU execution.
+ * @tparam real_type the type of the data
+ * @param[out] w_d the `w` vector to assemble
+ * @param[in] data_d the one-dimensional support vector matrix
+ * @param[in] data_last_d the last row of the support vector matrix
+ * @param[in] alpha_d the previously calculated weight for each data point
+ * @param[in] num_data_points the total number of support vectors
+ * @param[in] num_features the number of features per support vector
+ */
+template <typename real_type>
+__global__ void device_kernel_w_linear(real_type *w_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const kernel_index_type num_features);
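A sketch of how such a kernel could be launched from the host; the block/grid values below are illustrative placeholders, PLSSVM derives the real ones from its execution_range:

```cuda
// hypothetical host-side launch of the w kernel; w_d, data_d, data_last_d and
// alpha_d are assumed to be valid device allocations of matching sizes
template <typename real_type>
void launch_w_kernel(real_type *w_d, const real_type *data_d, const real_type *data_last_d,
                     const real_type *alpha_d, const int num_data_points, const int num_features) {
    const dim3 block(256);                                    // threads per block (example value)
    const dim3 grid((num_features + block.x - 1) / block.x);  // one thread per feature
    plssvm::cuda::device_kernel_w_linear<<<grid, block>>>(
        w_d, data_d, data_last_d, alpha_d, num_data_points, num_features);
}
```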
+
+/**
+ * @brief Predicts the labels for data points using the polynomial kernel function.
+ * @details Currently only single GPU execution is supported.
+ * @tparam real_type the type of the data
+ * @param[out] out_d the calculated predictions
+ * @param[in] data_d the one-dimensional support vector matrix
+ * @param[in] data_last_d the last row of the support vector matrix
+ * @param[in] alpha_d the previously calculated weight for each data point
+ * @param[in] num_data_points the total number of support vectors
+ * @param[in] points the data points to predict
+ * @param[in] num_predict_points the total number of data points to predict
+ * @param[in] num_features the number of features per support vector and point to predict
+ * @param[in] degree the degree parameter used in the polynomial kernel function
+ * @param[in] gamma the gamma parameter used in the polynomial kernel function
+ * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
+ */
+template <typename real_type>
+__global__ void device_kernel_predict_poly(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const int degree, const real_type gamma, const real_type coef0);
+
+/**
+ * @brief Predicts the labels for data points using the radial basis functions kernel function.
+ * @details Currently only single GPU execution is supported.
+ * @tparam real_type the type of the data
+ * @param[out] out_d the calculated predictions
+ * @param[in] data_d the one-dimensional support vector matrix
+ * @param[in] data_last_d the last row of the support vector matrix
+ * @param[in] alpha_d the previously calculated weight for each data point
+ * @param[in] num_data_points the total number of support vectors
+ * @param[in] points the data points to predict
+ * @param[in] num_predict_points the total number of data points to predict
+ * @param[in] num_features the number of features per support vector and point to predict
+ * @param[in] gamma the gamma parameter used in the rbf kernel function
+ */
+template <typename real_type>
+__global__ void device_kernel_predict_radial(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const real_type gamma);
+
+}  // namespace plssvm::cuda
\ No newline at end of file
diff --git a/include/plssvm/backends/CUDA/q_kernel.cuh b/include/plssvm/backends/CUDA/q_kernel.cuh
index b61ffd974..83ade808e 100644
--- a/include/plssvm/backends/CUDA/q_kernel.cuh
+++ b/include/plssvm/backends/CUDA/q_kernel.cuh
@@ -11,6 +11,8 @@

 #pragma once

+#include "plssvm/constants.hpp"  // plssvm::kernel_index_type
+
 namespace plssvm::cuda {

 /**
@@ -21,11 +23,10 @@
  * @param[in] data_d the one-dimensional data matrix
  * @param[in] data_last the last row in the data matrix
  * @param[in] num_rows the number of rows in the data matrix
- * @param[in] first_feature the first feature used in the calculations (depending on the current device)
- * @param[in] last_feature the last feature used in the calculations (depending on the current device)
+ * @param[in] feature_range the number of features used for the calculation
  */
 template <typename real_type>
-__global__ void device_kernel_q_linear(real_type *q, const real_type *data_d, const real_type *data_last, const int num_rows, const int first_feature, const int last_feature);
+__global__ void device_kernel_q_linear(real_type *q, const real_type *data_d, const real_type *data_last,
const kernel_index_type num_rows, const kernel_index_type feature_range);

 /**
  * @brief Calculates the `q` vector using the polynomial C-SVM kernel.
@@ -41,7 +42,7 @@
  * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
  */
 template <typename real_type>
-__global__ void device_kernel_q_poly(real_type *q, const real_type *data_d, const real_type *data_last, const int num_rows, const int num_cols, const int degree, const real_type gamma, const real_type coef0);
+__global__ void device_kernel_q_poly(real_type *q, const real_type *data_d, const real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const int degree, const real_type gamma, const real_type coef0);

 /**
  * @brief Calculates the `q` vector using the radial basis functions C-SVM kernel.
@@ -55,6 +56,6 @@
  * @param[in] gamma the gamma parameter used in the rbf kernel function
  */
 template <typename real_type>
-__global__ void device_kernel_q_radial(real_type *q, const real_type *data_d, const real_type *data_last, const int num_rows, const int num_cols, const real_type gamma);
+__global__ void device_kernel_q_radial(real_type *q, const real_type *data_d, const real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type gamma);

 }  // namespace plssvm::cuda
\ No newline at end of file
diff --git a/include/plssvm/backends/CUDA/svm_kernel.cuh b/include/plssvm/backends/CUDA/svm_kernel.cuh
index 31ad24597..7935eac46 100644
--- a/include/plssvm/backends/CUDA/svm_kernel.cuh
+++ b/include/plssvm/backends/CUDA/svm_kernel.cuh
@@ -11,6 +11,8 @@

 #pragma once

+#include "plssvm/constants.hpp"  // plssvm::kernel_index_type
+
 namespace plssvm::cuda {

 /**
@@ -18,25 +20,25 @@
  * @details Supports multi-GPU execution.
  * @tparam real_type the type of the data
  * @param[in] q the `q` vector
- * @param[in] ret the result vector
+ * @param[out] ret the result vector
  * @param[in] d the right-hand side of the equation
  * @param[in] data_d the one-dimension data matrix
- * @param[in] QA_cost he bottom right matrix entry multiplied by cost
+ * @param[in] QA_cost the bottom right matrix entry multiplied by cost
  * @param[in] cost 1 / the cost parameter in the C-SVM
  * @param[in] num_rows the number of columns in the data matrix
+ * @param[in] feature_range the number of features used for the calculation on the device @p id
  * @param[in] add denotes whether the values are added or subtracted from the result vector
- * @param[in] first_feature the first feature used in the calculations (depending on the current device)
- * @param[in] last_feature the last feature used in the calculations (depending on the current device)
+ * @param[in] id the id of the current device
  */
 template <typename real_type>
-__global__ void device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const real_type add, const int first_feature, const int last_feature);
+__global__ void device_kernel_linear(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id);

 /**
  * @brief Calculates the C-SVM kernel using the polynomial kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam real_type the type of the data
  * @param[in] q the `q` vector
- * @param[in] ret the result vector
+ * @param[out] ret the result vector
  * @param[in] d the right-hand side of the equation
  * @param[in] data_d the one-dimension data matrix
  * @param[in] QA_cost he bottom right matrix entry multiplied by cost
@@ -49,14 +51,14 @@
  * @param[in] coef0 the coef0 parameter used in the polynomial kernel function
  */
 template <typename real_type>
-__global__ void device_kernel_poly(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const int num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0);
+__global__ void device_kernel_poly(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0);

 /**
  * @brief Calculates the C-SVM kernel using the radial basis function kernel function.
  * @details Currently only single GPU execution is supported.
  * @tparam real_type the type of the data
  * @param[in] q the `q` vector
- * @param[in] ret the result vector
+ * @param[out] ret the result vector
  * @param[in] d the right-hand side of the equation
  * @param[in] data_d the one-dimension data matrix
  * @param[in] QA_cost he bottom right matrix entry multiplied by cost
@@ -67,6 +69,6 @@
  * @param[in] gamma the gamma parameter used in the rbf kernel function
  */
 template <typename real_type>
-__global__ void device_kernel_radial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const int num_cols, const real_type add, const real_type gamma);
+__global__ void device_kernel_radial(const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma);

 }  // namespace plssvm::cuda
\ No newline at end of file
diff --git a/include/plssvm/backends/OpenCL/csvm.hpp b/include/plssvm/backends/OpenCL/csvm.hpp
index d907eaa1e..87d5ddf20 100644
--- a/include/plssvm/backends/OpenCL/csvm.hpp
+++ b/include/plssvm/backends/OpenCL/csvm.hpp
@@ -14,30 +14,39 @@
 #include "plssvm/backends/OpenCL/detail/command_queue.hpp"  // plssvm::opencl::detail::command_queue
 #include "plssvm/backends/OpenCL/detail/device_ptr.hpp"     // plssvm::opencl::detail::device_ptr
 #include "plssvm/backends/OpenCL/detail/kernel.hpp"         // plssvm::opencl::detail::kernel
-#include "plssvm/csvm.hpp"             // plssvm::csvm
-#include "plssvm/kernel_types.hpp"     // plssvm::kernel_type
-#include "plssvm/parameter.hpp"        // plssvm::parameter
-#include "plssvm/target_platform.hpp"  // plssvm::target_platform
+#include "plssvm/backends/gpu_csvm.hpp"  // plssvm::detail::gpu_csvm

-#include <vector>  // std::vector
+#include <cstddef>  // std::size_t
+#include <vector>   // std::vector

-namespace plssvm::opencl {
+namespace plssvm {
+
+// forward declare parameter class
+template <typename T>
+class parameter;
+
+namespace detail {
+
+// forward declare execution_range class
+class execution_range;
+
+}  // namespace detail
+
+namespace opencl {

 /**
- * @brief The C-SVM class using the OpenCL backend.
+ * @brief A C-SVM implementation using OpenCL as backend.
  * @tparam T the type of the data
  */
 template <typename T>
-class csvm : public ::plssvm::csvm<T> {
+class csvm : public ::plssvm::detail::gpu_csvm<T, ::plssvm::opencl::detail::device_ptr<T>, ::plssvm::opencl::detail::command_queue> {
   protected:
     // protected for test MOCK class
     /// The template base type of the OpenCL C-SVM class.
-    using base_type = ::plssvm::csvm<T>;
-    using base_type::alpha_ptr_;
-    using base_type::bias_;
+    using base_type = ::plssvm::detail::gpu_csvm<T, ::plssvm::opencl::detail::device_ptr<T>, ::plssvm::opencl::detail::command_queue>;
+
+    using base_type::coef0_;
     using base_type::cost_;
-    using base_type::data_ptr_;
     using base_type::degree_;
     using base_type::gamma_;
     using base_type::kernel_;
@@ -46,81 +55,64 @@ class csvm : public ::plssvm::csvm<T> {
     using base_type::print_info_;
     using base_type::QA_cost_;
     using base_type::target_;
-    using base_type::w_;
+
+    using base_type::data_d_;
+    using base_type::data_last_d_;
+    using base_type::devices_;
+    using base_type::num_cols_;
+    using base_type::num_rows_;

   public:
     /// The type of the data. Must be either `float` or `double`.
     using real_type = typename base_type::real_type;
-    /// Unsigned integer type.
-    using size_type = typename base_type::size_type;
+
+    /// The type of the OpenCL device pointer.
+    using device_ptr_type = ::plssvm::opencl::detail::device_ptr<real_type>;
+    /// The type of the OpenCL device queue.
+    using queue_type = ::plssvm::opencl::detail::command_queue;

     /**
      * @brief Construct a new C-SVM using the OpenCL backend with the parameters given through @p params.
      * @param[in] params struct encapsulating all possible parameters
+     * @throws plssvm::csvm::csvm() exceptions
+     * @throws plssvm::opencl::backend_exception if the requested plssvm::target_platform isn't available
+     * @throws plssvm::opencl::backend_exception if no possible OpenCL devices could be found
      */
     explicit csvm(const parameter<T> &params);

     /**
      * @brief Wait for all operations on all devices to finish.
-     * @details Terminates the program, if any exceptions are thrown.
+     * @details Terminates the program, if any exception is thrown.
      */
     ~csvm() override;

+  protected:
     /**
-     * @brief Uses the already learned model to predict the class of multiple (new) data points.
-     * @param[in] points the data points to predict
-     * @return a `std::vector` filled with negative values for each prediction for a data point with the negative class and positive values otherwise ([[nodiscard]])
+     * @copydoc plssvm::detail::gpu_csvm::device_synchronize
      */
-    [[nodiscard]] virtual std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) override;
+    void device_synchronize(queue_type &queue) final;

-  protected:
-    void setup_data_on_device() override;
-    std::vector<real_type> generate_q() override;
-    std::vector<real_type> solver_CG(const std::vector<real_type> &b, size_type imax, real_type eps, const std::vector<real_type> &q) override;
     /**
-     * @brief updates the `w_` vector to the current data and alpha values.
+     * @copydoc plssvm::detail::gpu_csvm::run_q_kernel
      */
-    virtual void update_w() override;
-
+    void run_q_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, std::size_t num_features) final;
     /**
-     * @brief Select the correct kernel based on the value of @p kernel_ and run it on the OpenCL @p device.
-     * @param[in] device the OpenCL device to run the kernel on
-     * @param[in] q_d subvector of the least-squares matrix equation
-     * @param[in,out] r_d the result vector
-     * @param[in] x_d the `x` vector
-     * @param[in] data_d the data
-     * @param[in] add denotes whether the values are added or subtracted from the result vector
+     * @copydoc plssvm::detail::gpu_csvm::run_svm_kernel
      */
-    void run_device_kernel(size_type device, const detail::device_ptr<real_type> &q_d, detail::device_ptr<real_type> &r_d, const detail::device_ptr<real_type> &x_d, const detail::device_ptr<real_type> &data_d, real_type add);
+    void run_svm_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, real_type add, std::size_t num_features) final;
     /**
-     * @brief Combines the data in @p buffer_d from all devices into @p buffer and distributes them back to each devices.
-     * @param[in,out] buffer_d the data to gather
-     * @param[in,out] buffer the reduces data
+     * @copydoc plssvm::detail::gpu_csvm::run_w_kernel
      */
-    void device_reduction(std::vector<detail::device_ptr<real_type>> &buffer_d, std::vector<real_type> &buffer);
-
-    /// The available/used OpenCL devices.
-    std::vector<detail::command_queue> devices_{};
-    /// The number of data points excluding the last data point.
-    size_type dept_{};
-    /// The boundary size used to remove boundary condition checks inside the kernels.
-    size_type boundary_size_{};
-    /// The number of rows to calculate including the boundary values.
-    int num_rows_{};
-    /// The number of columns in the data matrix (= the number of features per data point).
-    int num_cols_{};
-    /// The data saved across all devices.
-    std::vector<detail::device_ptr<real_type>> data_d_{};
-    /// The last row of the data matrix.
-    std::vector<detail::device_ptr<real_type>> data_last_d_{};
-    /// The normal vector used for speeding up the prediction in case of the linear kernel function saved on the first device.
-    detail::device_ptr<real_type> w_d_{};
+    void run_w_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, std::size_t num_features) final;
+    /**
+     * @copydoc plssvm::detail::gpu_csvm::run_predict_kernel
+     */
+    void run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, std::size_t num_predict_points) final;

     /// OpenCL kernel for the generate q function compiled for each device.
     std::vector<detail::kernel> q_kernel_{};
     /// OpenCL kernel for the svm kernel function compiled for each device.
     std::vector<detail::kernel> svm_kernel_{};
-    /// OpenCL kernel for the kernel_w function compiled for each device.
     std::vector<detail::kernel> kernel_w_kernel_{};
     /// OpenCL kernel for the prediction function compiled for each device.
@@ -130,4 +122,5 @@
 extern template class csvm<float>;
 extern template class csvm<double>;

-}  // namespace plssvm::opencl
\ No newline at end of file
+}  // namespace opencl
+}  // namespace plssvm
\ No newline at end of file
diff --git a/include/plssvm/backends/OpenCL/detail/atomics.cl b/include/plssvm/backends/OpenCL/detail/atomics.cl
new file mode 100644
index 000000000..1b96bb8a0
--- /dev/null
+++ b/include/plssvm/backends/OpenCL/detail/atomics.cl
@@ -0,0 +1,53 @@
+/**
+* @file
+* @author Alexander Van Craen
+* @author Marcel Breyer
+* @copyright 2018-today The PLSSVM project - All Rights Reserved
+* @license This file is part of the PLSSVM project which is released under the MIT license.
+*          See the LICENSE.md file in the project root for full license information.
+*
+* @brief Defines atomic functions for floating point types.
+*/ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable + +/** + * @brief Implementation of an atomic add function for double-precision floating point types. + * @param[in,out] addr the source value to add @p val to + * @param[in] val the value to add to @p addr + */ +inline void __attribute__((overloadable)) atomicAdd(__global const double *addr, const double val) { + union { + ulong u32; + double f32; + } next, expected, current; + current.f32 = *addr; + do { + expected.f32 = current.f32; + next.f32 = expected.f32 + val; + current.u32 = atomic_cmpxchg((volatile __global ulong *) addr, + expected.u32, + next.u32); + } while (current.u32 != expected.u32); +} + +/** + * @brief Implementation of an atomic add function for single-precision floating point types. + * @param[in,out] addr the source value to add @p val to + * @param[in] val the value to add to @p addr + */ +inline void __attribute__((overloadable)) atomicAdd(__global const float *addr, const float val) { + union { + unsigned int u32; + float f32; + } next, expected, current; + current.f32 = *addr; + do { + expected.f32 = current.f32; + next.f32 = expected.f32 + val; + current.u32 = atomic_cmpxchg((volatile __global unsigned int *) addr, + expected.u32, + next.u32); + } while (current.u32 != expected.u32); +} \ No newline at end of file diff --git a/include/plssvm/backends/OpenCL/detail/command_queue.hpp b/include/plssvm/backends/OpenCL/detail/command_queue.hpp index 4dbd6af3c..805823748 100644 --- a/include/plssvm/backends/OpenCL/detail/command_queue.hpp +++ b/include/plssvm/backends/OpenCL/detail/command_queue.hpp @@ -31,17 +31,32 @@ class command_queue { * @param[in] queue the OpenCL cl_command_queue to wrap * @param[in] device the associated OpenCL cl_device_id */ - command_queue(cl_context context, cl_command_queue queue, cl_device_id device) : - context{ context }, queue{ queue }, device{ device } {} + command_queue(cl_context context, cl_command_queue queue, cl_device_id device); + + /** + * @brief Delete copy-constructor to make command_queue a move only type. + */ + command_queue(const command_queue &) = delete; + /** + * @brief Move-constructor as command_queue is a move-only type. + * @param[in,out] other the command_queue to move the resources from + */ + command_queue(command_queue &&other) noexcept; + /** + * @brief Delete copy-assignment-operator to make command_queue a move only type. + */ + command_queue &operator=(const command_queue &) = delete; + /** + * @brief Move-assignment-operator as command_queue is a move-only type. + * @param[in,out] other the command_queue to move the resources from + * @return `*this` + */ + command_queue &operator=(command_queue &&other); /** * @brief Release the cl_command_queue resources on destruction. */ - ~command_queue() { - if (queue) { - clReleaseCommandQueue(queue); - } - } + ~command_queue(); /// The OpenCL context associated with the wrapped cl_command_queue. cl_context context{}; diff --git a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp index 4988519ce..31ec3b6cc 100644 --- a/include/plssvm/backends/OpenCL/detail/device_ptr.hpp +++ b/include/plssvm/backends/OpenCL/detail/device_ptr.hpp @@ -6,44 +6,30 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Small wrapper around a OpenCL device pointer and functions. 
+ * @brief Small wrapper around an OpenCL device pointer.
  */

 #pragma once

 #include "plssvm/backends/OpenCL/detail/command_queue.hpp"  // plssvm::opencl::detail::command_queue
-#include "plssvm/target_platform.hpp"                       // plssvm::target_platform

 #include "CL/cl.h"  // cl_mem

-#include <cstddef>  // std::size_t
-#include <vector>   // std::vector
+#include <cstddef>      // std::size_t
+#include <type_traits>  // std::is_same_v
+#include <vector>       // std::vector

 namespace plssvm::opencl::detail {

-/**
- * @brief Returns the list devices matching the target platform @p target.
- * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order:
- *          1. NVIDIA GPUs
- *          2. AMD GPUs
- *          3. Intel GPUs
- *          4. CPUs
- * @param[in] target the target platform for which the devices must match
- * @return the command queues (`[[nodiscard]]`)
- */
-[[nodiscard]] std::vector<command_queue> get_command_queues(target_platform target);
-/**
- * @brief Wait for the compute device associated with @p queue to finish.
- * @param[in] queue the command queue to synchronize
- */
-void device_synchronize(const command_queue &queue);
-
 /**
  * @brief Small wrapper class around an OpenCL device pointer together with commonly used device functions.
  * @tparam T the type of the kernel pointer to wrap
  */
 template <typename T>
 class device_ptr {
+    // only float and doubles are allowed
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>, "The template type can only be 'float' or 'double'!");
+
   public:
     /// The type of the values used in the OpenCL device pointer.
     using value_type = T;
@@ -55,35 +41,35 @@
     using size_type = std::size_t;

     /**
-     * @brief Default construct a `device_ptr` with a size of `0`.
-     * @details Always associated with device `0`.
+     * @brief Default construct a device_ptr with a size of 0.
+     * @details Always associated with device 0.
      */
     device_ptr() = default;
     /**
      * @brief Allocates `size * sizeof(T)` bytes on the device with ID @p device.
-     * @param[in] size the number of elements represented by the device pointer
+     * @param[in] size the number of elements represented by the device_ptr
      * @param[in] queue the associated command queue
-     * @throws plssvm::opencl::backend_exception if the given device ID is smaller than `0` or greater or equal than the available number of devices
      */
     explicit device_ptr(size_type size, command_queue &queue);
     /**
-     * @brief Move only type, therefore deleted copy-constructor.
+     * @brief Delete copy-constructor to make device_ptr a move only type.
      */
     device_ptr(const device_ptr &) = delete;
     /**
-     * @brief Move-constructor.
-     * @param[in,out] other the `device_ptr` to move-construct from
+     * @brief Move-constructor as device_ptr is a move-only type.
+     * @param[in,out] other the device_ptr to move-construct from
      */
     device_ptr(device_ptr &&other) noexcept;
     /**
-     * @brief Move only type, therefore deleted copy-assignment operator.
+     * @brief Delete copy-assignment-operator to make device_ptr a move only type.
      */
     device_ptr &operator=(const device_ptr &) = delete;
     /**
-     * @brief Move-assignment operator. Uses the copy-and-swap idiom.
-     * @param[in] other the `device_ptr` to move-assign from
+     * @brief Move-assignment-operator as device_ptr is a move-only type.
+     * @details Uses the copy-and-swap idiom.
+     * @param[in,out] other the device_ptr to move-assign from
      * @return `*this`
      */
     device_ptr &operator=(device_ptr &&other) noexcept;
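Putting the wrapper together with get_command_queues() from the utility header, a hypothetical usage sketch:

```cpp
#include "plssvm/backends/OpenCL/detail/device_ptr.hpp"
#include "plssvm/backends/OpenCL/detail/utility.hpp"  // detail::get_command_queues
#include "plssvm/target_platforms.hpp"                // plssvm::target_platform

#include <utility>  // std::move
#include <vector>   // std::vector

void example() {
    namespace det = plssvm::opencl::detail;
    std::vector<det::command_queue> queues = det::get_command_queues(plssvm::target_platform::automatic);

    det::device_ptr<float> d{ 1024, queues.front() };  // allocate 1024 floats on the first device
    std::vector<float> host(d.size(), 1.0f);
    d.memcpy_to_device(host);  // copy device_ptr::size() elements host -> device
    d.memcpy_to_host(host);    // and back again

    // ownership can be transferred but never copied
    det::device_ptr<float> d2{ std::move(d) };
}
```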
@@ -95,26 +81,27 @@

     /**
      * @brief Swap the contents of `*this` with the contents of @p other.
-     * @param[in,out] other the other `device_ptr`
+     * @param[in,out] other the other device_ptr
      */
     void swap(device_ptr &other) noexcept;
     /**
      * @brief Swap the contents of @p lhs and @p rhs.
-     * @param[in,out] lhs a `device_ptr`
-     * @param[in,out] rhs a `device_ptr`
+     * @param[in,out] lhs a device_ptr
+     * @param[in,out] rhs a device_ptr
      */
     friend void swap(device_ptr &lhs, device_ptr &rhs) noexcept { lhs.swap(rhs); }

     /**
-     * @brief Checks whether `*this` currently wraps a OpenCL device pointer.
-     * @return `true` if `*this` wraps a device pointer, `false` otherwise (`[[nodiscard]]`)
+     * @brief Checks whether `*this` currently wraps an OpenCL device pointer.
+     * @details Same as `device_ptr::get() != nullptr`.
+     * @return `true` if `*this` wraps an OpenCL device pointer, `false` otherwise (`[[nodiscard]]`)
      */
     [[nodiscard]] explicit operator bool() const noexcept {
         return data_ != nullptr;
     }
     /**
      * @brief Access the underlying OpenCL device pointer.
-     * @return the device pointer (`[[nodiscard]]`)
+     * @return the device_ptr (`[[nodiscard]]`)
      */
     [[nodiscard]] cl_mem &get() noexcept {
         return data_;
@@ -126,14 +113,14 @@
         return data_;
     }
     /**
-     * @brief Get the number of elements in the wrapped OpenCL device pointer.
+     * @brief Get the number of elements in the device_ptr.
      * @return the size (`[[nodiscard]]`)
      */
     [[nodiscard]] size_type size() const noexcept {
         return size_;
     }
     /**
-     * @brief Check whether no elements are currently associated to the OpenCL device pointer.
+     * @brief Check whether the device_ptr currently maps zero elements.
      * @return `true` if no elements are wrapped, `false` otherwise (`[[nodiscard]]`)
      */
     [[nodiscard]] bool empty() const noexcept {
@@ -147,7 +134,7 @@
         return *queue_;
     }
     /**
-     * @copydoc queue()
+     * @copydoc device_ptr::queue()
      */
     [[nodiscard]] const command_queue &queue() const noexcept {
         return *queue_;
@@ -156,8 +143,8 @@
     /**
      * @brief Memset all values to @p value starting at position @p pos.
      * @param[in] value the memset value
-     * @param[in] pos the position to start the memset (default: `0`)
-     * @throws plssvm::opencl::backend_exception if @p pos is greater or equal than `device_ptr::size()`
+     * @param[in] pos the position to start the memset
+     * @throws plssvm::opencl::backend_exception if @p pos is greater or equal than device_ptr::size()
      */
     void memset(value_type value, size_type pos = 0);
     /**
@@ -166,12 +153,12 @@
      * @param[in] value the memset value
      * @param[in] pos the position to start the memset
      * @param[in] count the number of values to set
-     * @throws plssvm::opencl::backend_exception if @p pos is greater or equal than `device_ptr::size()`
+     * @throws plssvm::opencl::backend_exception if @p pos is greater or equal than device_ptr::size()
      */
     void memset(value_type value, size_type pos, size_type count);
     /**
-     * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device.
+     * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device.
      * @param[in] data_to_copy the data to copy onto the device
      * @throws plssvm::opencl::backend_exception if @p data_to_copy is too small to satisfy the memcpy
      */
     void memcpy_to_device(const std::vector<value_type> &data_to_copy);
     /**
      * @brief Memcpy up-to @p count many values from @p data_to_copy to the device starting at OpenCL device pointer position @p pos.
      * @details Copies `[p, rcount)` values where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`.
      * @param[in] data_to_copy the data to copy onto the device
-     * @param[in] pos the starting position for the copying in the CUDA device pointer
+     * @param[in] pos the starting position for the copying in the device_ptr
      * @param[in] count the number of elements to copy
      * @throws plssvm::opencl::backend_exception if @p data_to_copy is too small to satisfy the memcpy
      */
     void memcpy_to_device(const std::vector<value_type> &data_to_copy, size_type pos, size_type count);
     /**
-     * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device.
+     * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device.
      * @param[in] data_to_copy the data to copy onto the device
      */
     void memcpy_to_device(const_pointer data_to_copy);
@@ -200,38 +187,38 @@
     void memcpy_to_device(const_pointer data_to_copy, size_type pos, size_type count);

     /**
-     * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer.
-     * @param[in] buffer the buffer to copy the data to
+     * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer.
+     * @param[out] buffer the buffer to copy the data to
      * @throws plssvm::opencl::backend_exception if @p buffer is too small
      */
     void memcpy_to_host(std::vector<value_type> &buffer) const;
     /**
      * @brief Memcpy up-to @p count many values from the device starting at OpenCL device pointer position @p pos to the host buffer @p buffer.
      * @details Copies `[p, rcount)` values where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`.
-     * @param[in] buffer the buffer to copy the data to
+     * @param[out] buffer the buffer to copy the data to
      * @param[in] pos the starting position for the copying in the OpenCL device pointer
      * @param[in] count the number of elements to copy
      * @throws plssvm::opencl::backend_exception if @p data_to_copy is too small
      */
     void memcpy_to_host(std::vector<value_type> &buffer, size_type pos, size_type count) const;
     /**
-     * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer.
-     * @param[in] buffer the buffer to copy the data to
+     * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer.
+     * @param[out] buffer the buffer to copy the data to
      */
     void memcpy_to_host(pointer buffer) const;
     /**
      * @brief Memcpy up-to @p count many values from the device starting at OpenCL device pointer position @p pos to the host buffer @p buffer.
      * @details Copies `[p, rcount)` values where `rcount` is the smaller value of @p count and `device_ptr::size() - pos`.
-     * @param[in] buffer the buffer to copy the data to
+     * @param[out] buffer the buffer to copy the data to
      * @param[in] pos the starting position for the copying in the OpenCL device pointer
      * @param[in] count the number of elements to copy
      */
     void memcpy_to_host(pointer buffer, size_type pos, size_type count) const;

   private:
-    command_queue *queue_ = nullptr;
-    cl_mem data_ = nullptr;
-    size_type size_ = 0;
+    command_queue *queue_{ nullptr };
+    cl_mem data_{ nullptr };
+    size_type size_{ 0 };
 };

 extern template class device_ptr<float>;
diff --git a/include/plssvm/backends/OpenCL/detail/error_code.hpp b/include/plssvm/backends/OpenCL/detail/error_code.hpp
index 0b509144a..0e7002d37 100644
--- a/include/plssvm/backends/OpenCL/detail/error_code.hpp
+++ b/include/plssvm/backends/OpenCL/detail/error_code.hpp
@@ -11,10 +11,9 @@

 #pragma once

-#include "CL/cl.h"        // cl_int, CL_SUCCESS
-#include "fmt/ostream.h"  // use operator<< to enable fmt::format with custom type
+#include "CL/cl.h"  // cl_int, CL_SUCCESS

-#include <ostream>      // std::ostream (forward declaration only)
+#include <iosfwd>       // forward declare std::ostream
 #include <string_view>  // std::string_view

 namespace plssvm::opencl::detail {
@@ -30,22 +29,22 @@
      */
     error_code() = default;
     /**
-     * @brief Construct a new error code wrapping the OpenCL error code @p err.
-     * @param[in] err the OpenCL error code
+     * @brief Construct a new error code wrapping the OpenCL @p error code.
+     * @param[in] error the OpenCL error code
      */
-    error_code(cl_int err) noexcept;
+    error_code(cl_int error) noexcept;
     /**
-     * @brief Assign an OpenCL error code to this.
-     * @param[in] err the OpenCL error code
+     * @brief Assign the OpenCL @p error code to `*this`.
+     * @param[in] error the OpenCL error code
      * @return `*this`
      */
-    error_code &operator=(cl_int err) noexcept;
+    error_code &operator=(cl_int error) noexcept;
     /**
-     * @brief Assign an OpenCL error code to this.
-     * @param[in] err the OpenCL error code
+     * @brief Assign the OpenCL @p error code to `*this`.
+     * @param[in] error the OpenCL error code
      */
-    void assign(cl_int err) noexcept;
+    void assign(cl_int error) noexcept;

     /**
      * @brief Sets the error code value back to `CL_SUCCESS`.
      */
@@ -59,32 +58,33 @@
     [[nodiscard]] cl_int value() const noexcept;
     /**
      * @brief Obtains the explanatory string of the error code.
-     * @return the string representation of the error code
+     * @return the string representation of the error code (`[[nodiscard]]`)
      */
     [[nodiscard]] std::string_view message() const noexcept;
     /**
      * @brief Checks whether the error code indicates success or not.
-     * @return `true` if the error code is `CL_SUCCESS`, otherwise `false`
+     * @return `true` if the error code is `CL_SUCCESS`, otherwise `false` (`[[nodiscard]]`)
      */
     [[nodiscard]] explicit operator bool() const noexcept;
     /**
      * @brief Overloads the addressof operator to be able to set the wrapped error code value using an out-parameter
      *        in calls to OpenCL functions.
-     * @return pointer to the wrapped OpenCL error code
+     * @return pointer to the wrapped OpenCL error code (`[[nodiscard]]`)
      */
     [[nodiscard]] cl_int *operator&() noexcept;

   private:
-    cl_int err_ = CL_SUCCESS;
+    // The wrapped OpenCL error code.
+    cl_int err_{ CL_SUCCESS };
 };

 /**
- * @brief Overload stream-insertion operator to print the error code and its message to the output.
+ * @brief Output the error code encapsulated by @p ec to the given output-stream @p out.
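The overloaded addressof operator lets an error_code be passed wherever OpenCL expects a `cl_int` out-parameter; a usage sketch against the standard clCreateBuffer API (the wrapping function is an illustrative assumption):

```cpp
#include "plssvm/backends/OpenCL/detail/error_code.hpp"

#include "CL/cl.h"  // clCreateBuffer, cl_context, cl_mem, CL_MEM_READ_WRITE

#include <cstddef>   // std::size_t
#include <iostream>  // std::cerr

cl_mem create_buffer(cl_context context, const std::size_t size_in_bytes) {
    plssvm::opencl::detail::error_code err;
    // &err yields a cl_int* thanks to the overloaded operator&
    cl_mem buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size_in_bytes, nullptr, &err);
    if (!err) {  // operator bool: true only on CL_SUCCESS
        std::cerr << err << '\n';  // e.g. "-61: CL_INVALID_BUFFER_SIZE"
        return nullptr;
    }
    return buffer;
}
```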
* @details Example output of an error code: * @code - * "-1; CL_DEVICE_NOT_FOUND" + * "-1: CL_DEVICE_NOT_FOUND" * @endcode - * @param[in,out] out the output-stream to write the kernel type to + * @param[in,out] out the output-stream to write the error code to * @param[in] ec the error code * @return the output-stream */ @@ -93,14 +93,14 @@ std::ostream &operator<<(std::ostream &out, error_code ec); * @brief Compares two error codes for equality. * @param[in] lhs the first error code * @param[in] rhs the second error code - * @return `true` if both error codes are equal, `false` otherwise + * @return `true` if both error codes are equal, `false` otherwise (`[[nodiscard]]`) */ [[nodiscard]] bool operator==(error_code lhs, error_code rhs) noexcept; /** * @brief Compares two error codes for inequality. * @param[in] lhs the first error code * @param[in] rhs the second error code - * @return `true` if both error codes are unequal, `false` otherwise + * @return `true` if both error codes are unequal, `false` otherwise (`[[nodiscard]]`) */ [[nodiscard]] bool operator!=(error_code lhs, error_code rhs) noexcept; diff --git a/include/plssvm/backends/OpenCL/detail/kernel.hpp b/include/plssvm/backends/OpenCL/detail/kernel.hpp index 47ccf1752..350633994 100644 --- a/include/plssvm/backends/OpenCL/detail/kernel.hpp +++ b/include/plssvm/backends/OpenCL/detail/kernel.hpp @@ -24,28 +24,43 @@ class kernel { * @brief Construct a new wrapper around the provided @p compute_kernel. * @param[in] compute_kernel the cl_kernel to wrap */ - explicit kernel(cl_kernel compute_kernel) noexcept : - compute_kernel{ compute_kernel } {} + explicit kernel(cl_kernel compute_kernel) noexcept; + + /** + * @brief Delete copy-constructor to make #kernel a move only type. + */ + kernel(const kernel &) = delete; + /** + * @brief Move-constructor as #kernel is a move-only type. + * @param[in,out] other the kernel to move the resources from + */ + kernel(kernel &&other) noexcept; + /** + * @brief Delete copy-assignment-operator to make #kernel a move only type. + */ + kernel &operator=(const kernel &) = delete; + /** + * @brief Move-assignment-operator as #kernel is a move-only type. + * @param[in,out] other the kernel to move the resources from + * @return `*this` + */ + kernel &operator=(kernel &&other); /** * @brief Release the cl_kernel resources on destruction. */ - ~kernel() { - if (compute_kernel) { - clReleaseKernel(compute_kernel); - } - } + ~kernel(); /** * @brief Implicitly convert a kernel wrapper to an OpenCL cl_kernel. - * @return the wrapped OpenCL cl_kernel + * @return the wrapped OpenCL cl_kernel (`[[nodiscard]]`) */ - operator cl_kernel &() noexcept { return compute_kernel; } + [[nodiscard]] operator cl_kernel &() noexcept { return compute_kernel; } /** * @brief Implicitly convert a kernel wrapper to an OpenCL cl_kernel. - * @return the wrapped OpenCL cl_kernel + * @return the wrapped OpenCL cl_kernel (`[[nodiscard]]`) */ - operator const cl_kernel &() const noexcept { return compute_kernel; } + [[nodiscard]] operator const cl_kernel &() const noexcept { return compute_kernel; } /// The wrapped OpenCL cl_kernel. 
cl_kernel compute_kernel; diff --git a/include/plssvm/backends/OpenCL/detail/utility.hpp b/include/plssvm/backends/OpenCL/detail/utility.hpp index b32326b06..fad3153dc 100644 --- a/include/plssvm/backends/OpenCL/detail/utility.hpp +++ b/include/plssvm/backends/OpenCL/detail/utility.hpp @@ -14,146 +14,81 @@ #include "plssvm/backends/OpenCL/detail/command_queue.hpp" // plssvm::opencl::detail::command_queue #include "plssvm/backends/OpenCL/detail/error_code.hpp" // plssvm::opencl::detail::error_code #include "plssvm/backends/OpenCL/detail/kernel.hpp" // plssvm::opencl::detail::kernel -#include "plssvm/backends/OpenCL/exceptions.hpp" // plssvm::opencl::backend_exception -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE -#include "plssvm/detail/arithmetic_type_name.hpp" // plssvm::detail::arithmetic_type_name #include "plssvm/detail/assert.hpp" // PLSSVM_ASSERT -#include "plssvm/detail/string_utility.hpp" // plssvm::detail::replace_all -#include "plssvm/exceptions/exceptions.hpp" // plssvm::unsupported_kernel_type_exception +#include "plssvm/kernel_types.hpp" // plssvm::kernel_type +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "CL/cl.h" // cl_kernel, cl_uint, cl_int -#include "CL/cl.h" // cl_device_id, cl_program, cl_kernel, cl_uint, cl_int, CL_QUEUE_DEVICE, CL_DEVICE_NAME, CL_PROGRAM_BUILD_LOG - // clGetCommandQueueInfo, clGetDeviceInfo, clCreateProgramWithSource, clBuildProgram, clGetProgramBuildInfo, - // clCreateKernel, clReleaseProgram, clSetKernelArg, clEnqueueNDRangeKernel, clFinish #include "fmt/core.h" // fmt::format -#include <cstddef> // std::size_t -#include <fstream> // std::ifstream -#include <ios> // std::ios, std::streamsize -#include <string> // std::string -#include <utility> // std::forward, std::pair, std::make_pair -#include <vector> // std::vector +#include <cstddef> // std::size_t +#include <string> // std::string +#include <string_view> // std::string_view +#include <utility> // std::forward, std::pair +#include <vector> // std::vector + +/** + * @def PLSSVM_OPENCL_ERROR_CHECK + * @brief Macro used for error checking OpenCL runtime functions. + */ +#define PLSSVM_OPENCL_ERROR_CHECK(err, ...) plssvm::opencl::detail::device_assert((err), ##__VA_ARGS__) namespace plssvm::opencl::detail { +/** + * @brief Check the OpenCL error @p code. If @p code signals an error, throw a `plssvm::opencl::backend_exception`. + * @details The exception contains the error name and additional debug information. + * @param[in] code the OpenCL error code to check + * @param[in] msg optional message printed if the error code check failed + * @throws `plssvm::opencl::backend_exception` if the error code signals a failure + */ +void device_assert(error_code code, std::string_view msg = ""); + +/** + * @brief Returns the list of devices matching the target platform @p target. + * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order: + * 1. NVIDIA GPUs + * 2. AMD GPUs + * 3. Intel GPUs + * 4. CPUs + * + * @param[in] target the target platform for which the devices must match + * @return the command queues (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector<command_queue> get_command_queues(target_platform target); + +/** + * @brief Wait for the compute device associated with @p queue to finish. + * @param[in] queue the command queue to synchronize + */ +void device_synchronize(const command_queue &queue); + /** * @brief Get the name of the device associated with the OpenCL command queue @p queue.
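+ * @details A minimal usage sketch (hypothetical; assumes at least one matching device was found):
+ * @code
+ * const std::vector<command_queue> queues = get_command_queues(target_platform::automatic);
+ * std::cout << get_device_name(queues.front()) << std::endl;
+ * @endcode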
* @param[in] queue the OpenCL command queue - * @return the device name + * @return the device name (`[[nodiscard]]`) */ -std::string get_device_name(const command_queue &queue) { - error_code err; - // get device - cl_device_id device_id; - err = clGetCommandQueueInfo(queue.queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, nullptr); - if (!err) { - throw backend_exception{ fmt::format("Error obtaining device ({})!", err) }; - } - // get device name - std::string device_name(128, '\0'); - err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, device_name.size() * sizeof(char), device_name.data(), nullptr); - if (!err) { - throw backend_exception{ fmt::format("Error obtaining device name ({})!", err) }; - } - return device_name.substr(0, device_name.find_first_of('\0')); -} +[[nodiscard]] std::string get_device_name(const command_queue &queue); /** * @brief Convert the kernel type @p kernel to the function names for the q and svm kernel functions. * @param[in] kernel the kernel type - * @return the kernel function names (first: q_kernel name, second: svm_kernel name) + * @return the kernel function names (first: q_kernel name, second: svm_kernel name) (`[[nodiscard]]`) */ -[[nodiscard]] std::pair<std::string, std::string> kernel_type_to_function_name(const kernel_type kernel) { - switch (kernel) { - case kernel_type::linear: - return std::make_pair("device_kernel_q_linear", "device_kernel_linear"); - case kernel_type::polynomial: - return std::make_pair("device_kernel_q_poly", "device_kernel_poly"); - case kernel_type::rbf: - return std::make_pair("device_kernel_q_radial", "device_kernel_radial"); - } - throw unsupported_kernel_type_exception{ fmt::format("Unknown kernel type (value: {})!", ::plssvm::detail::to_underlying(kernel)) }; -} +[[nodiscard]] std::pair<std::string, std::string> kernel_type_to_function_name(kernel_type kernel); /** * @brief Create a kernel with @p kernel_name for the given command queues from the file @p file.
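+ * @details The file's `real_type`, `INTERNAL_BLOCK_SIZE`, and `THREAD_BLOCK_SIZE` placeholders are replaced with their concrete values before the OpenCL program is built. A usage sketch with a hypothetical kernel file path:
+ * @code
+ * const auto [q_name, svm_name] = kernel_type_to_function_name(kernel_type::linear);
+ * std::vector<kernel> kernels = create_kernel<double>(queues, "q_kernel.cl", q_name);
+ * @endcode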
* @tparam real_type the floating point type used to replace the placeholders in the kernel file - * @tparam size_type the unsigned integer type used to replace the placeholders in the kernel file * @param[in] queues the used OpenCL command queues * @param[in] file the file containing the kernel * @param[in] kernel_name the name of the kernel to create - * @return the kernel + * @throws plssvm::invalid_file_format_exception if the file couldn't be read using [`std::ifstream::read`](https://en.cppreference.com/w/cpp/io/basic_istream/read) + * @return the kernel (`[[nodiscard]]`) */ -template <typename real_type, typename size_type> -std::vector<kernel> create_kernel(const std::vector<command_queue> &queues, const std::string &file, const std::string &kernel_name) { - // read kernel - std::string kernel_src_string; - { - std::ifstream in{ file }; - if (in.fail()) { - throw backend_exception{ fmt::format("Couldn't open kernel source file: {}!", file) }; - } - in.seekg(0, std::ios::end); - std::streamsize len = in.tellg(); - in.seekg(0, std::ios::beg); - - kernel_src_string.resize(len); - in.read(kernel_src_string.data(), len); - } - - // replace type - ::plssvm::detail::replace_all(kernel_src_string, "real_type", ::plssvm::detail::arithmetic_type_name<real_type>()); - ::plssvm::detail::replace_all(kernel_src_string, "size_type", ::plssvm::detail::arithmetic_type_name<size_type>()); - // replace constants - ::plssvm::detail::replace_all(kernel_src_string, "INTERNAL_BLOCK_SIZE", fmt::format("{}", INTERNAL_BLOCK_SIZE)); - ::plssvm::detail::replace_all(kernel_src_string, "THREAD_BLOCK_SIZE", fmt::format("{}", THREAD_BLOCK_SIZE)); - - error_code err; - - // create program - const char *kernel_src_ptr = kernel_src_string.c_str(); - // TODO: not all command queue must have the same context (but this would be highly unlikely) - cl_program program = clCreateProgramWithSource(queues[0].context, 1, &kernel_src_ptr, nullptr, &err); - if (!err) { - throw backend_exception{ fmt::format("Error creating OpenCL program ({})!", err) }; - } - // TODO: add optimization flags? - err = clBuildProgram(program, 0, nullptr, nullptr, nullptr, nullptr); - if (!err) { - // TODO: c++-ify - // Determine the size of the log - size_t log_size; - clGetProgramBuildInfo(program, queues[0].device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - - // Allocate memory for the log - char *log = (char *) malloc(log_size); - - // Get the log - clGetProgramBuildInfo(program, queues[0].device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - - // Print the log - throw backend_exception{ fmt::format("Error building OpenCL program ({})!: {}\n", err, log) }; - } - - // build kernels - std::vector<kernel> kernels; - for ([[maybe_unused]] const command_queue &q : queues) { - // create kernel - kernels.emplace_back(clCreateKernel(program, kernel_name.c_str(), &err)); - if (!err) { - throw backend_exception{ fmt::format("Error creating OpenCL kernel ({})!", err) }; - } - } - - // release resource - if (program) { - err = clReleaseProgram(program); - if (!err) { - throw backend_exception{ fmt::format("Error releasing OpenCL program resources ({})!", err) }; - } - } - - return kernels; -} +template <typename real_type> +[[nodiscard]] std::vector<kernel> create_kernel(const std::vector<command_queue> &queues, const std::string &file, const std::string &kernel_name); /** * @brief Set all arguments in the parameter pack @p args for the kernel @p kernel. @@ -162,20 +97,18 @@ std::vector<kernel> create_kernel(const std::vector<command_queue> &queu * @param[in] args the arguments to set */ template <typename... Args> -void set_kernel_args(cl_kernel kernel, Args... args) { +inline void set_kernel_args(cl_kernel kernel, Args...
args) { cl_uint i = 0; // iterate over parameter pack and set OpenCL kernel ([&](auto &arg) { - error_code err = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); - if (!err) { - throw backend_exception{ fmt::format("Error setting OpenCL kernel argument {} ({})!", i - 1, err) }; - } + const error_code ec = clSetKernelArg(kernel, i++, sizeof(decltype(arg)), &arg); + PLSSVM_OPENCL_ERROR_CHECK(ec, fmt::format("error setting OpenCL kernel argument {}", i - 1)); }(args), ...); } /** - * @brief + * @brief Run the @p kernel (with up to three dimensions) on the @p queue with the additional parameters @p args. * @tparam Args the types of the arguments * @param[in,out] queue the command queue on which the kernel should be executed * @param[in,out] kernel the kernel to run @@ -184,7 +117,7 @@ void set_kernel_args(cl_kernel kernel, Args... args) { * @param[in] args the arguments to set */ template <typename... Args> -void run_kernel(const command_queue &queue, cl_kernel kernel, const std::vector<std::size_t> &grid_size, const std::vector<std::size_t> &block_size, Args &&...args) { +inline void run_kernel(const command_queue &queue, cl_kernel kernel, const std::vector<std::size_t> &grid_size, const std::vector<std::size_t> &block_size, Args &&...args) { PLSSVM_ASSERT(grid_size.size() == block_size.size(), "grid_size and block_size must have the same number of dimensions!: {} != {}", grid_size.size(), block_size.size()); PLSSVM_ASSERT(grid_size.size() <= 3, "The number of dimensions must be less or equal than 3!: {} > 3", grid_size.size()); @@ -192,20 +125,13 @@ void run_kernel(const command_queue &queue, cl_kernel kernel, const std::vector< set_kernel_args(kernel, std::forward<Args>(args)...); // enqueue kernel in command queue - error_code err = clEnqueueNDRangeKernel(queue.queue, kernel, static_cast<cl_uint>(grid_size.size()), nullptr, grid_size.data(), block_size.data(), 0, nullptr, nullptr); - if (!err) { - throw backend_exception{ fmt::format("Error enqueuing OpenCL kernel ({})!", err) }; - } - + PLSSVM_OPENCL_ERROR_CHECK(clEnqueueNDRangeKernel(queue.queue, kernel, static_cast<cl_uint>(grid_size.size()), nullptr, grid_size.data(), block_size.data(), 0, nullptr, nullptr), "error enqueuing OpenCL kernel"); // wait until kernel computation finished - err = clFinish(queue.queue); - if (!err) { - throw backend_exception{ fmt::format("Error running OpenCL kernel ({})!", err) }; - } + PLSSVM_OPENCL_ERROR_CHECK(clFinish(queue.queue), "error running OpenCL kernel"); } /** - * @brief + * @brief Run the 1D @p kernel on the @p queue with the additional parameters @p args.
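+ * @details A usage sketch for this one-dimensional overload (all values hypothetical; the trailing arguments must match the kernel's parameter list):
+ * @code
+ * // launch 1024 work-items in work-groups of 128 and forward two kernel arguments
+ * run_kernel(queue, kernel, 1024, 128, q_buffer, static_cast<cl_uint>(num_features));
+ * @endcode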
* @tparam Args the types of the arguments * @param[in,out] queue the command queue on which the kernel should be executed * @param[in,out] kernel the kernel to run @@ -214,7 +140,7 @@ void run_kernel(const command_queue &queue, cl_kernel kernel, const std::vector< * @param[in] args the arguments to set */ template <typename... Args> -void run_kernel(const command_queue &queue, cl_kernel kernel, std::size_t grid_size, std::size_t block_size, Args &&...args) { +inline void run_kernel(const command_queue &queue, cl_kernel kernel, std::size_t grid_size, std::size_t block_size, Args &&...args) { run_kernel(queue, kernel, std::vector<std::size_t>{ grid_size }, std::vector<std::size_t>{ block_size }, std::forward<Args>(args)...); } diff --git a/include/plssvm/backends/OpenCL/exceptions.hpp b/include/plssvm/backends/OpenCL/exceptions.hpp index ff0b48c30..f6bbb5ab8 100644 --- a/include/plssvm/backends/OpenCL/exceptions.hpp +++ b/include/plssvm/backends/OpenCL/exceptions.hpp @@ -24,12 +24,11 @@ namespace plssvm::opencl { class backend_exception : public exception { public: /** - * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`. + * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception. * @param[in] msg the exception's `what()` message * @param[in] loc the exception's call site information */ - explicit backend_exception(const std::string &msg, source_location loc = source_location::current()) : - ::plssvm::exception{ msg, "opencl::backend_exception", loc } {} + explicit backend_exception(const std::string &msg, source_location loc = source_location::current()); }; -}; // namespace plssvm::opencl \ No newline at end of file +} // namespace plssvm::opencl \ No newline at end of file diff --git a/include/plssvm/backends/OpenCL/predict_kernel.cl b/include/plssvm/backends/OpenCL/predict_kernel.cl new file mode 100644 index 000000000..c9e6be852 --- /dev/null +++ b/include/plssvm/backends/OpenCL/predict_kernel.cl @@ -0,0 +1,103 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the functions used for prediction for the C-SVM using the OpenCL backend. + */ + +//#include "detail/atomics.cl" // atomicAdd -> included via string concatenation when building the device kernels + +/** + * @brief Calculate the `w` vector to speed up the prediction of the labels for data points using the linear kernel function. + * @details Supports multi-GPU execution.
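+ * @note For the linear kernel the decision function can be precomputed as \f$ \vec{w} = \sum_i \alpha_i \cdot \vec{x}_i \f$, so that predicting a new point \f$ \vec{p} \f$ only needs a single dot product \f$ \vec{w}^T \vec{p} + b \f$ instead of a sum over all support vectors.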
+ * @tparam real_type the type of the data + * @param[out] w_d the `w` vector to assemble + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] num_features the number of features per support vector + */ +__kernel void device_kernel_w_linear(__global real_type *w_d, __global real_type *data_d, __global real_type *data_last_d, __global real_type *alpha_d, const kernel_index_type num_data_points, const kernel_index_type num_features) { + const kernel_index_type index = get_global_id(0); + real_type temp = 0.0; + if (index < num_features) { + for (kernel_index_type dat = 0; dat < num_data_points - 1; ++dat) { + temp += alpha_d[dat] * data_d[dat + (num_data_points - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * index]; + } + temp += alpha_d[num_data_points - 1] * data_last_d[index]; + w_d[index] = temp; + } +} + +/** + * @brief Predicts the labels for data points using the polynomial kernel function. + * @details Currently only single GPU execution is supported. + * @tparam real_type the type of the data + * @param[out] out_d the calculated predictions + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] points the data points to predict + * @param[in] num_predict_points the total number of data points to predict + * @param[in] num_features the number of features per support vector and point to predict + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ +__kernel void device_kernel_predict_poly(__global real_type *out_d, __global const real_type *data_d, __global const real_type *data_last_d, __global const real_type *alpha_d, const kernel_index_type num_data_points, __global const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const int degree, const real_type gamma, const real_type coef0) { + const kernel_index_type data_point_index = get_global_id(0); + const kernel_index_type predict_point_index = get_global_id(1); + + real_type temp = 0.0; + if (predict_point_index < num_predict_points) { + for (kernel_index_type feature_index = 0; feature_index < num_features; ++feature_index) { + if (data_point_index == num_data_points) { + temp += data_last_d[feature_index] * points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; + } else { + temp += data_d[data_point_index + (num_data_points - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] * points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; + } + } + + temp = alpha_d[data_point_index] * pow(gamma * temp + coef0, degree); + atomicAdd(&out_d[predict_point_index], temp); + } +} + +/** + * @brief Predicts the labels for data points using the radial basis function kernel function. + * @details Currently only single GPU execution is supported.
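+ * @note Per predicted point \f$ \vec{p} \f$ the kernel accumulates \f$ \sum_i \alpha_i \cdot \exp(-\gamma \cdot \lVert \vec{x}_i - \vec{p} \rVert^2) \f$, where every work-item commits its contribution via atomicAdd.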
+ * @tparam real_type the type of the data + * @param[out] out_d the calculated predictions + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] points the data points to predict + * @param[in] num_predict_points the total number of data points to predict + * @param[in] num_features the number of features per support vector and point to predict + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ +__kernel void device_kernel_predict_radial(__global real_type *out_d, __global const real_type *data_d, __global const real_type *data_last_d, __global const real_type *alpha_d, const kernel_index_type num_data_points, __global const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const real_type gamma) { + const kernel_index_type data_point_index = get_global_id(0); + const kernel_index_type predict_point_index = get_global_id(1); + + real_type temp = 0.0; + if (predict_point_index < num_predict_points) { + for (kernel_index_type feature_index = 0; feature_index < num_features; ++feature_index) { + if (data_point_index == num_data_points) { + temp += (data_last_d[feature_index] - points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_last_d[feature_index] - points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); + } else { + temp += (data_d[data_point_index + (num_data_points - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_d[data_point_index + (num_data_points - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points[predict_point_index + (num_predict_points + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); + } + } + + temp = alpha_d[data_point_index] * exp(-gamma * temp); + atomicAdd(&out_d[predict_point_index], temp); + } +} \ No newline at end of file diff --git a/include/plssvm/backends/OpenCL/q_kernel.cl b/include/plssvm/backends/OpenCL/q_kernel.cl new file mode 100644 index 000000000..af9524495 --- /dev/null +++ b/include/plssvm/backends/OpenCL/q_kernel.cl @@ -0,0 +1,71 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines OpenCL functions for generating the `q` vector. + */ + +/** + * @brief Calculates the `q` vector using the linear C-SVM kernel. + * @details Supports multi-GPU execution.
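+ * @note Each entry is the inner product of one data point with the last data point, i.e. \f$ q_i = \langle \vec{x}_i, \vec{x}_{last} \rangle \f$.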
+ * @tparam real_type the type of the data + * @param[out] q the calculated `q` vector + * @param[in] data_d the one-dimensional data matrix + * @param[in] data_last the last row in the data matrix + * @param[in] num_rows the number of rows in the data matrix + * @param[in] feature_range number of features used for the calculation + */ +__kernel void device_kernel_q_linear(__global real_type *q, __global real_type *data_d, __global real_type *data_last, const kernel_index_type num_rows, const kernel_index_type feature_range) { + const kernel_index_type index = get_global_id(0); + real_type temp = 0.0; + for (kernel_index_type i = 0; i < feature_range; ++i) { + temp += data_d[i * num_rows + index] * data_last[i]; + } + q[index] = temp; +} + +/** + * @brief Calculates the `q` vector using the polynomial C-SVM kernel. + * @details Currently only single GPU execution is supported. + * @tparam real_type the type of the data + * @param[out] q the calculated `q` vector + * @param[in] data_d the one-dimensional data matrix + * @param[in] data_last the last row in the data matrix + * @param[in] num_rows the number of rows in the data matrix + * @param[in] num_cols the number of columns in the data matrix + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ +__kernel void device_kernel_q_poly(__global real_type *q, __global real_type *data_d, __global real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const int degree, const real_type gamma, const real_type coef0) { + const kernel_index_type index = get_global_id(0); + real_type temp = 0.0; + for (int i = 0; i < num_cols; ++i) { + temp += data_d[i * num_rows + index] * data_last[i]; + } + q[index] = pow(gamma * temp + coef0, degree); +} + +/** + * @brief Calculates the `q` vector using the radial basis functions C-SVM kernel. + * @details Currently only single GPU execution is supported. 
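+ * @note In formula form, each entry is \f$ q_i = \exp(-\gamma \cdot \lVert \vec{x}_i - \vec{x}_{last} \rVert^2) \f$.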
+ * @tparam real_type the type of the data + * @param[out] q the calculated `q` vector + * @param[in] data_d the one-dimensional data matrix + * @param[in] data_last the last row in the data matrix + * @param[in] num_rows the number of rows in the data matrix + * @param[in] num_cols the number of columns in the data matrix + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ +__kernel void device_kernel_q_radial(__global real_type *q, __global real_type *data_d, __global real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type gamma) { + const kernel_index_type index = get_global_id(0); + real_type temp = 0.0; + for (kernel_index_type i = 0; i < num_cols; ++i) { + temp += (data_d[i * num_rows + index] - data_last[i]) * (data_d[i * num_rows + index] - data_last[i]); + } + q[index] = exp(-gamma * temp); +} \ No newline at end of file diff --git a/src/plssvm/backends/OpenCL/svm_kernel.cl b/include/plssvm/backends/OpenCL/svm_kernel.cl similarity index 51% rename from src/plssvm/backends/OpenCL/svm_kernel.cl rename to include/plssvm/backends/OpenCL/svm_kernel.cl index e2f7b010d..2f1579454 100644 --- a/src/plssvm/backends/OpenCL/svm_kernel.cl +++ b/include/plssvm/backends/OpenCL/svm_kernel.cl @@ -1,47 +1,34 @@ /** + * @file * @author Alexander Van Craen * @author Marcel Breyer * @copyright 2018-today The PLSSVM project - All Rights Reserved * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the kernel functions for the C-SVM using the OpenCL backend. */ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable -static inline void __attribute__((overloadable)) AtomicAdd(__global const double *source, const double delta) { - union { - double f; - ulong i; - } oldVal; - union { - double f; - ulong i; - } newVal; - do { - oldVal.f = *source; - newVal.f = oldVal.f + delta; - // ++i; - } while (atom_cmpxchg((volatile __global ulong *) source, oldVal.i, newVal.i) != oldVal.i); -} - -static inline void __attribute__((overloadable)) AtomicAdd(__global const float *source, const float delta) { - union { - float f; - unsigned i; - } oldVal; - union { - float f; - unsigned i; - } newVal; - do { - oldVal.f = *source; - newVal.f = oldVal.f + delta; - } while (atom_cmpxchg((volatile __global unsigned *) source, oldVal.i, newVal.i) != oldVal.i); -} +//#include "detail/atomics.cl" // atomicAdd -> included via string concatenation when building the device kernels -__kernel void device_kernel_linear(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const real_type add, const int first_feature, const int last_feature) { - size_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; - size_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; +/** + * @brief Calculates the C-SVM kernel using the linear kernel function. + * @details Supports multi-GPU execution.
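+ * @note A rough host-side launch sketch using the run_kernel helper from utility.hpp (all names and sizes hypothetical; the real driver derives the grid from the problem size, THREAD_BLOCK_SIZE, and INTERNAL_BLOCK_SIZE):
+ * @code
+ * const std::vector<std::size_t> block{ THREAD_BLOCK_SIZE, THREAD_BLOCK_SIZE };
+ * const std::vector<std::size_t> grid{ global_size, global_size };  // a multiple of THREAD_BLOCK_SIZE
+ * run_kernel(queue, kernel, grid, block, q_buf, ret_buf, d_buf, data_buf, QA_cost, cost, num_rows, feature_range, add, device_id);
+ * @endcode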
+ * @tparam real_type the type of the data + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimensional data matrix + * @param[in] QA_cost the bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] feature_range number of features used for the calculation on the device @p id + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] id the id of the current device + */ +__kernel void device_kernel_linear(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) { + kernel_index_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; __local real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; __local real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; @@ -52,15 +39,15 @@ __kernel void device_kernel_linear(__global const real_type *q, __global real_ty i += get_local_id(0) * INTERNAL_BLOCK_SIZE; j += get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = first_feature * num_rows; vec_index < last_feature * num_rows; vec_index += num_rows) { + for (kernel_index_type vec_index = 0; vec_index < feature_range * num_rows; vec_index += num_rows) { barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE; if (get_local_id(1) == idx) { data_intern_i[get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: TODO: load parallel + const kernel_index_type idx_2 = block_id % THREAD_BLOCK_SIZE; if (get_local_id(0) == idx_2) { data_intern_j[get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; } @@ -68,52 +55,69 @@ __kernel void device_kernel_linear(__global const real_type *q, __global real_ty barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j[get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i[get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += data_i * data_j[k]; } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { real_type temp; - if
(first_feature == 0) { + if (id == 0) { temp = (matr[x][y] + QA_cost - q[i + y] - q[j + x]) * add; } else { temp = matr[x][y] * add; } if (i + x > j + y) { // upper triangular matrix - AtomicAdd(&ret[i + y], temp * d[j + x]); + atomicAdd(&ret[i + y], temp * d[j + x]); ret_jx += temp * d[i + y]; } else if (i + x == j + y) { // diagonal - if (first_feature == 0) { + if (id == 0) { ret_jx += (temp + cost * add) * d[i + y]; } else { ret_jx += temp * d[i + y]; } } } - AtomicAdd(&ret[j + x], ret_jx); + atomicAdd(&ret[j + x], ret_jx); } } } -__kernel void device_kernel_poly(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const int num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) { - size_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; - size_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; +/** + * @brief Calculates the C-SVM kernel using the polynomial kernel function. + * @details Currently only single GPU execution is supported. + * @tparam real_type the type of the data + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimensional data matrix + * @param[in] QA_cost the bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ +__kernel void device_kernel_poly(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) { + kernel_index_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; __local real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; __local real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; @@ -124,15 +128,15 @@ __kernel void device_kernel_poly(__global const real_type *q, __global real_type i += get_local_id(0) * INTERNAL_BLOCK_SIZE; j += get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE; if (get_local_id(1) == idx) { data_intern_i[get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: TODO: load parallel + const
kernel_index_type idx_2 = block_id % THREAD_BLOCK_SIZE; if (get_local_id(0) == idx_2) { data_intern_j[get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; } @@ -140,43 +144,58 @@ __kernel void device_kernel_poly(__global const real_type *q, __global real_type barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j[get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i[get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += data_i * data_j[k]; } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { const real_type temp = (pow(gamma * matr[x][y] + coef0, degree) + QA_cost - q[i + y] - q[j + x]) * add; if (i + x > j + y) { // upper triangular matrix - AtomicAdd(&ret[i + y], temp * d[j + x]); + atomicAdd(&ret[i + y], temp * d[j + x]); ret_jx += temp * d[i + y]; } else if (i + x == j + y) { // diagonal ret_jx += (temp + cost * add) * d[i + y]; } } - AtomicAdd(&ret[j + x], ret_jx); + atomicAdd(&ret[j + x], ret_jx); } } } -__kernel void device_kernel_radial(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const int num_rows, const int num_cols, const real_type add, const real_type gamma) { - size_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; - size_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; +/** + * @brief Calculates the C-SVM kernel using the radial basis function kernel function. + * @details Currently only single GPU execution is supported. 
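+ * @note Each implicitly generated matrix entry evaluates to \f$ \exp(-\gamma \cdot \lVert \vec{x}_i - \vec{x}_j \rVert^2) + QA_{cost} - q_i - q_j \f$ (scaled by @p add); only the upper triangular part is computed explicitly, the mirrored values are committed via atomicAdd.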
+ * @tparam real_type the type of the data + * @param[in] q the `q` vector + * @param[out] ret the result vector + * @param[in] d the right-hand side of the equation + * @param[in] data_d the one-dimensional data matrix + * @param[in] QA_cost the bottom right matrix entry multiplied by cost + * @param[in] cost 1 / the cost parameter in the C-SVM + * @param[in] num_rows the number of columns in the data matrix + * @param[in] num_cols the number of rows in the data matrix + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ +__kernel void device_kernel_radial(__global const real_type *q, __global real_type *ret, __global const real_type *d, __global const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) { + kernel_index_type i = get_group_id(0) * get_local_size(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = get_group_id(1) * get_local_size(1) * INTERNAL_BLOCK_SIZE; __local real_type data_intern_i[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; __local real_type data_intern_j[THREAD_BLOCK_SIZE][INTERNAL_BLOCK_SIZE]; @@ -187,15 +206,15 @@ __kernel void device_kernel_radial(__global const real_type *q, __global real_ty i += get_local_id(0) * INTERNAL_BLOCK_SIZE; j += get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { + for (kernel_index_type vec_index = 0; vec_index < num_cols * num_rows; vec_index += num_rows) { barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const kernel_index_type idx = block_id % THREAD_BLOCK_SIZE; if (get_local_id(1) == idx) { data_intern_i[get_local_id(0)][block_id] = data_d[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: TODO: load parallel + const kernel_index_type idx_2 = block_id % THREAD_BLOCK_SIZE; if (get_local_id(0) == idx_2) { data_intern_j[get_local_id(1)][block_id] = data_d[block_id + vec_index + j]; } @@ -203,36 +222,36 @@ __kernel void device_kernel_radial(__global const real_type *q, __global real_ty barrier(CLK_LOCAL_MEM_FENCE); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j[get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i[get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { const real_type temp = (exp(-gamma *
matr[x][y]) + QA_cost - q[i + y] - q[j + x]) * add; if (i + x > j + y) { // upper triangular matrix - AtomicAdd(&ret[i + y], temp * d[j + x]); + atomicAdd(&ret[i + y], temp * d[j + x]); ret_jx += temp * d[i + y]; } else if (i + x == j + y) { // diagonal ret_jx += (temp + cost * add) * d[i + y]; } } - AtomicAdd(&ret[j + x], ret_jx); + atomicAdd(&ret[j + x], ret_jx); } } } \ No newline at end of file diff --git a/include/plssvm/backends/OpenMP/csvm.hpp b/include/plssvm/backends/OpenMP/csvm.hpp index ee7b1dd9b..a5263280e 100644 --- a/include/plssvm/backends/OpenMP/csvm.hpp +++ b/include/plssvm/backends/OpenMP/csvm.hpp @@ -11,17 +11,20 @@ #pragma once -#include "plssvm/csvm.hpp" // plssvm::csvm -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/target_platform.hpp" // plssvm::target_platform +#include "plssvm/csvm.hpp" // plssvm::csvm #include <vector> // std::vector -namespace plssvm::openmp { +namespace plssvm { + +// forward declare parameter class +template <typename T> +class parameter; + +namespace openmp { /** - * @brief The C-SVM class using the OpenMP backend. + * @brief A C-SVM implementation using OpenMP as backend. * @tparam T the type of the data */ template <typename T> @@ -48,48 +51,54 @@ class csvm : public ::plssvm::csvm<T> { public: /// The type of the data. Must be either `float` or `double`. using real_type = typename base_type::real_type; - /// Unsigned integer type. - using size_type = typename base_type::size_type; /** * @brief Construct a new C-SVM using the OpenMP backend with the parameters given through @p params. * @param[in] params struct encapsulating all possible parameters + * @throws plssvm::csvm::csvm() exceptions + * @throws plssvm::openmp::backend_exception if the target platform isn't plssvm::target_platform::automatic or plssvm::target_platform::cpu + * @throws plssvm::openmp::backend_exception if the plssvm::target_platform::cpu target isn't available */ explicit csvm(const parameter<T> &params); /** - * @brief Uses the already learned model to predict the class of multiple (new) data points. - * @param[in] points the data points to predict - * @return a `std::vector<real_type>` filled with negative values for each prediction for a data point with the negative class and positive values otherwise ([[nodiscard]]) + * @copydoc plssvm::csvm::predict(const std::vector<std::vector<real_type>>&) */ - [[nodiscard]] virtual std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) override; - - // TODO: write predict + [[nodiscard]] std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) override; protected: + /** + * @copydoc plssvm::csvm::setup_data_on_device + */ void setup_data_on_device() override { // OpenMP device is the CPU -> no special load functions } - std::vector<real_type> generate_q() override; - std::vector<real_type> solver_CG(const std::vector<real_type> &b, size_type imax, real_type eps, const std::vector<real_type> &q) override; + /** + * @copydoc plssvm::csvm::generate_q + */ + [[nodiscard]] std::vector<real_type> generate_q() override; + /** + * @copydoc plssvm::csvm::solver_CG + */ + std::vector<real_type> solver_CG(const std::vector<real_type> &b, std::size_t imax, real_type eps, const std::vector<real_type> &q) override; + /** + * @copydoc plssvm::csvm::update_w + */ + void update_w() override; /** * @brief Select the correct kernel based on the value of @p kernel_ and run it on the CPU using OpenMP.
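+ * @details A sketch of the assumed dispatch, based on the kernel signatures in svm_kernel.hpp (the exact member and cost handling are assumptions):
+ * @code
+ * switch (kernel_) {
+ *     case kernel_type::linear:
+ *         openmp::device_kernel_linear(q, ret, d, data, QA_cost_, 1 / cost_, add);
+ *         break;
+ *     case kernel_type::polynomial:
+ *         openmp::device_kernel_poly(q, ret, d, data, QA_cost_, 1 / cost_, add, degree_, gamma_, coef0_);
+ *         break;
+ *     case kernel_type::rbf:
+ *         openmp::device_kernel_radial(q, ret, d, data, QA_cost_, 1 / cost_, add, gamma_);
+ *         break;
+ * }
+ * @endcode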
* @param[in] q the `q` vector * @param[out] ret the result vector - * @param[in] d the right-hand side + * @param[in] d the right-hand side of the equation * @param[in] data the data * @param[in] add denotes whether the values are added or subtracted from the result vector */ void run_device_kernel(const std::vector<real_type> &q, std::vector<real_type> &ret, const std::vector<real_type> &d, const std::vector<std::vector<real_type>> &data, real_type add); - - /** - * @brief updates the `w_` vector to the current data and alpha values. - */ - virtual void update_w() override; }; extern template class csvm<float>; extern template class csvm<double>; -} // namespace plssvm::openmp \ No newline at end of file +} // namespace openmp +} // namespace plssvm \ No newline at end of file diff --git a/include/plssvm/backends/OpenMP/exceptions.hpp b/include/plssvm/backends/OpenMP/exceptions.hpp index 1387afb73..b5ccf67c5 100644 --- a/include/plssvm/backends/OpenMP/exceptions.hpp +++ b/include/plssvm/backends/OpenMP/exceptions.hpp @@ -24,12 +24,11 @@ namespace plssvm::openmp { class backend_exception : public exception { public: /** - * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`. + * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception. * @param[in] msg the exception's `what()` message * @param[in] loc the exception's call site information */ - explicit backend_exception(const std::string &msg, source_location loc = source_location::current()) : - ::plssvm::exception{ msg, "openmp::backend_exception", loc } {} + explicit backend_exception(const std::string &msg, source_location loc = source_location::current()); }; -}; // namespace plssvm::openmp \ No newline at end of file +} // namespace plssvm::openmp \ No newline at end of file diff --git a/include/plssvm/backends/OpenMP/svm_kernel.hpp b/include/plssvm/backends/OpenMP/svm_kernel.hpp index 3ce46e56b..0a09cdf62 100644 --- a/include/plssvm/backends/OpenMP/svm_kernel.hpp +++ b/include/plssvm/backends/OpenMP/svm_kernel.hpp @@ -42,8 +42,6 @@ void device_kernel_linear(const std::vector<real_type> &q, std::vector<real_type> - * - * @attention Currently not implemented! */ template <typename real_type> void device_kernel_poly(const std::vector<real_type> &q, std::vector<real_type> &ret, const std::vector<real_type> &d, const std::vector<std::vector<real_type>> &data, real_type QA_cost, real_type cost, real_type add, int degree, real_type gamma, real_type coef0); @@ -59,8 +57,6 @@ void device_kernel_poly(const std::vector<real_type> &q, std::vector * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] add denotes whether the values are added or subtracted from the result vector * @param[in] gamma the gamma parameter used in the rbf kernel function - * - * @attention Currently not implemented!
*/ template <typename real_type> void device_kernel_radial(const std::vector<real_type> &q, std::vector<real_type> &ret, const std::vector<real_type> &d, const std::vector<std::vector<real_type>> &data, real_type QA_cost, real_type cost, real_type add, real_type gamma); diff --git a/include/plssvm/backends/SYCL/csvm.hpp b/include/plssvm/backends/SYCL/csvm.hpp index 5d9f62a09..1ecda3705 100644 --- a/include/plssvm/backends/SYCL/csvm.hpp +++ b/include/plssvm/backends/SYCL/csvm.hpp @@ -12,32 +12,38 @@ #pragma once #include "plssvm/backends/SYCL/detail/device_ptr.hpp" // plssvm::sycl::detail::device_ptr -#include "plssvm/csvm.hpp" // plssvm::csvm -#include "plssvm/kernel_types.hpp" // plssvm::kernel_type -#include "plssvm/parameter.hpp" // plssvm::parameter -#include "plssvm/target_platform.hpp" // plssvm::target_platform +#include "plssvm/backends/gpu_csvm.hpp" // plssvm::detail::gpu_csvm #include "sycl/sycl.hpp" // sycl::queue -#include <vector> // std::vector +namespace plssvm { -namespace plssvm::sycl { +// forward declare parameter class +template <typename T> +class parameter; + +namespace detail { + +// forward declare execution_range class +class execution_range; + +} // namespace detail + +namespace sycl { /** - * @brief The C-SVM class using the SYCL backend. + * @brief A C-SVM implementation using SYCL as backend. * @tparam T the type of the data */ template <typename T> -class csvm : public ::plssvm::csvm<T> { +class csvm : public ::plssvm::detail::gpu_csvm<T, detail::device_ptr<T>, ::sycl::queue> { protected: // protected for the test MOCK class /// The template base type of the SYCL C-SVM class. - using base_type = ::plssvm::csvm<T>; - using base_type::alpha_ptr_; - using base_type::bias_; + using base_type = ::plssvm::detail::gpu_csvm<T, detail::device_ptr<T>, ::sycl::queue>; + using base_type::coef0_; using base_type::cost_; - using base_type::data_ptr_; using base_type::degree_; using base_type::gamma_; using base_type::kernel_; @@ -46,79 +52,63 @@ class csvm : public ::plssvm::csvm<T> { using base_type::print_info_; using base_type::QA_cost_; using base_type::target_; - using base_type::w_; + + using base_type::data_d_; + using base_type::data_last_d_; + using base_type::devices_; + using base_type::num_cols_; + using base_type::num_rows_; public: /// The type of the data. Must be either `float` or `double`. using real_type = typename base_type::real_type; - /// Unsigned integer type. - using size_type = typename base_type::size_type; + + /// The type of the SYCL device pointer. + using device_ptr_type = ::plssvm::sycl::detail::device_ptr<real_type>; + /// The type of the SYCL device queue. + using queue_type = ::sycl::queue; /** * @brief Construct a new C-SVM using the SYCL backend with the parameters given through @p params. * @param[in] params struct encapsulating all possible parameters + * @throws plssvm::csvm::csvm() exceptions + * @throws plssvm::sycl::backend_exception if the requested plssvm::target_platform isn't available + * @throws plssvm::sycl::backend_exception if no possible SYCL devices could be found */ explicit csvm(const parameter<T> &params); /** * @brief Wait for all operations in all [`sycl::queue`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:interface.queue.class) to finish. - * @details Terminates the program, if any asynchronous exceptions are thrown. + * @details Terminates the program, if any asynchronous exception is thrown. */ ~csvm() override; + protected: /** - * @brief Uses the already learned model to predict the class of multiple (new) data points.
- * @param[in] points the data points to predict - * @return a `std::vector` filled with negative values for each prediction for a data point with the negative class and positive values otherwise ([[nodiscard]]) + * @copydoc plssvm::detail::gpu_csvm::device_synchronize */ - [[nodiscard]] virtual std::vector predict(const std::vector> &points) override; - - protected: - void setup_data_on_device() override; - std::vector generate_q() override; - std::vector solver_CG(const std::vector &b, size_type imax, real_type eps, const std::vector &q) override; + void device_synchronize(queue_type &queue) final; /** - * @brief Select the correct kernel based on the value of @p kernel_ and run it on the SYCL @p device. - * @param[in] device the SYCL device to run the kernel on - * @param[in] q_d subvector of the least-squares matrix equation - * @param[in,out] r_d the result vector - * @param[in] x_d the `x` vector - * @param[in] data_d the data - * @param[in] add denotes whether the values are added or subtracted from the result vector + * @copydoc plssvm::detail::gpu_csvm::run_q_kernel */ - void run_device_kernel(size_type device, const detail::device_ptr &q_d, detail::device_ptr &r_d, const detail::device_ptr &x_d, const detail::device_ptr &data_d, real_type add); + void run_q_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &q_d, std::size_t num_features) final; /** - * @brief Combines the data in @p buffer_d from all devices into @p buffer and distributes them back to each devices. - * @param[in,out] buffer_d the data to gather - * @param[in,out] buffer the reduces data + * @copydoc plssvm::detail::gpu_csvm::run_svm_kernel */ - void device_reduction(std::vector> &buffer_d, std::vector &buffer); - + void run_svm_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, const real_type add, std::size_t num_features) final; + /** + * @copydoc plssvm::detail::gpu_csvm::run_w_kernel + */ + void run_w_kernel(std::size_t device, const ::plssvm::detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, std::size_t num_features) final; /** - * @brief updates the `w_` vector to the current data and alpha values. + * @copydoc plssvm::detail::gpu_csvm::run_predict_kernel */ - virtual void update_w() override; - - /// The available/used SYCL devices. - std::vector<::sycl::queue> devices_{}; // TODO: rename - /// The number of data points excluding the last data point. - size_type dept_{}; - /// The boundary size used to remove boundary condition checks inside the kernels. - size_type boundary_size_{}; - /// The number of rows to calculate including the boundary values. - int num_rows_{}; - /// The number of columns in the data matrix (= the number of features per data point). - int num_cols_{}; - /// The data saved across all devices. - std::vector> data_d_{}; - /// The last row of the data matrix. - std::vector> data_last_d_{}; - /// The normal vector used for speeding up the prediction in case of the linear kernel function saved on the first device. 
- detail::device_ptr<real_type> w_d_{}; + void run_predict_kernel(const ::plssvm::detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, std::size_t num_predict_points) final; }; extern template class csvm<float>; extern template class csvm<double>; -} // namespace plssvm::sycl \ No newline at end of file +} // namespace sycl +} // namespace plssvm \ No newline at end of file diff --git a/include/plssvm/backends/SYCL/detail/atomics.hpp b/include/plssvm/backends/SYCL/detail/atomics.hpp new file mode 100644 index 000000000..af6f59003 --- /dev/null +++ b/include/plssvm/backends/SYCL/detail/atomics.hpp @@ -0,0 +1,37 @@ +/** +* @file +* @author Alexander Van Craen +* @author Marcel Breyer +* @copyright 2018-today The PLSSVM project - All Rights Reserved +* @license This file is part of the PLSSVM project which is released under the MIT license. +* See the LICENSE.md file in the project root for full license information. +* +* @brief Defines an atomic_ref wrapper for the SYCL backend. +*/ + +#pragma once + +#include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL, PLSSVM_SYCL_BACKEND_COMPILER_DPCPP + +#include "sycl/sycl.hpp" // sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space + +namespace plssvm::sycl { +namespace detail { + +// TODO: remove #if after Intel has a SYCL2020 conformant sycl::atomic_ref implementation +#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP +using ::sycl::ext::oneapi::atomic_ref; +#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL +using ::sycl::atomic_ref; +#endif + +} // namespace detail + +/** + * @brief Shortcut alias for a [`sycl::atomic_ref`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:atomic-references) targeting global memory. + * @tparam T the type of the accessed values + */ +template <typename T> +using atomic_op = detail::atomic_ref<T, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>; + +} // namespace plssvm::sycl \ No newline at end of file diff --git a/include/plssvm/backends/SYCL/detail/device_ptr.hpp b/include/plssvm/backends/SYCL/detail/device_ptr.hpp index 69b571949..60f3ee604 100644 --- a/include/plssvm/backends/SYCL/detail/device_ptr.hpp +++ b/include/plssvm/backends/SYCL/detail/device_ptr.hpp @@ -6,82 +6,68 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Small wrapper around a SYCL device pointer and functions. + * @brief Small wrapper around a SYCL device pointer. */ #pragma once -#include "plssvm/target_platform.hpp" // plssvm::target_platform - #include "sycl/sycl.hpp" // sycl::queue -#include <cstddef> // std::size_t -#include <vector> // std::vector +#include <cstddef> // std::size_t +#include <type_traits> // std::is_same_v +#include <vector> // std::vector namespace plssvm::sycl::detail { -/** - * @brief Returns the list devices matching the target platform @p target. - * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order: - * 1. NVIDIA GPUs - * 2. AMD GPUs - * 3. Intel GPUs - * 4. CPUs - * @param[in] target the target platform for which the devices must match - * @return the devices (`[[nodiscard]]`) - */ -[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target); -/** - * @brief Wait for the compute device associated with @p queue to finish.
- * @param[in] queue the SYCL queue to synchronize - */ -void device_synchronize(::sycl::queue &queue); - /** * @brief Small wrapper class around a SYCL device pointer together with commonly used device functions. * @tparam T the type of the kernel pointer to wrap */ template class device_ptr { + // only float and doubles are allowed + static_assert(std::is_same_v || std::is_same_v, "The template type can only be 'float' or 'double'!"); + public: - /// The type of the values used in the SYCL device pointer. + /// The type of the values used in the device_ptr. using value_type = T; - /// The type of the wrapped SYCL device pointer. + /// The type of the wrapped device_ptr. using pointer = value_type *; - /// The const type of the wrapped SYCL device pointer. + /// The const type of the wrapped device_ptr. using const_pointer = const value_type *; /// The used size type. using size_type = std::size_t; /** - * @brief Default construct a `device_ptr` with a size of `0`. - * @details Always associated with device `0`. + * @brief Default construct a device_ptr with a size of 0. + * @details Always associated with device 0. */ device_ptr() = default; /** * @brief Allocates `size * sizeof(T)` bytes on the device associated with @p queue. - * @param[in] size the number of elements represented by the device pointer + * @param[in] size the number of elements represented by the device_ptr * @param[in] queue the associated SYCL queue */ device_ptr(size_type size, ::sycl::queue &queue); /** - * @brief Move only type, therefore deleted copy-constructor. + * @brief Delete copy-constructor to make device_ptr a move only type. */ device_ptr(const device_ptr &) = delete; /** - * @brief Move-constructor. - * @param[in,out] other the `device_ptr` to move-construct from + * @brief Move-constructor as device_ptr is a move-only type. + * @param[in,out] other the device_ptr to move-construct from */ device_ptr(device_ptr &&other) noexcept; /** - * @brief Move only type, therefore deleted copy-assignment operator. + * @brief Delete copy-assignment-operator to make device_ptr a move only type. */ device_ptr &operator=(const device_ptr &) = delete; /** - * @brief Move-assignment operator. Uses the copy-and-swap idiom. - * @param[in] other the `device_ptr` to move-assign from + * @brief Move-assignment-operator as device_ptr is a move-only type. + * @details Uses the copy-and-swap idiom. + * @param[in] other the device_ptr to move-assign from * @return `*this` */ device_ptr &operator=(device_ptr &&other) noexcept; @@ -93,13 +79,13 @@ class device_ptr { /** * @brief Swap the contents of `*this` with the contents of @p other. - * @param[in,out] other the other `device_ptr` + * @param[in,out] other the other device_ptr */ void swap(device_ptr &other) noexcept; /** * @brief Swap the contents of @p lhs and @p rhs. - * @param[in,out] lhs a `device_ptr` - * @param[in,out] rhs a `device_ptr` + * @param[in,out] lhs a device_ptr + * @param[in,out] rhs a device_ptr */ friend void swap(device_ptr &lhs, device_ptr &rhs) noexcept { lhs.swap(rhs); } @@ -124,14 +110,14 @@ class device_ptr { return data_; } /** - * @brief Get the number of elements in the wrapped SYCL device pointer. + * @brief Get the number of elements in the device_ptr. * @return the size (`[[nodiscard]]`) */ [[nodiscard]] size_type size() const noexcept { return size_; } /** - * @brief Check whether no elements are currently associated to the SYCL device pointer. + * @brief Check whether the device_ptr currently maps zero elements. 
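+ * @details Equivalent to `size() == 0`; a default constructed device_ptr wraps no memory. A minimal sketch:
+ * @code
+ * plssvm::sycl::detail::device_ptr<double> ptr{};
+ * assert(ptr.empty());  // true: no device memory has been allocated
+ * @endcode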
@@ -149,7 +135,7 @@ class device_ptr { * @brief Memset all values to @p value starting at position @p pos. * @param[in] value the memset value * @param[in] pos the position to start the memset - * @throws plssvm::sycl::backend_exception if @p pos is greater or equal than `device_ptr::size()` + * @throws plssvm::sycl::backend_exception if @p pos is greater than or equal to device_ptr::size() */ void memset(int value, size_type pos = 0); /** @@ -158,12 +144,12 @@ * @param[in] value the memset value * @param[in] pos the position to start the memset * @param[in] count the number of values to set - * @throws plssvm::sycl::backend_exception if @p pos is greater or equal than `device_ptr::size()` + * @throws plssvm::sycl::backend_exception if @p pos is greater than or equal to device_ptr::size() */ void memset(int value, size_type pos, size_type count); /** - * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device. + * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device. * @param[in] data_to_copy the data to copy onto the device * @throws plssvm::sycl::backend_exception if @p data_to_copy is too small to satisfy the memcpy */ @@ -178,7 +164,7 @@ */ void memcpy_to_device(const std::vector<value_type> &data_to_copy, size_type pos, size_type count); /** - * @brief Memcpy `device_ptr::size()` many values from @p data_to_copy to the device. + * @brief Memcpy device_ptr::size() many values from @p data_to_copy to the device. * @param[in] data_to_copy the data to copy onto the device */ void memcpy_to_device(const_pointer data_to_copy); @@ -192,7 +178,7 @@ void memcpy_to_device(const_pointer data_to_copy, size_type pos, size_type count); /** - * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer. + * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer. * @param[in] buffer the buffer to copy the data to * @throws plssvm::sycl::backend_exception if @p buffer is too small */ @@ -207,7 +193,7 @@ */ void memcpy_to_host(std::vector<value_type> &buffer, size_type pos, size_type count) const; /** - * @brief Memcpy `device_ptr::size()` many values from the device to the host buffer @p buffer. + * @brief Memcpy device_ptr::size() many values from the device to the host buffer @p buffer. * @param[in] buffer the buffer to copy the data to */ void memcpy_to_host(pointer buffer) const; @@ -221,9 +207,9 @@ void memcpy_to_host(pointer buffer, size_type pos, size_type count) const; private: - ::sycl::queue *queue_ = nullptr; - pointer data_ = nullptr; - size_type size_ = 0; + ::sycl::queue *queue_{ nullptr }; + pointer data_{ nullptr }; + size_type size_{ 0 }; }; extern template class device_ptr<float>; extern template class device_ptr<double>; diff --git a/include/plssvm/backends/SYCL/detail/utility.hpp b/include/plssvm/backends/SYCL/detail/utility.hpp new file mode 100644 index 000000000..d6afe8bb1 --- /dev/null +++ b/include/plssvm/backends/SYCL/detail/utility.hpp @@ -0,0 +1,40 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information.
 + * + * @brief Utility functions specific to the SYCL backend. + */ + +#pragma once + +#include "plssvm/target_platforms.hpp" // plssvm::target_platform + +#include "sycl/sycl.hpp" // sycl::queue + +#include <vector> // std::vector + +namespace plssvm::sycl::detail { + +/** + * @brief Returns the list of devices matching the target platform @p target. + * @details If the selected target platform is `plssvm::target_platform::automatic` the selector tries to find devices in the following order: + * 1. NVIDIA GPUs + * 2. AMD GPUs + * 3. Intel GPUs + * 4. CPUs + * + * @param[in] target the target platform for which the devices must match + * @return the devices (`[[nodiscard]]`) + */ +[[nodiscard]] std::vector<::sycl::queue> get_device_list(target_platform target); +/** + * @brief Wait for the compute device associated with @p queue to finish. + * @param[in] queue the SYCL queue to synchronize + */ +void device_synchronize(::sycl::queue &queue); + +} // namespace plssvm::sycl::detail \ No newline at end of file diff --git a/include/plssvm/backends/SYCL/exceptions.hpp b/include/plssvm/backends/SYCL/exceptions.hpp index 0e85f5263..15bcaab52 100644 --- a/include/plssvm/backends/SYCL/exceptions.hpp +++ b/include/plssvm/backends/SYCL/exceptions.hpp @@ -24,12 +24,11 @@ namespace plssvm::sycl { class backend_exception : public exception { public: /** - * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`. + * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception. * @param[in] msg the exception's `what()` message * @param[in] loc the exception's call site information */ - explicit backend_exception(const std::string &msg, source_location loc = source_location::current()) : - ::plssvm::exception{ msg, "sycl::backend_exception", loc } {} + explicit backend_exception(const std::string &msg, source_location loc = source_location::current()); }; -}; // namespace plssvm::sycl \ No newline at end of file +} // namespace plssvm::sycl \ No newline at end of file diff --git a/include/plssvm/backends/SYCL/predict.hpp b/include/plssvm/backends/SYCL/predict.hpp deleted file mode 100644 index dd7268d53..000000000 --- a/include/plssvm/backends/SYCL/predict.hpp +++ /dev/null @@ -1,163 +0,0 @@ -/** - * @file - * @author Alexander Van Craen - * @author Marcel Breyer - * @copyright 2018-today The PLSSVM project - All Rights Reserved - * @license This file is part of the PLSSVM project which is released under the MIT license. - * See the LICENSE.md file in the project root for full license information. - * - * @brief TODO: brief description - */ - -#pragma once - -#include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE - -#include "sycl/sycl.hpp" // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::pow, - // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space - -namespace plssvm::sycl { - -/// Unsigned integer type.
-using size_type = std::size_t; - -namespace detail { - -// TODO: move to separate file -// TODO: remove #if after Intel has a SYCL2020 conformant sycl::atomic_ref implementation -#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP -using ::sycl::ext::oneapi::atomic_ref; -#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL -using ::sycl::atomic_ref; -#endif - -} // namespace detail - -/** - * @brief Shortcut alias for a [`sycl::atomic_ref`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:atomic-references). - * @tparam real_type the type of the accessed values - */ - -template -using atomic_op = detail::atomic_ref; - -// TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations -/** - * @brief Shortcut alias for a SYCL local accessor. - * @tparam T the type of the accessed values - */ -template -using local_accessor = ::sycl::accessor; - -template -class kernel_w { - public: - kernel_w(real_type *w_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const size_type num_data_points, const size_type num_features) : - w_d_{ w_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, num_features_{ num_features } {} - void operator()(::sycl::nd_item<1> nd_idx) const { - const auto index = nd_idx.get_global_linear_id(); - real_type temp = 0; - if (index < num_features_) { - for (size_type dat = 0; dat < num_data_points_ - 1; ++dat) { - temp += alpha_d_[dat] * data_d_[dat + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * index]; - } - temp += alpha_d_[num_data_points_ - 1] * data_last_d_[index]; - w_d_[index] = temp; - } - } - - private: - real_type *w_d_; - const real_type *data_d_; - const real_type *data_last_d_; - const real_type *alpha_d_; - const size_type num_data_points_; - const size_type num_features_; -}; - -template -class predict_points_poly { - public: - predict_points_poly(::sycl::handler &cgh, real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const size_type num_data_points, const real_type *points, const size_type num_predict_points, const size_type num_features, const int degree, const real_type gamma, const real_type coef0) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, out_d_{ out_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, points_{ points }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} - - void operator()(::sycl::nd_item<2> nd_idx) const { - const size_type data_point_index = nd_idx.get_global_id(0); - const size_type predict_point_index = nd_idx.get_global_id(1); - - real_type temp = 0; - if (predict_point_index < num_predict_points_) { - for (size_type feature_index = 0; feature_index < num_features_; ++feature_index) { - if (data_point_index == num_data_points_) { - temp += data_last_d_[feature_index] * points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; - } else { - temp += data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] * points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; - } 
- } - - temp = alpha_d_[data_point_index] * ::sycl::pow(gamma_ * temp + coef0_, static_cast(degree_)); - - atomic_op{ out_d_[predict_point_index] } += temp; - } - } - - private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; - - real_type *out_d_; - const real_type *data_d_; - const real_type *data_last_d_; - const real_type *alpha_d_; - const size_type num_data_points_; - const real_type *points_; - const size_type num_predict_points_; - const size_type num_features_; - const int degree_; - const real_type gamma_; - const real_type coef0_; -}; - -template -class predict_points_rbf { - public: - predict_points_rbf(::sycl::handler &cgh, real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const size_type num_data_points, const real_type *points, const size_type num_predict_points, const size_type num_features, const real_type gamma) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, out_d_{ out_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, points_{ points }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, gamma_{ gamma } {} - - void operator()(::sycl::nd_item<2> nd_idx) const { - const size_type data_point_index = nd_idx.get_global_id(0); - const size_type predict_point_index = nd_idx.get_global_id(1); - - real_type temp = 0; - if (predict_point_index < num_predict_points_) { - for (size_type feature_index = 0; feature_index < num_features_; ++feature_index) { - if (data_point_index == num_data_points_) { - temp += (data_last_d_[feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_last_d_[feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); - } else { - temp += (data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); - } - } - - temp = alpha_d_[data_point_index] * ::sycl::exp(-gamma_ * temp); - - atomic_op{ out_d_[predict_point_index] } += temp; - } - } - - private: - local_accessor data_intern_i_; - local_accessor data_intern_j_; - - real_type *out_d_; - const real_type *data_d_; - const real_type *data_last_d_; - const real_type *alpha_d_; - const size_type num_data_points_; - const real_type *points_; - const size_type num_predict_points_; - const size_type num_features_; - const real_type gamma_; -}; - -} // namespace plssvm::sycl diff --git a/include/plssvm/backends/SYCL/predict_kernel.hpp b/include/plssvm/backends/SYCL/predict_kernel.hpp new file mode 100644 index 000000000..98b666676 --- /dev/null +++ b/include/plssvm/backends/SYCL/predict_kernel.hpp @@ -0,0 +1,203 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. 
+ * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the functions used for prediction for the C-SVM using the SYCL backend. + */ + +#pragma once + +#include "plssvm/backends/SYCL/detail/atomics.hpp" // plssvm::sycl::atomic_op +#include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE + +#include "sycl/sycl.hpp" // sycl::nd_item, sycl::range, sycl::pow, sycl::exp + +namespace plssvm::sycl { + +/** + * @brief Calculate the `w` vector to speed up the prediction of the labels for data points using the linear kernel function. + * @details Supports multi-GPU execution. + * @tparam T the type of the data + */ +template <typename T> +class device_kernel_w_linear { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel generating the `w` vector used to speed up the prediction when using the linear kernel function. + * @details Currently only single GPU execution is supported. + * @param[out] w_d the `w` vector to assemble + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] num_features the number of features per support vector + */ + device_kernel_w_linear(real_type *w_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const kernel_index_type num_features) : + w_d_{ w_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, num_features_{ num_features } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<1> nd_idx) const { + const kernel_index_type index = nd_idx.get_global_linear_id(); + real_type temp = 0; + if (index < num_features_) { + for (kernel_index_type dat = 0; dat < num_data_points_ - 1; ++dat) { + temp += alpha_d_[dat] * data_d_[dat + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * index]; + } + temp += alpha_d_[num_data_points_ - 1] * data_last_d_[index]; + w_d_[index] = temp; + } + } + + private: + real_type *w_d_; + const real_type *data_d_; + const real_type *data_last_d_; + const real_type *alpha_d_; + const kernel_index_type num_data_points_; + const kernel_index_type num_features_; +}; + +/** + * @brief Predicts the labels for data points using the polynomial kernel function. + * @details Currently only single GPU execution is supported. + * @tparam T the type of the data points + */ +template <typename T> +class device_kernel_predict_poly { + public: + /// The type of the data. + using real_type = T; + + /** + * @brief Construct a new device kernel to predict the labels for data points using the polynomial kernel function. + * @details Currently only single GPU execution is supported.
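+     * A minimal launch sketch (illustrative only; `queue`, `global_range`, `local_range`, and the device pointers are assumptions, not part of this header):
+     * @code
+     * queue.submit([&](::sycl::handler &cgh) {
+     *     cgh.parallel_for(::sycl::nd_range<2>{ global_range, local_range },
+     *                      device_kernel_predict_poly<double>{ out_d, data_d, data_last_d, alpha_d, num_data_points,
+     *                                                          points_d, num_predict_points, num_features, degree, gamma, coef0 });
+     * });
+     * @endcode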
+ * @param[out] out_d the calculated predictions + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] points the data points to predict + * @param[in] num_predict_points the total number of data points to predict + * @param[in] num_features the number of features per support vector and point to predict + * @param[in] degree the degree parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] coef0 the coef0 parameter used in the polynomial kernel function + */ + device_kernel_predict_poly(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const int degree, const real_type gamma, const real_type coef0) : + out_d_{ out_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, points_{ points }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + const kernel_index_type data_point_index = nd_idx.get_global_id(0); + const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + + real_type temp = 0; + if (predict_point_index < num_predict_points_) { + for (kernel_index_type feature_index = 0; feature_index < num_features_; ++feature_index) { + if (data_point_index == num_data_points_) { + temp += data_last_d_[feature_index] * points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; + } else { + temp += data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] * points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]; + } + } + + temp = alpha_d_[data_point_index] * ::sycl::pow(gamma_ * temp + coef0_, static_cast<real_type>(degree_)); + + atomic_op<real_type>{ out_d_[predict_point_index] } += temp; + } + } + + private: + real_type *out_d_; + const real_type *data_d_; + const real_type *data_last_d_; + const real_type *alpha_d_; + const kernel_index_type num_data_points_; + const real_type *points_; + const kernel_index_type num_predict_points_; + const kernel_index_type num_features_; + const int degree_; + const real_type gamma_; + const real_type coef0_; +}; + +/** + * @brief Predicts the labels for data points using the radial basis function kernel function. + * @details Currently only single GPU execution is supported. + * @tparam T the type of the data points + */ +template <typename T> +class device_kernel_predict_radial { + public: + /// The type of the data.
+ using real_type = T; + + /** + * @brief Construct a new device kernel to predict the labels for data points using the radial basis function kernel function. + * @details Currently only single GPU execution is supported. + * @param[out] out_d the calculated predictions + * @param[in] data_d the one-dimensional support vector matrix + * @param[in] data_last_d the last row of the support vector matrix + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_data_points the total number of support vectors + * @param[in] points the data points to predict + * @param[in] num_predict_points the total number of data points to predict + * @param[in] num_features the number of features per support vector and point to predict + * @param[in] gamma the gamma parameter used in the rbf kernel function + */ + device_kernel_predict_radial(real_type *out_d, const real_type *data_d, const real_type *data_last_d, const real_type *alpha_d, const kernel_index_type num_data_points, const real_type *points, const kernel_index_type num_predict_points, const kernel_index_type num_features, const real_type gamma) : + out_d_{ out_d }, data_d_{ data_d }, data_last_d_{ data_last_d }, alpha_d_{ alpha_d }, num_data_points_{ num_data_points }, points_{ points }, num_predict_points_{ num_predict_points }, num_features_{ num_features }, gamma_{ gamma } {} + + /** + * @brief Function call operator overload performing the actual calculation. + * @param[in] nd_idx the [`sycl::item`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#subsec:item.class) + * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) + */ + void operator()(::sycl::nd_item<2> nd_idx) const { + const kernel_index_type data_point_index = nd_idx.get_global_id(0); + const kernel_index_type predict_point_index = nd_idx.get_global_id(1); + + real_type temp = 0; + if (predict_point_index < num_predict_points_) { + for (kernel_index_type feature_index = 0; feature_index < num_features_; ++feature_index) { + if (data_point_index == num_data_points_) { + temp += (data_last_d_[feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_last_d_[feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); + } else { + temp += (data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]) * (data_d_[data_point_index + (num_data_points_ - 1 + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index] - points_[predict_point_index + (num_predict_points_ + THREAD_BLOCK_SIZE * INTERNAL_BLOCK_SIZE) * feature_index]); + } + } + + temp = alpha_d_[data_point_index] * ::sycl::exp(-gamma_ * temp); + + atomic_op<real_type>{ out_d_[predict_point_index] } += temp; + } + } + + private: + real_type *out_d_; + const real_type *data_d_; + const real_type *data_last_d_; + const real_type *alpha_d_; + const kernel_index_type num_data_points_; + const real_type *points_; + const kernel_index_type num_predict_points_; + const kernel_index_type num_features_; + const real_type gamma_; +}; + +} // namespace plssvm::sycl diff --git a/include/plssvm/backends/SYCL/q_kernel.hpp b/include/plssvm/backends/SYCL/q_kernel.hpp index
956fa3501..6c5a6e375 100644 --- a/include/plssvm/backends/SYCL/q_kernel.hpp +++ b/include/plssvm/backends/SYCL/q_kernel.hpp @@ -11,6 +11,8 @@ #pragma once +#include "plssvm/constants.hpp" // plssvm::kernel_index_type + #include "sycl/sycl.hpp" // sycl::nd_item, sycl::pow, sycl::exp namespace plssvm::sycl { @@ -32,11 +34,10 @@ class device_kernel_q_linear { * @param[in] data_d the one-dimensional data matrix * @param[in] data_last the last row in the data matrix * @param[in] num_rows the number of rows in the data matrix - * @param[in] first_feature the first feature used in the calculations (depending on the current device) - * @param[in] last_feature the last feature used in the calculations (depending on the current device) + * @param[in] feature_range number of features used for the calculation */ - device_kernel_q_linear(real_type *q, const real_type *data_d, const real_type *data_last, int num_rows, int first_feature, int last_feature) : - q_{ q }, data_d_{ data_d }, data_last_{ data_last }, num_rows_{ num_rows }, first_feature_{ first_feature }, last_feature_{ last_feature } {} + device_kernel_q_linear(real_type *q, const real_type *data_d, const real_type *data_last, const kernel_index_type num_rows, const kernel_index_type feature_range) : + q_{ q }, data_d_{ data_d }, data_last_{ data_last }, num_rows_{ num_rows }, feature_range_{ feature_range } {} /** * @brief Function call operator overload performing the actual calculation. @@ -44,9 +45,9 @@ class device_kernel_q_linear { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<1> item) const { - const auto index = item.get_global_linear_id(); + const kernel_index_type index = item.get_global_linear_id(); real_type temp{ 0.0 }; - for (int i = first_feature_; i < last_feature_; ++i) { + for (kernel_index_type i = 0; i < feature_range_; ++i) { temp += data_d_[i * num_rows_ + index] * data_last_[i]; } q_[index] = temp; @@ -56,9 +57,8 @@ class device_kernel_q_linear { real_type *q_; const real_type *data_d_; const real_type *data_last_; - const int num_rows_; - const int first_feature_; - const int last_feature_; + const kernel_index_type num_rows_; + const kernel_index_type feature_range_; }; /** @@ -83,7 +83,7 @@ class device_kernel_q_poly { * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_q_poly(real_type *q, const real_type *data_d, const real_type *data_last, int num_rows, int num_cols, int degree, real_type gamma, real_type coef0) : + device_kernel_q_poly(real_type *q, const real_type *data_d, const real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const int degree, const real_type gamma, const real_type coef0) : q_{ q }, data_d_{ data_d }, data_last_{ data_last }, num_rows_{ num_rows }, num_cols_{ num_cols }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** @@ -92,9 +92,9 @@ class device_kernel_q_poly { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<1> item) const { - const auto index = item.get_global_linear_id(); + const kernel_index_type index = item.get_global_linear_id(); real_type temp{ 0.0 }; - for (int i = 0; i < num_cols_; ++i) { + for 
(kernel_index_type i = 0; i < num_cols_; ++i) { temp += data_d_[i * num_rows_ + index] * data_last_[i]; } q_[index] = ::sycl::pow(gamma_ * temp + coef0_, static_cast(degree_)); @@ -104,8 +104,8 @@ class device_kernel_q_poly { real_type *q_; const real_type *data_d_; const real_type *data_last_; - const int num_rows_; - const int num_cols_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; const int degree_; const real_type gamma_; const real_type coef0_; @@ -129,9 +129,9 @@ class device_kernel_q_radial { * @param[in] data_last the last row in the data matrix * @param[in] num_rows the number of rows in the data matrix * @param[in] num_cols the number of columns in the data matrix - * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_q_radial(real_type *q, const real_type *data_d, const real_type *data_last, int num_rows, int num_cols, real_type gamma) : + device_kernel_q_radial(real_type *q, const real_type *data_d, const real_type *data_last, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type gamma) : q_{ q }, data_d_{ data_d }, data_last_{ data_last }, num_rows_{ num_rows }, num_cols_{ num_cols }, gamma_{ gamma } {} /** @@ -140,9 +140,9 @@ class device_kernel_q_radial { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<1> item) const { - const auto index = item.get_global_linear_id(); + const kernel_index_type index = item.get_global_linear_id(); real_type temp{ 0.0 }; - for (int i = 0; i < num_cols_; ++i) { + for (kernel_index_type i = 0; i < num_cols_; ++i) { temp += (data_d_[i * num_rows_ + index] - data_last_[i]) * (data_d_[i * num_rows_ + index] - data_last_[i]); } q_[index] = ::sycl::exp(-gamma_ * temp); @@ -152,8 +152,8 @@ class device_kernel_q_radial { real_type *q_; const real_type *data_d_; const real_type *data_last_; - const int num_rows_; - const int num_cols_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; const real_type gamma_; }; diff --git a/include/plssvm/backends/SYCL/svm_kernel.hpp b/include/plssvm/backends/SYCL/svm_kernel.hpp index 951a498cd..a0b1a669b 100644 --- a/include/plssvm/backends/SYCL/svm_kernel.hpp +++ b/include/plssvm/backends/SYCL/svm_kernel.hpp @@ -12,7 +12,7 @@ #pragma once #include "plssvm/backends/SYCL/detail/constants.hpp" // PLSSVM_SYCL_BACKEND_COMPILER_DPCPP, PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL -#include "plssvm/constants.hpp" // plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE +#include "plssvm/constants.hpp" // plssvm::kernel_index_type, plssvm::THREAD_BLOCK_SIZE, plssvm::INTERNAL_BLOCK_SIZE #include "sycl/sycl.hpp" // sycl::nd_item, sycl::handler, sycl::accessor, sycl::access::mode, sycl::access::target, sycl::range, sycl::group_barrier, sycl::pow, // sycl::exp, sycl::atomic_ref, sycl::memory_order, sycl::memory_scope, sycl::access::address_space @@ -21,28 +21,6 @@ namespace plssvm::sycl { -/// Unsigned integer type. -using size_type = std::size_t; // TODO: consistent in one place (not for each backend?) 
- -namespace detail { - -// TODO: remove #if after Intel has a SYCL2020 conformant sycl::atomic_ref implementation -#if PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_DPCPP -using ::sycl::ext::oneapi::atomic_ref; -#elif PLSSVM_SYCL_BACKEND_COMPILER == PLSSVM_SYCL_BACKEND_COMPILER_HIPSYCL -using ::sycl::atomic_ref; -#endif - -} // namespace detail - -/** - * @brief Shortcut alias for a [`sycl::atomic_ref`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:atomic-references). - * @tparam real_type the type of the accessed values - */ - -template <typename real_type> -using atomic_op = detail::atomic_ref<real_type, ::sycl::memory_order::relaxed, ::sycl::memory_scope::device, ::sycl::access::address_space::global_space>; - // TODO: change to ::sycl::local_accessor once implemented in the SYCL implementations /** * @brief Shortcut alias for a SYCL local accessor. @@ -66,18 +44,18 @@ class device_kernel_linear { * @brief Construct a new device kernel calculating the `q` vector using the linear C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector - * @param[in] ret the result vector + * @param[out] ret the result vector * @param[in] d the right-hand side of the equation * @param[in] data_d the one-dimensional data matrix * @param[in] QA_cost the bottom right matrix entry multiplied by cost * @param[in] cost 1 / the cost parameter in the C-SVM * @param[in] num_rows the number of columns in the data matrix + * @param[in] feature_range number of features used for the calculation on the device @p id * @param[in] add denotes whether the values are added or subtracted from the result vector - * @param[in] first_feature the first feature used in the calculations (depending on the current device) - * @param[in] last_feature the last feature used in the calculations (depending on the current device) + * @param[in] id the ID of the device */ - device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, real_type QA_cost, real_type cost, int num_rows, real_type add, int first_feature, int last_feature) : - data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, add_{ add }, first_feature_{ first_feature }, last_feature_{ last_feature } {} + device_kernel_linear(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type feature_range, const real_type add, const kernel_index_type id) : + data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, feature_range_{ feature_range }, add_{ add }, device_{ id } {} /** * @brief Function call operator overload performing the actual calculation.
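 * A minimal launch sketch (illustrative only; `queue`, `global_range`, `local_range`, and the device pointers are assumptions):
 * @code
 * queue.submit([&](::sycl::handler &cgh) {
 *     cgh.parallel_for(::sycl::nd_range<2>{ global_range, local_range },
 *                      device_kernel_linear<double>{ cgh, q_d, ret_d, d_d, data_d, QA_cost, cost,
 *                                                    num_rows, feature_range, add, device_id });
 * });
 * @endcode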
@@ -85,27 +63,26 @@ class device_kernel_linear { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<2> nd_idx) const { - size_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - size_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; real_type data_j[INTERNAL_BLOCK_SIZE]; if (i >= j) { i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - //const size_type ji = j + nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = first_feature_ * num_rows_; vec_index < last_feature_ * num_rows_; vec_index += num_rows_) { + for (kernel_index_type vec_index = 0; vec_index < feature_range_ * num_rows_; vec_index += num_rows_) { ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(1) == idx) { data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: load parallel + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(0) == idx_2) { data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; } @@ -113,27 +90,27 @@ class device_kernel_linear { ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += data_i * data_j[k]; } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { real_type temp; - if (first_feature_ == 0) { + if (device_ == 0) { temp = (matr[x][y] + QA_cost_ - q_[i + y] - q_[j + x]) * add_; } else { temp = matr[x][y] * add_; @@ -144,7 +121,7 @@ class device_kernel_linear { ret_jx += temp * d_[i + y]; } else if (i + x == j + y) { // diagonal - if (first_feature_ == 0) { + if (device_ == 0) { ret_jx += (temp + cost_ * add_) * d_[i + y]; } else { ret_jx += temp * d_[i + y]; @@ -166,10 +143,10 @@ class device_kernel_linear { const real_type *data_d_; const real_type QA_cost_; const real_type 
cost_; - const int num_rows_; + const kernel_index_type num_rows_; + const kernel_index_type feature_range_; const real_type add_; - const int first_feature_; - const int last_feature_; + const kernel_index_type device_; }; /** @@ -187,7 +164,7 @@ class device_kernel_poly { * @brief Construct a new device kernel calculating the `q` vector using the polynomial C-SVM kernel. * @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector - * @param[in] ret the result vector + * @param[out] ret the result vector * @param[in] d the right-hand side of the equation * @param[in] data_d the one-dimensional data matrix * @param[in] QA_cost the bottom right matrix entry multiplied by cost @@ -199,7 +176,7 @@ * @param[in] gamma the gamma parameter used in the polynomial kernel function * @param[in] coef0 the coef0 parameter used in the polynomial kernel function */ - device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, real_type QA_cost, real_type cost, int num_rows, int num_cols, real_type add, int degree, real_type gamma, real_type coef0) : + device_kernel_poly(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const int degree, const real_type gamma, const real_type coef0) : data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, degree_{ degree }, gamma_{ gamma }, coef0_{ coef0 } {} /** @@ -208,27 +185,26 @@ class device_kernel_poly { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<2> nd_idx) const { - size_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - size_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; real_type data_j[INTERNAL_BLOCK_SIZE]; if (i >= j) { i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - //const size_type ji = j + nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(1) == idx) { data_intern_i_[nd_idx.get_local_id(0)][block_id] =
data_d_[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: load parallel + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(0) == idx_2) { data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; } @@ -236,25 +212,25 @@ class device_kernel_poly { ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += data_i * data_j[k]; } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { const real_type temp = (::sycl::pow(gamma_ * matr[x][y] + coef0_, static_cast(degree_)) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; if (i + x > j + y) { // upper triangular matrix @@ -280,8 +256,8 @@ class device_kernel_poly { const real_type *data_d_; const real_type QA_cost_; const real_type cost_; - const int num_rows_; - const int num_cols_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; const real_type add_; const int degree_; const real_type gamma_; @@ -303,7 +279,7 @@ class device_kernel_radial { * @brief Construct a new device kernel calculating the `q` vector using the radial basis functions C-SVM kernel. 
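 * @details The radial basis function kernel evaluates k(u, v) = exp(-gamma * ||u - v||^2).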
* @param[in] cgh [`sycl::handler`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:handlerClass) used to allocate the local memory * @param[in] q the `q` vector - * @param[in] ret the result vector + * @param[out] ret the result vector * @param[in] d the right-hand side of the equation * @param[in] data_d the one-dimensional data matrix * @param[in] QA_cost the bottom right matrix entry multiplied by cost @@ -311,9 +287,9 @@ class device_kernel_radial { * @param[in] num_rows the number of columns in the data matrix * @param[in] num_cols the number of rows in the data matrix * @param[in] add denotes whether the values are added or subtracted from the result vector - * @param[in] gamma the gamma parameter used in the polynomial kernel function + * @param[in] gamma the gamma parameter used in the rbf kernel function */ - device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, real_type QA_cost, real_type cost, int num_rows, int num_cols, real_type add, real_type gamma) : + device_kernel_radial(::sycl::handler &cgh, const real_type *q, real_type *ret, const real_type *d, const real_type *data_d, const real_type QA_cost, const real_type cost, const kernel_index_type num_rows, const kernel_index_type num_cols, const real_type add, const real_type gamma) : data_intern_i_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, data_intern_j_{ ::sycl::range<2>{ THREAD_BLOCK_SIZE, INTERNAL_BLOCK_SIZE }, cgh }, q_{ q }, ret_{ ret }, d_{ d }, data_d_{ data_d }, QA_cost_{ QA_cost }, cost_{ cost }, num_rows_{ num_rows }, num_cols_{ num_cols }, add_{ add }, gamma_{ gamma } {} /** @@ -322,27 +298,26 @@ class device_kernel_radial { * identifying an instance of the functor executing at each point in a [`sycl::range`](https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#range-class) */ void operator()(::sycl::nd_item<2> nd_idx) const { - size_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; - size_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; + kernel_index_type i = nd_idx.get_group(0) * nd_idx.get_local_range(0) * INTERNAL_BLOCK_SIZE; + kernel_index_type j = nd_idx.get_group(1) * nd_idx.get_local_range(1) * INTERNAL_BLOCK_SIZE; real_type matr[INTERNAL_BLOCK_SIZE][INTERNAL_BLOCK_SIZE] = { { 0.0 } }; real_type data_j[INTERNAL_BLOCK_SIZE]; if (i >= j) { i += nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; - //const size_type ji = j + nd_idx.get_local_id(0) * INTERNAL_BLOCK_SIZE; j += nd_idx.get_local_id(1) * INTERNAL_BLOCK_SIZE; // cache data - for (int vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { + for (kernel_index_type vec_index = 0; vec_index < num_cols_ * num_rows_; vec_index += num_rows_) { ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { - const size_type idx = 0; // TODO: load parallel + for (kernel_index_type block_id = 0; block_id < INTERNAL_BLOCK_SIZE; ++block_id) { + const std::size_t idx = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(1) == idx) { data_intern_i_[nd_idx.get_local_id(0)][block_id] = data_d_[block_id + vec_index + i]; } - const size_type idx_2 = 0; // TODO: load parallel + const std::size_t idx_2 = block_id % THREAD_BLOCK_SIZE; if (nd_idx.get_local_id(0) == idx_2) { data_intern_j_[nd_idx.get_local_id(1)][block_id] = data_d_[block_id + vec_index + j]; }
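+                    // note: using `block_id % THREAD_BLOCK_SIZE` as the load index distributes the
+                    // INTERNAL_BLOCK_SIZE cache loads round-robin across the work-items of the thread
+                    // block instead of funneling them all through a single work-item (the former
+                    // "TODO: load parallel")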
@@ -350,25 +325,25 @@ ::sycl::group_barrier(nd_idx.get_group()); #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { + for (kernel_index_type data_index = 0; data_index < INTERNAL_BLOCK_SIZE; ++data_index) { data_j[data_index] = data_intern_j_[nd_idx.get_local_id(1)][data_index]; } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { + for (kernel_index_type l = 0; l < INTERNAL_BLOCK_SIZE; ++l) { const real_type data_i = data_intern_i_[nd_idx.get_local_id(0)][l]; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { + for (kernel_index_type k = 0; k < INTERNAL_BLOCK_SIZE; ++k) { matr[k][l] += (data_i - data_j[k]) * (data_i - data_j[k]); } } } #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { + for (kernel_index_type x = 0; x < INTERNAL_BLOCK_SIZE; ++x) { real_type ret_jx = 0.0; #pragma unroll INTERNAL_BLOCK_SIZE - for (size_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { + for (kernel_index_type y = 0; y < INTERNAL_BLOCK_SIZE; ++y) { const real_type temp = (::sycl::exp(-gamma_ * matr[x][y]) + QA_cost_ - q_[i + y] - q_[j + x]) * add_; if (i + x > j + y) { // upper triangular matrix @@ -394,8 +369,8 @@ const real_type *data_d_; const real_type QA_cost_; const real_type cost_; - const int num_rows_; - const int num_cols_; + const kernel_index_type num_rows_; + const kernel_index_type num_cols_; const real_type add_; const real_type gamma_; }; diff --git a/include/plssvm/backends/gpu_csvm.hpp b/include/plssvm/backends/gpu_csvm.hpp new file mode 100644 index 000000000..7c796cb9c --- /dev/null +++ b/include/plssvm/backends/gpu_csvm.hpp @@ -0,0 +1,191 @@ +/** + * @file + * @author Alexander Van Craen + * @author Marcel Breyer + * @copyright 2018-today The PLSSVM project - All Rights Reserved + * @license This file is part of the PLSSVM project which is released under the MIT license. + * See the LICENSE.md file in the project root for full license information. + * + * @brief Defines the base class for all C-SVM backends using a GPU. Used for code duplication reduction. + */ + +#pragma once + +#include "plssvm/csvm.hpp" // plssvm::csvm + +#include <cstddef> // std::size_t +#include <vector> // std::vector + +namespace plssvm { + +// forward declare parameter class +template <typename T> +class parameter; + +namespace detail { + +// forward declare execution_range class +class execution_range; + +/** + * @brief A C-SVM implementation for all GPU backends to reduce code duplication. + * @details Implements all virtual functions defined in plssvm::csvm. The GPU backends only have to implement the actual kernel launches. + * @tparam T the type of the data + * @tparam device_ptr_t the type of the device pointer (dependent on the used backend) + * @tparam queue_t the type of the device queue (dependent on the used backend) + */ +template <typename T, typename device_ptr_t, typename queue_t> +class gpu_csvm : public csvm<T> { + protected: + /// The template base type of the C-SVM class.
+ using base_type = ::plssvm::csvm<T>; + + using base_type::alpha_ptr_; + using base_type::bias_; + using base_type::coef0_; + using base_type::cost_; + using base_type::data_ptr_; + using base_type::degree_; + using base_type::epsilon_; + using base_type::gamma_; + using base_type::kernel_; + using base_type::num_data_points_; + using base_type::num_features_; + using base_type::print_info_; + using base_type::QA_cost_; + using base_type::target_; + using base_type::value_ptr_; + using base_type::w_; + + public: + /// The type of the data. Must be either `float` or `double`. + using real_type = typename base_type::real_type; + /// The type of the device pointer (dependent on the used backend). + using device_ptr_type = device_ptr_t; + /// The type of the device queue (dependent on the used backend). + using queue_type = queue_t; + + /** + * @brief Construct a new C-SVM using any GPU backend with the parameters given through @p params. + * @param[in] params struct encapsulating all possible parameters + * @throws plssvm::csvm::csvm() exceptions + */ + explicit gpu_csvm(const parameter<T> &params); + + /** + * @brief Virtual destructor to enable safe inheritance. + */ + virtual ~gpu_csvm() = default; + + //*************************************************************************************************************************************// + // functions inherited from plssvm::csvm // + //*************************************************************************************************************************************// + /** + * @copydoc plssvm::csvm::predict(const std::vector<std::vector<real_type>>&) + */ + [[nodiscard]] std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) final; + + protected: + /** + * @copydoc plssvm::csvm::setup_data_on_device + */ + void setup_data_on_device() final; + /** + * @copydoc plssvm::csvm::generate_q + */ + [[nodiscard]] std::vector<real_type> generate_q() final; + /** + * @copydoc plssvm::csvm::solver_CG + */ + std::vector<real_type> solver_CG(const std::vector<real_type> &b, std::size_t imax, real_type eps, const std::vector<real_type> &q) final; + /** + * @copydoc plssvm::csvm::update_w + */ + void update_w() final; + + /** + * @brief Run the SVM kernel on the GPU denoted by the @p device ID. + * @param[in] device the device ID denoting the GPU on which the kernel should be executed + * @param[in] q_d subvector of the least-squares matrix equation + * @param[in,out] r_d the result vector + * @param[in] x_d the right-hand side of the equation + * @param[in] add denotes whether the values are added or subtracted from the result vector + */ + void run_device_kernel(std::size_t device, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, real_type add); + /** + * @brief Combines the data in @p buffer_d from all devices into @p buffer and distributes them back to each device. + * @param[in,out] buffer_d the data to gather + * @param[in,out] buffer the reduced data + */ + void device_reduction(std::vector<device_ptr_type> &buffer_d, std::vector<real_type> &buffer); + + //*************************************************************************************************************************************// + // pure virtual, must be implemented by all subclasses // + //*************************************************************************************************************************************// + /** + * @brief Synchronize the device denoted by @p queue.
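+     * @details For a SYCL backend this can be as simple as forwarding to `queue.wait()` (a sketch of one possible subclass implementation, not a mandated one):
+     * @code
+     * void device_synchronize(queue_type &queue) final { queue.wait(); }
+     * @endcode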
+ * @param[in,out] queue the queue denoting the device to synchronize + */ + virtual void device_synchronize(queue_type &queue) = 0; + /** + * @brief Run the GPU kernel filling the `q` vector. + * @param[in] device the device ID denoting the GPU on which the kernel should be executed + * @param[in] range the execution range used to launch the kernel + * @param[out] q_d the `q` vector to fill + * @param[in] num_features number of features used for the calculation on the @p device + */ + virtual void run_q_kernel(std::size_t device, const detail::execution_range &range, device_ptr_type &q_d, std::size_t num_features) = 0; + /** + * @brief Run the main GPU kernel used in the CG algorithm. + * @param[in] device the device ID denoting the GPU on which the kernel should be executed + * @param[in] range the execution range used to launch the kernel + * @param[in] q_d the `q` vector + * @param[in,out] r_d the result vector + * @param[in] x_d the right-hand side of the equation + * @param[in] add denotes whether the values are added or subtracted from the result vector + * @param[in] num_features number of features used for the calculation on the @p device + */ + virtual void run_svm_kernel(std::size_t device, const detail::execution_range &range, const device_ptr_type &q_d, device_ptr_type &r_d, const device_ptr_type &x_d, real_type add, std::size_t num_features) = 0; + /** + * @brief Run the GPU kernel (only on the first GPU) to calculate the `w` vector used to speed up the prediction when using the linear kernel function. + * @param[in] device the device ID denoting the GPU on which the kernel should be executed + * @param[in] range the execution range used to launch the kernel + * @param[out] w_d the `w` vector to fill, used to speed up the prediction when using the linear kernel + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] num_features number of features used for the calculation on the @p device + */ + virtual void run_w_kernel(std::size_t device, const detail::execution_range &range, device_ptr_type &w_d, const device_ptr_type &alpha_d, std::size_t num_features) = 0; + /** + * @brief Run the GPU kernel (only on the first GPU) to predict the new data points @p point_d. + * @param[in] range the execution range used to launch the kernel + * @param[out] out_d the calculated prediction + * @param[in] alpha_d the previously calculated weight for each data point + * @param[in] point_d the data points to predict + * @param[in] num_predict_points the number of data points to predict + */ + virtual void run_predict_kernel(const detail::execution_range &range, device_ptr_type &out_d, const device_ptr_type &alpha_d, const device_ptr_type &point_d, std::size_t num_predict_points) = 0; + + //*************************************************************************************************************************************// + // internal variables specific to GPU backends // + //*************************************************************************************************************************************// + /// The number of data points excluding the last data point. + std::size_t dept_{}; + /// The boundary size used to remove boundary condition checks inside the kernels. + std::size_t boundary_size_{}; + /// The number of rows to calculate including the boundary values. + std::size_t num_rows_{}; + /// The number of columns in the data matrix (= the number of features per data point). + std::size_t num_cols_{}; + /// The feature range per GPU.
The GPU with the ID `i` uses the features: `[feature_ranges_[i], feature_ranges_[i + 1])`. + std::vector<std::size_t> feature_ranges_; + + /// The available/used backend devices. + std::vector<queue_type> devices_{}; + /// The data saved across all devices. + std::vector<device_ptr_type> data_d_{}; + /// The last row of the data matrix. + std::vector<device_ptr_type> data_last_d_{}; +}; + +} // namespace detail +} // namespace plssvm \ No newline at end of file diff --git a/include/plssvm/constants.hpp b/include/plssvm/constants.hpp index a9f2edc61..c1411679b 100644 --- a/include/plssvm/constants.hpp +++ b/include/plssvm/constants.hpp @@ -6,28 +6,40 @@ * @license This file is part of the PLSSVM project which is released under the MIT license. * See the LICENSE.md file in the project root for full license information. * - * @brief Global compile-time constants. + * @brief Global type definitions and compile-time constants. */ #pragma once namespace plssvm { -/// Used for internal caching. +/// Integer type used inside kernels. +using kernel_index_type = int; + +/// Global compile-time constant used for internal caching. #if defined(PLSSVM_THREAD_BLOCK_SIZE) -constexpr unsigned int THREAD_BLOCK_SIZE = PLSSVM_THREAD_BLOCK_SIZE; +constexpr kernel_index_type THREAD_BLOCK_SIZE = PLSSVM_THREAD_BLOCK_SIZE; #else -constexpr unsigned int THREAD_BLOCK_SIZE = 16; +constexpr kernel_index_type THREAD_BLOCK_SIZE = 16; #endif -/// Used for internal caching. +/// Global compile-time constant used for internal caching. #if defined(PLSSVM_INTERNAL_BLOCK_SIZE) -constexpr unsigned int INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; +constexpr kernel_index_type INTERNAL_BLOCK_SIZE = PLSSVM_INTERNAL_BLOCK_SIZE; +#else +constexpr kernel_index_type INTERNAL_BLOCK_SIZE = 6; +#endif + +/// Global compile-time constant used for internal caching in the OpenMP kernel. +#if defined(PLSSVM_OPENMP_BLOCK_SIZE) +constexpr kernel_index_type OPENMP_BLOCK_SIZE = PLSSVM_OPENMP_BLOCK_SIZE; #else -constexpr unsigned int INTERNAL_BLOCK_SIZE = 6; +constexpr kernel_index_type OPENMP_BLOCK_SIZE = 64; #endif +// perform sanity checks static_assert(THREAD_BLOCK_SIZE > 0, "THREAD_BLOCK_SIZE must be greater than 0!"); static_assert(INTERNAL_BLOCK_SIZE > 0, "INTERNAL_BLOCK_SIZE must be greater than 0!"); +static_assert(OPENMP_BLOCK_SIZE > 0, "OPENMP_BLOCK_SIZE must be greater than 0!"); } // namespace plssvm \ No newline at end of file diff --git a/include/plssvm/core.hpp b/include/plssvm/core.hpp index 3123d1fa1..28e2c5d77 100644 --- a/include/plssvm/core.hpp +++ b/include/plssvm/core.hpp @@ -11,17 +11,21 @@ #pragma once +#include "plssvm/csvm.hpp" #include "plssvm/csvm_factory.hpp" -#include "plssvm/backend_types.hpp" -#include "plssvm/csvm.hpp" -#include "plssvm/exceptions/exceptions.hpp" -#include "plssvm/kernel_types.hpp" +#include "plssvm/parameter.hpp" #include "plssvm/parameter_predict.hpp" #include "plssvm/parameter_train.hpp" + +#include "plssvm/backend_types.hpp" +#include "plssvm/kernel_types.hpp" +#include "plssvm/target_platforms.hpp" + +#include "plssvm/exceptions/exceptions.hpp" #include "plssvm/version/version.hpp" -/// The main namespace containing all API functions. +/// The main namespace containing all public API functions. namespace plssvm {} /// Namespace containing versioning information.
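+///
+/// A minimal end-to-end sketch of the public API aggregated by this header (illustrative only; the exact parameter_train constructor and the file names are assumptions):
+/// @code
+/// #include "plssvm/core.hpp"
+///
+/// plssvm::parameter_train<double> params{ "train_file.libsvm" };  // hypothetical setup
+/// auto svm = plssvm::make_csvm(params);                           // factory from csvm_factory.hpp
+/// svm->learn();
+/// svm->write_model("train_file.libsvm.model");
+/// @endcode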
diff --git a/include/plssvm/csvm.hpp b/include/plssvm/csvm.hpp
index dc9c98a2c..267743d04 100644
--- a/include/plssvm/csvm.hpp
+++ b/include/plssvm/csvm.hpp
@@ -11,9 +11,8 @@

 #pragma once

-#include "plssvm/kernel_types.hpp"     // plssvm::kernel_type
-#include "plssvm/parameter.hpp"        // plssvm::parameter
-#include "plssvm/target_platform.hpp"  // plssvm::target_platform
+#include "plssvm/kernel_types.hpp"      // plssvm::kernel_type
+#include "plssvm/target_platforms.hpp"  // plssvm::target_platform

 #include <cstddef>  // std::size_t
 #include <memory>   // std::shared_ptr
@@ -23,6 +22,10 @@

 namespace plssvm {

+// forward declare class
+template <typename T>
+class parameter;
+
 /**
  * @brief Base class for all C-SVM backends.
  * @tparam T the type of the data
@@ -35,8 +38,6 @@ class csvm {
   public:
     /// The type of the data. Must be either `float` or `double`.
     using real_type = T;
-    /// Unsigned integer type.
-    using size_type = std::size_t;

    //************************************************************************************************************************************//
    //                                                   special member functions                                                        //
    //************************************************************************************************************************************//
@@ -44,6 +45,11 @@
     /**
      * @brief Construct a new C-SVM with the parameters given through @p params.
      * @param[in] params struct encapsulating all possible parameters
+     * @throws plssvm::exception if the given data pointer is the `nullptr`
+     * @throws plssvm::exception if the data matrix is empty
+     * @throws plssvm::exception if not all points in the data matrix have the same number of features
+     * @throws plssvm::exception if no features are provided for the data points
+     * @throws plssvm::exception if weights are given, but the number of weights doesn't match the number of data points
      */
     explicit csvm(const parameter<T> &params);
@@ -53,23 +59,30 @@ class csvm {
     virtual ~csvm() = default;

     /**
-     * @brief Disable copy-constructor.
+     * @brief Delete expensive copy-constructor to make csvm a move-only type.
      */
     csvm(const csvm &) = delete;
-    // clang-format off
     /**
-     * @brief Explicitly allow move-construction.
+     * @brief Move-constructor as csvm is a move-only type.
      */
     csvm(csvm &&) noexcept = default;
-    // clang-format on
+    /**
+     * @brief Delete expensive copy-assignment-operator to make csvm a move-only type.
+     */
+    csvm &operator=(const csvm &) = delete;
+    /**
+     * @brief Move-assignment-operator as csvm is a move-only type.
+     * @return `*this`
+     */
+    csvm &operator=(csvm &&) noexcept = default;

    //************************************************************************************************************************************//
    //                                                         IO functions                                                              //
    //************************************************************************************************************************************//
     /**
-     * @brief Write the calculated model to the given file.
-     * @details Writes the model using the libsvm format:
+     * @brief Write the calculated model to the file denoted by @p filename.
+     * @details Writes the model using the LIBSVM format:
      * @code
      * svm_type c_svc
      * kernel_type linear
@@ -85,8 +98,10 @@
      * -0.23146635 0:5.765022e-01 1:1.014056e+00 2:1.300943e-01 3:7.261914e-01
      * 0.0034576654 0:1.884940e+00 1:1.005186e+00 2:2.984999e-01 3:1.646463e+00
      * @endcode
-     * @throws unsupported_kernel_type_exception if the kernel_type cannot be recognized
      * @param[in] filename name of the file to write the model information to
+     * @throws plssvm::exception if a call to learn() is missing
+     * @throws plssvm::exception if no labels are given
+     * @throws plssvm::exception if the number of labels and number of data points mismatch
      */
     void write_model(const std::string &filename);
@@ -94,13 +109,15 @@
    //                                                          learn model                                                               //
    //************************************************************************************************************************************//
     /**
-     * @brief Learns the Support Vectors given the data in the provided parameter class.
+     * @brief Learns the support vectors given the data in the provided parameter class.
      * @details Performs 2 steps:
      *          1. Load the data onto the used device (e.g. one or more GPUs)
      *          2. Learn the model by solving a minimization problem using the Conjugate Gradient (CG) algorithm
+     *
+     * @throws plssvm::exception if no labels are given for training
+     * @throws plssvm::exception if the number of labels and number of data points mismatch
      */
     void learn();
-    // TODO: absolute vs relative residual

    //************************************************************************************************************************************//
    //                                                            predict                                                                 //
    //************************************************************************************************************************************//
     /**
      * @brief Evaluates the model on the data used for training.
-     * @return The fraction of correct labeled training data in percent. ([[nodiscard]])
+     * @throws plssvm::exception if no labels are given for the accuracy calculation
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return the fraction of correctly labeled training data in percent (`[[nodiscard]]`)
      */
     [[nodiscard]] real_type accuracy();
+    /**
+     * @brief Evaluate the model on the given data @p point with @p correct_label being the correct label.
+     * @param[in] point the data point to predict
+     * @param[in] correct_label the correct label
+     * @throws plssvm::exception if the number of features in @p point doesn't match the number of features in the data matrix
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return `1.0` if @p point is predicted correctly, `0.0` otherwise. (`[[nodiscard]]`)
+     */
+    [[nodiscard]] real_type accuracy(const std::vector<real_type> &point, real_type correct_label);
+    /**
+     * @brief Evaluate the model on the given data @p points with @p correct_labels being the correct labels.
+     * @param[in] points the data points to predict
+     * @param[in] correct_labels the correct labels
+     * @throws plssvm::exception if the number of points to predict doesn't match the number of provided correct labels
+     * @throws plssvm::exception if not all @p points to predict have the same number of features
+     * @throws plssvm::exception if the number of features per point to predict and per point in data matrix mismatch
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return the fraction of correctly labeled data points.
(`[[nodiscard]]`)
+     */
+    [[nodiscard]] real_type accuracy(const std::vector<std::vector<real_type>> &points, const std::vector<real_type> &correct_labels);

     /**
-     * @brief Uses the already learned model to predict the class of a (new) data point.
+     * @brief Uses the already learned model to predict a (new) data point.
      * @param[in] point the data point to predict
-     * @return a negative `real_type` value if the prediction for data point point is the negative class and a positive `real_type` value otherwise ([[nodiscard]])
+     * @throws plssvm::exception if the number of features in @p point doesn't match the number of features in the data matrix
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return a negative #real_type value if the prediction for data point @p point is the negative class and a positive #real_type value otherwise (`[[nodiscard]]`)
      */
-    [[nodiscard]] real_type predict(const std::vector<real_type> &point);  // TODO: implement on devices for performance improvement
+    [[nodiscard]] real_type predict(const std::vector<real_type> &point);

     /**
-     * @brief Uses the already learned model to predict the class of an (new) point
+     * @brief Uses the already learned model to predict the class of a (new) data point.
      * @param[in] point the data point to predict
-     * @return -1.0 if the prediction for point is the negative class and +1 otherwise ([[nodiscard]])
+     * @throws plssvm::exception if the number of features in @p point doesn't match the number of features in the data matrix
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return -1.0 if the prediction for @p point is the negative class and +1.0 otherwise (`[[nodiscard]]`)
      */
     [[nodiscard]] real_type predict_label(const std::vector<real_type> &point);

     /**
-     * @brief Uses the already learned model to predict the class of multiple (new) points
-     * @param[in] points the points to predict
-     * @return a `std::vector` filled with -1 for each prediction for a data point the negative class and +1 otherwise ([[nodiscard]])
+     * @brief Uses the already learned model to predict the class of multiple (new) data points.
+     * @param[in] points the data points to predict
+     * @throws plssvm::exception if not all @p points to predict have the same number of features
+     * @throws plssvm::exception if the number of features per point to predict and per point in data matrix mismatch
+     * @throws plssvm::exception if no weights are provided for calculating the accuracy (possibly a call to learn() is missing)
+     * @return a [`std::vector`](https://en.cppreference.com/w/cpp/container/vector) filled with -1 for each prediction for a data point with the negative class and +1 otherwise (`[[nodiscard]]`)
      */
     [[nodiscard]] std::vector<real_type> predict_label(const std::vector<std::vector<real_type>> &points);
@@ -142,7 +188,7 @@
     virtual void setup_data_on_device() = 0;
     /**
      * @brief Generate the vector `q`, a subvector of the least-squares matrix equation.
-     * @return the generated `q` vector
+     * @return the generated `q` vector (`[[nodiscard]]`)
      */
     [[nodiscard]] virtual std::vector<real_type> generate_q() = 0;
     /**
@@ -155,15 +201,15 @@
      * @param[in] q subvector of the least-squares matrix equation
      * @return the alpha values
      */
-    virtual std::vector<real_type> solver_CG(const std::vector<real_type> &b, size_type imax, real_type eps, const std::vector<real_type> &q) = 0;
+    virtual std::vector<real_type> solver_CG(const std::vector<real_type> &b, std::size_t imax, real_type eps, const std::vector<real_type> &q) = 0;
     /**
-     * @brief updates the `w_` vector to the current data and alpha values.
+     * @brief Updates the normal vector #w_, used to speed up the prediction in case of the linear kernel function, to the current data and alpha values.
      */
     virtual void update_w() = 0;
     /**
      * @brief Uses the already learned model to predict the class of multiple (new) data points.
      * @param[in] points the data points to predict
-     * @return a `std::vector` filled with negative values for each prediction for a data point with the negative class and positive values otherwise ([[nodiscard]])
+     * @return a [`std::vector`](https://en.cppreference.com/w/cpp/container/vector) filled with negative values for each prediction for a data point with the negative class and positive values otherwise (`[[nodiscard]]`)
      */
     [[nodiscard]] virtual std::vector<real_type> predict(const std::vector<std::vector<real_type>> &points) = 0;
@@ -174,25 +220,27 @@
     /**
      * @brief Computes the value of the two vectors @p xi and @p xj using the kernel function specified during construction.
      * @param[in] xi the first vector
      * @param[in] xj the second vector
-     * @throws unsupported_kernel_type_exception if the kernel_type cannot be recognized
-     * @return the value computed by the kernel function
+     * @throws plssvm::unsupported_kernel_type_exception if the kernel type cannot be recognized
+     * @return the value computed by the kernel function (`[[nodiscard]]`)
      */
-    real_type kernel_function(const std::vector<real_type> &xi, const std::vector<real_type> &xj);
+    [[nodiscard]] real_type kernel_function(const std::vector<real_type> &xi, const std::vector<real_type> &xj);

     /**
-     * @brief Transforms the 2D data from AoS to a 1D SoA layout, ignoring the last data point and adding boundary points.
-     * @param[in] boundary the number of boundary cells
-     * @attention boundary values can contain random numbers
-     * @return an 1D vector in a SoA layout
+     * @brief Transforms @p num_points entries of the 2D data from AoS to a 1D SoA layout, adding @p boundary points.
+     * @param[in] matrix the 2D vector to be transformed into a 1D representation
+     * @param[in] boundary the number of boundary points
+     * @param[in] num_points the number of data points of the 2D vector to transform
+     * @attention Boundary values can contain random numbers!
+     * @return a 1D vector in a SoA layout (`[[nodiscard]]`)
      */
-    std::vector<real_type> transform_data(const std::vector<std::vector<real_type>> &matrix, const size_type boundary, const size_type num_points);
+    [[nodiscard]] std::vector<real_type> transform_data(const std::vector<std::vector<real_type>> &matrix, std::size_t boundary, std::size_t num_points);

    //************************************************************************************************************************************//
    //                                            parameter initialized by the constructor                                               //
    //************************************************************************************************************************************//
     /// The target platform.
     const target_platform target_;
-    /// The used kernel function: linear, polynomial or radial basis functions (rbf).
+    /// The used kernel function.
     const kernel_type kernel_;
     /// The degree parameter used in the polynomial kernel function.
     const int degree_;
@@ -211,16 +259,16 @@
     const std::shared_ptr<const std::vector<std::vector<real_type>>> data_ptr_{};
     /// The labels associated to each data point.
     std::shared_ptr<std::vector<real_type>> value_ptr_{};
-    /// The result of the CG calculation.
+    /// The result of the CG calculation: the weights of the support vectors.
     std::shared_ptr<std::vector<real_type>> alpha_ptr_{};

    //************************************************************************************************************************************//
    //                                                      internal variables                                                           //
    //************************************************************************************************************************************//
     /// The number of data points in the data set.
-    size_type num_data_points_{};
+    std::size_t num_data_points_{};
     /// The number of features per data point.
-    size_type num_features_{};
+    std::size_t num_features_{};
     /// The bias after learning.
     real_type bias_{};
     /// The bottom right matrix entry multiplied by cost.
diff --git a/include/plssvm/csvm_factory.hpp b/include/plssvm/csvm_factory.hpp
index c1958f221..e82d9d481 100644
--- a/include/plssvm/csvm_factory.hpp
+++ b/include/plssvm/csvm_factory.hpp
@@ -6,19 +6,16 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
  *
- * @brief Factory functions for constructing a new C-SVM based on the provided command line arguments.
+ * @brief Factory function for constructing a new C-SVM using one of the available backends based on the provided command line arguments.
  */

 #pragma once

 #include "plssvm/backend_types.hpp"          // plssvm::backend
 #include "plssvm/csvm.hpp"                   // plssvm::csvm
-#include "plssvm/detail/utility.hpp"         // plssvm::detail::to_underlying
 #include "plssvm/exceptions/exceptions.hpp"  // plssvm::unsupported_backend_exception
 #include "plssvm/parameter.hpp"              // plssvm::parameter

-#include "fmt/core.h"  // fmt::format
-
 #include <memory>  // std::unique_ptr, std::make_unique

 // only include requested/available backends
@@ -36,15 +33,16 @@
 #endif

 namespace plssvm {
+
 /**
  * @brief Construct a new C-SVM with the parameters given through @p params using the requested backend.
  * @tparam T the type of the data
- * @param[in] params struct encapsulating all possible parameters
- * @throws unsupported_backend_exception if the requested backend isn't available
- * @return [`std::unique_ptr`](https://en.cppreference.com/w/cpp/memory/unique_ptr) to the constructed C-SVM
+ * @param[in] params class encapsulating all possible parameters
+ * @throws plssvm::unsupported_backend_exception if the requested backend isn't available
+ * @return [`std::unique_ptr`](https://en.cppreference.com/w/cpp/memory/unique_ptr) to the constructed C-SVM (`[[nodiscard]]`)
  */
 template <typename T>
-std::unique_ptr<csvm<T>> make_csvm(const parameter<T> &params) {
+[[nodiscard]] std::unique_ptr<csvm<T>> make_csvm(const parameter<T> &params) {
     switch (params.backend) {
         case backend_type::openmp:
 #if defined(PLSSVM_HAS_OPENMP_BACKEND)
@@ -73,7 +71,7 @@ std::unique_ptr<csvm<T>> make_csvm(const parameter<T> &params) {
             throw unsupported_backend_exception{ "No SYCL backend available!" };
 #endif
     }
-    throw unsupported_backend_exception{ fmt::format("Can't recognize backend with value '{}'!", detail::to_underlying(params.backend)) };
+    throw unsupported_backend_exception{ "Can't recognize backend!" };
 }

 }  // namespace plssvm
\ No newline at end of file
diff --git a/include/plssvm/detail/arithmetic_type_name.hpp b/include/plssvm/detail/arithmetic_type_name.hpp
index 58fb95b74..cd2258985 100644
--- a/include/plssvm/detail/arithmetic_type_name.hpp
+++ b/include/plssvm/detail/arithmetic_type_name.hpp
@@ -18,17 +18,23 @@
  * @brief Defines a macro to create all possible conversion functions from arithmetic types to their name as string representation.
  * @param[in] type the data type to convert to a string
  */
-#define PLSSVM_CREATE_ARITHMETIC_TYPE_NAME(type)                                                        \
-    template <>                                                                                         \
-    [[nodiscard]] constexpr inline std::string_view arithmetic_type_name<type>() { return #type; }
+#define PLSSVM_CREATE_ARITHMETIC_TYPE_NAME(type)                                                                                        \
+    template <>                                                                                                                         \
+    [[nodiscard]] constexpr inline std::string_view arithmetic_type_name<type>() { return #type; }                                     \
+    template <>                                                                                                                         \
+    [[nodiscard]] constexpr inline std::string_view arithmetic_type_name<const type>() { return "const " #type; }                      \
+    template <>                                                                                                                         \
+    [[nodiscard]] constexpr inline std::string_view arithmetic_type_name<volatile type>() { return "volatile " #type; }                \
+    template <>                                                                                                                         \
+    [[nodiscard]] constexpr inline std::string_view arithmetic_type_name<const volatile type>() { return "const volatile " #type; }

 namespace plssvm::detail {

 /**
- * @brief Tries to convert the given type to its name as string representation.
- * @details The definition is marked as **deleted** if `T` isn't an arithmetic type (without cvref-qualification).
+ * @brief Tries to convert the given type to its name as string representation including possible const and/or volatile attributes.
+ * @details The definition is marked as **deleted** if `T` isn't an [arithmetic type](https://en.cppreference.com/w/cpp/types/is_arithmetic).
  * @tparam T the type to convert to a string
- * @return the name of `T`
+ * @return the name of `T` (`[[nodiscard]]`)
  */
 template <typename T>
 [[nodiscard]] constexpr inline std::string_view arithmetic_type_name() = delete;
diff --git a/include/plssvm/detail/assert.hpp b/include/plssvm/detail/assert.hpp
index b54069f09..f435160ee 100644
--- a/include/plssvm/detail/assert.hpp
+++ b/include/plssvm/detail/assert.hpp
@@ -23,7 +23,8 @@
 namespace plssvm::detail {

 /**
- * @brief Function called by the `PLSSVM_ASSERT` macro. Checks the assertion condition and prints and aborts the program if the condition evaluates to `false`.
+ * @brief Function called by the `PLSSVM_ASSERT` macro. Checks the assertion condition. If the condition evaluates to `false`,
+ *        prints the assertion condition together with additional information (e.g. plssvm::source_location information) and aborts the program.
  * @tparam Args the placeholder types
  * @param[in] cond the assertion condition, aborts the program if evaluated to `false`
  * @param[in] cond_str the assertion condition as string
diff --git a/include/plssvm/detail/execution_range.hpp b/include/plssvm/detail/execution_range.hpp
new file mode 100644
index 000000000..17c1dfea5
--- /dev/null
+++ b/include/plssvm/detail/execution_range.hpp
@@ -0,0 +1,66 @@
+/**
+ * @file
+ * @author Alexander Van Craen
+ * @author Marcel Breyer
+ * @copyright 2018-today The PLSSVM project - All Rights Reserved
+ * @license This file is part of the PLSSVM project which is released under the MIT license.
+ *          See the LICENSE.md file in the project root for full license information.
+ *
+ * @brief Implement a backend independent class used to specify the execution range for all kernel invocations.
+ */
+
+#pragma once
+
+#include <algorithm>         // std::copy
+#include <array>             // std::array
+#include <cstddef>           // std::size_t
+#include <initializer_list>  // std::initializer_list
+#include <iosfwd>            // forward declare std::ostream
+#include <type_traits>       // std::enable_if_t
+
+namespace plssvm::detail {
+
+/**
+ * @brief Class specifying a backend independent execution range.
+ * @details Holds two members: `grid` specifying the grid size and `block` specifying the block size using the CUDA definition.
+ *          Both grid and block must specify at least a one dimensional and at most a three dimensional value used in the kernel invocation.
+ */
+class execution_range {
+  public:
+    /**
+     * @brief Initialize the grid and block sizes using [`std::initializer_list`](https://en.cppreference.com/w/cpp/utility/initializer_list)s.
+     * @details If less than three values are specified, fills the missing values with zero. Uses the CUDA definition.
+     * @throws plssvm::exception if the number of values specified for the grid and block sizes is less than one or greater than three
+     * @param[in] grid specifies the grid sizes
+     * @param[in] block specifies the block sizes
+     */
+    execution_range(std::initializer_list<std::size_t> grid, std::initializer_list<std::size_t> block);
+
+    /**
+     * @brief Initialize the grid and block sizes using [`std::array`](https://en.cppreference.com/w/cpp/container/array)s.
+     *        Only available if the number of values specified for the grid and block sizes is greater than zero and less than or equal to three.
+     * @details If less than three values are specified, fills the missing values with zero. Uses the CUDA definition.
+     * @param[in] p_grid specifies the grid sizes
+     * @param[in] p_block specifies the block sizes
+     */
+    template <std::size_t I, std::size_t J, std::enable_if_t<(0 < I && I <= 3 && 0 < J && J <= 3), bool> = true>
+    execution_range(const std::array<std::size_t, I> &p_grid, const std::array<std::size_t, J> &p_block) {
+        std::copy(p_grid.begin(), p_grid.end(), grid.begin());
+        std::copy(p_block.begin(), p_block.end(), block.begin());
+    }
+
+    /// The grid sizes.
+    std::array<std::size_t, 3> grid = { 1, 1, 1 };
+    /// The block sizes.
+    std::array<std::size_t, 3> block = { 1, 1, 1 };
+};
+
+/**
+ * @brief Output the execution @p range to the given output-stream @p out.
+ * @param[in,out] out the output-stream to write the execution range to
+ * @param[in] range the execution range
+ * @return the output-stream
+ */
+std::ostream &operator<<(std::ostream &out, const execution_range &range);
+
+}  // namespace plssvm::detail
\ No newline at end of file
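A quick sketch of how such an execution range might be constructed; the concrete grid/block values are made up for illustration, only the constructors and the `operator<<` overload are taken from the declarations above:

#include "plssvm/detail/execution_range.hpp"

#include <iostream>

int main() {
    // 2D grid and 2D block sizes; the missing third dimension is filled in by the class itself
    const plssvm::detail::execution_range range{ { 128, 64 }, { 16, 16 } };
    std::cout << range << '\n';  // uses the operator<< declared in execution_range.hpp
    return 0;
}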
diff --git a/include/plssvm/detail/file_reader.hpp b/include/plssvm/detail/file_reader.hpp
index 1408d9e81..43d85e9cd 100644
--- a/include/plssvm/detail/file_reader.hpp
+++ b/include/plssvm/detail/file_reader.hpp
@@ -11,33 +11,19 @@

 #pragma once

-#include "plssvm/detail/assert.hpp"          // PLSSVM_ASSERT
-#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::starts_with, plssvm::detail::trim_left
-#include "plssvm/exceptions/exceptions.hpp"  // plssvm::file_not_found_exception
-
-#include "fmt/core.h"  // fmt::format
-
 // check if memory mapping can be supported
 #if __has_include(<fcntl.h>) && __has_include(<sys/mman.h>) && __has_include(<sys/stat.h>) && __has_include(<unistd.h>)
-    #include <fcntl.h>     // open, O_RDONLY
-    #include <sys/mman.h>  // mmap, munmap
-    #include <sys/stat.h>  // fstat
-    #include <unistd.h>    // close
-
     #define PLSSVM_HAS_MEMORY_MAPPING
 #endif

-#include <cstddef>   // std::size_t
-#include <fstream>   // std::ifstream
-#include <ios>       // std::ios, std::streamsize
-#include <iostream>  // std::cerr, std::endl
+#include <ios>          // std::streamsize
 #include <string_view>  // std::string_view
 #include <vector>       // std::vector

 namespace plssvm::detail {

 /**
- * @brief The `plssvm::detail::file_reader` class is responsible for reading a file and splitting it into its lines.
+ * @brief The plssvm::detail::file_reader class is responsible for reading a file and splitting it into its lines.
  * @details If the necessary headers are present, the class tries to memory map the given file. If this fails or if the headers are not present,
  *          the file is read as one blob using [`std::ifstream::read`](https://en.cppreference.com/w/cpp/io/basic_ifstream).
  */
@@ -48,146 +34,59 @@ class file_reader {
      *        the comment token @p comment.
      * @param[in] filename the file to read and split into lines
      * @param[in] comment the character used to denote comments
+     * @throws plssvm::file_not_found_exception if the @p filename couldn't be found
+     * @throws plssvm::invalid_file_format_exception if the file couldn't be read using [`std::ifstream::read`](https://en.cppreference.com/w/cpp/io/basic_istream/read)
      */
-    file_reader(const std::string_view filename, const char comment) {
-#if defined(PLSSVM_HAS_MEMORY_MAPPING)
-        // headers for memory mapped IO are present -> try it
-        this->open_memory_mapped_file(filename);
-#else
-        // memory mapped IO headers are missing -> use std::ifstream instead
-        this->open_file(filename);
-#endif
-        // split read data into lines
-        this->parse_lines(comment);
-    }
+    file_reader(const std::string &filename, char comment);
     /**
-     * @brief Unmap the file and close the file descriptor used by the memory mapped IO operations or delete the allocated buffer.
+     * @brief Unmap the file and close the file descriptor used by the memory mapped IO operations and delete the allocated buffer.
      */
-    ~file_reader() {
-#if defined(PLSSVM_HAS_MEMORY_MAPPING)
-        if (must_unmap_file_) {
-            // unmap file
-            munmap(file_content_, num_bytes_);
-            // close file descriptor
-            close(file_descriptor_);
-        }
-        file_content_ = nullptr;
-#endif
-        // delete allocated buffer (deleting nullptr is a no-op)
-        delete[] file_content_;
-    }
+    ~file_reader();

     /**
      * @brief Return the number of parsed lines.
      * @details All empty lines or lines starting with a comment are ignored.
      * @return the number of lines after preprocessing (`[[nodiscard]]`)
      */
-    [[nodiscard]] std::size_t num_lines() const noexcept {
-        return lines_.size();
-    }
+    [[nodiscard]] typename std::vector<std::string_view>::size_type num_lines() const noexcept;
     /**
      * @brief Return the @p pos line of the parsed file.
      * @param[in] pos the line to return
-     * @return the line without leading whitespaces
+     * @return the line without leading whitespaces (`[[nodiscard]]`)
+     */
+    [[nodiscard]] std::string_view line(typename std::vector<std::string_view>::size_type pos) const;
+    /**
+     * @brief Return all lines present after the preprocessing.
+     * @return all lines after preprocessing (`[[nodiscard]]`)
      */
-    [[nodiscard]] std::string_view line(const std::size_t pos) const {
-        PLSSVM_ASSERT(pos < this->num_lines(), "Out-of-bounce access!: {} >= {}", pos, this->num_lines());
-        return lines_[pos];
-    }
+    [[nodiscard]] const std::vector<std::string_view> &lines() const noexcept;

   private:
+#if defined(PLSSVM_HAS_MEMORY_MAPPING)
     /*
     * Try to read the file using memory mapped IO.
     */
-#if defined(PLSSVM_HAS_MEMORY_MAPPING)
-    void open_memory_mapped_file(const std::string_view filename) {
-        // open the file
-        file_descriptor_ = open(filename.data(), O_RDONLY);
-        struct stat attr {};
-        // check if file could be opened
-        if (fstat(file_descriptor_, &attr) == -1) {
-            close(file_descriptor_);
-            throw file_not_found_exception{ fmt::format("Couldn't find file: '{}'!", filename) };
-        }
-        if (attr.st_size == 0) {
-            // can't memory map empty file
-            close(file_descriptor_);
-            this->open_file(filename);
-        } else {
-            // memory map file
-            file_content_ = static_cast<char *>(mmap(nullptr, attr.st_size, PROT_READ, MAP_SHARED, file_descriptor_, 0));
-            // check if memory mapping was successful
-            if (static_cast<void *>(file_content_) == MAP_FAILED) {
-                // memory mapping wasn't successful -> try reading file with std::ifstream
-                close(file_descriptor_);
-                std::cerr << "Memory mapping failed, falling back to std::ifstream." << std::endl;
-                this->open_file(filename);
-            } else {
-                // set size
-                num_bytes_ = attr.st_size;
-                must_unmap_file_ = true;
-            }
-        }
-    }
+    void open_memory_mapped_file(std::string_view filename);
 #endif

     /*
     * Read the file using a normal std::ifstream.
     */
-    void open_file(const std::string_view filename) {
-        // open the file
-        std::ifstream f{ filename.data() };
-        if (f.fail()) {
-            throw file_not_found_exception{ fmt::format("Couldn't find file: '{}'!", filename) };
-        }
-        // get the size of the file
-        f.seekg(0, std::ios::end);
-        num_bytes_ = f.tellg();
-        f.seekg(0);
-        // allocate the necessary buffer
-        file_content_ = new char[num_bytes_];
-        // read the whole file in one go
-        f.read(file_content_, static_cast<std::streamsize>(num_bytes_));
-    }
+    void open_file(std::string_view filename);

     /*
     * Split the file into its lines, ignoring empty lines and lines starting with a comment.
     */
-    void parse_lines(const char comment) {
-        // create view from buffer
-        std::string_view file_content_view{ file_content_, num_bytes_ };
-        std::size_t pos = 0;
-        while (true) {
-            // find newline
-            std::size_t next_pos = file_content_view.find_first_of('\n', pos);
-            if (next_pos == std::string_view::npos) {
-                break;
-            }
-            // remove trailing whitespaces
-            std::string_view sv = trim_left(std::string_view{ file_content_view.data() + pos, next_pos - pos });
-            // add line iff the line is not empty and doesn't with a comment
-            if (!sv.empty() && !starts_with(sv, comment)) {
-                lines_.push_back(sv);
-            }
-            pos = next_pos + 1;
-        }
-        // add last line
-        std::string_view sv = trim_left(std::string_view{ file_content_view.data() + pos, file_content_view.size() - pos });
-        if (!sv.empty() && !starts_with(sv, comment)) {
-            lines_.push_back(sv);
-        }
-    }
+    void parse_lines(char comment);

 #if defined(PLSSVM_HAS_MEMORY_MAPPING)
-    int file_descriptor_ = 0;
-    bool must_unmap_file_ = false;
+    int file_descriptor_{ 0 };
+    bool must_unmap_file_{ false };
 #endif
-    char *file_content_ = nullptr;
-    std::size_t num_bytes_ = 0;
+    char *file_content_{ nullptr };
+    std::streamsize num_bytes_{ 0 };
     std::vector<std::string_view> lines_{};
 };

 }  // namespace plssvm::detail
-
-#undef PLSSVM_HAS_MEMORY_MAPPING
\ No newline at end of file
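A minimal sketch of the file_reader interface declared above; the file name "train.libsvm" is a made-up placeholder and '#' is assumed to be a sensible comment token for the illustrated format:

#include "plssvm/detail/file_reader.hpp"

#include <cstddef>
#include <iostream>
#include <string>

int main() {
    // read the file, skipping empty lines and lines starting with '#';
    // throws plssvm::file_not_found_exception if the file doesn't exist
    const plssvm::detail::file_reader reader{ std::string{ "train.libsvm" }, '#' };
    for (std::size_t i = 0; i < reader.num_lines(); ++i) {
        std::cout << reader.line(i) << '\n';  // lines are returned without leading whitespace
    }
    return 0;
}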
diff --git a/include/plssvm/detail/operators.hpp b/include/plssvm/detail/operators.hpp
index 7179a5446..1645b4590 100644
--- a/include/plssvm/detail/operators.hpp
+++ b/include/plssvm/detail/operators.hpp
@@ -13,8 +13,9 @@

 #include "plssvm/detail/assert.hpp"  // PLSSVM_ASSERT

-#include <cmath>   // std::fma, std::copysign
-#include <vector>  // std::vector
+#include <cmath>        // std::fma, std::copysign
+#include <type_traits>  // std::is_arithmetic_v
+#include <vector>       // std::vector

 /**
  * @def PLSSVM_GENERATE_ARITHMETIC_OPERATION
@@ -35,7 +36,7 @@
  * vec1 + scalar;  // operator+(vector, scalar)
  * scalar + vec1;  // operator+(scalar, vector)
  * @endcode
- * Also checks that both vectors have the same size.
+ * Also checks that both vectors have the same size using the PLSSVM_ASSERT macro.
  * @param[in] Op the operator to generate
  */
 // clang-format off
@@ -107,7 +108,7 @@ transposed(const std::vector<T> &) -> transposed<T>;
  * @tparam T the value type
  * @param[in] lhs the first vector
  * @param[in] rhs the second vector
- * @return the dot product
+ * @return the dot product (`[[nodiscard]]`)
  */
 template <typename T>
 [[nodiscard]] inline T operator*(const transposed<T> &lhs, const std::vector<T> &rhs) {
@@ -133,13 +134,12 @@
  * @details Uses OpenMP SIMD reduction to speedup the calculation.
  * @tparam T the value type
  * @param[in] vec the elements to accumulate
- * @return the sum of all elements
+ * @return the sum of all elements (`[[nodiscard]]`)
  */
 template <typename T>
 [[nodiscard]] inline T sum(const std::vector<T> &vec) {
     T val{};
-    #pragma omp simd reduction(+ \
-                               : val)
+    #pragma omp simd reduction(+:val)
     for (typename std::vector<T>::size_type i = 0; i < vec.size(); ++i) {
         val += vec[i];
     }
@@ -152,29 +152,29 @@
  * @tparam T the value type
  * @param[in] lhs the first vector
  * @param[in] rhs the second vector
- * @return the squared euclidean distance
+ * @return the squared euclidean distance (`[[nodiscard]]`)
  */
 template <typename T>
 [[nodiscard]] inline T squared_euclidean_dist(const std::vector<T> &lhs, const std::vector<T> &rhs) {
     PLSSVM_ASSERT(lhs.size() == rhs.size(), "Sizes mismatch!: {} != {}", lhs.size(), rhs.size());

     T val{};
-    // #pragma omp simd reduction(+:val) //TODO: debug gcc ASSERT BUG
     for (typename std::vector<T>::size_type i = 0; i < lhs.size(); ++i) {
-        T tmp = lhs[i] - rhs[i];
-        val = std::fma(tmp, tmp, val);
+        const T diff = lhs[i] - rhs[i];
+        val = std::fma(diff, diff, val);
     }
     return val;
 }

 /**
  * @brief Returns +1 if x is positive and -1 if x is negative or 0.
- * @param x the number parameter to evaluate
- * @return +1 if x is positive and -1 if x is negative or 0 ([[nodiscard]])
+ * @param[in] x the number parameter to evaluate
+ * @return +1 if x is positive and -1 if x is negative or 0 (`[[nodiscard]]`)
  */
 template <typename T>
-[[nodiscard]] inline constexpr int sign(const T x) {
-    return x == 0 ? -1 : static_cast<int>(std::copysign(1, x));
+[[nodiscard]] inline constexpr T sign(const T x) {
+    static_assert(std::is_arithmetic_v<T>, "The type T must be an arithmetic type!");
+    return x == T{ 0 } ? T{ -1 } : static_cast<T>(std::copysign(T{ 1 }, x));
 }

 #undef PLSSVM_GENERATE_ARITHMETIC_OPERATION
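A short, self-contained sketch exercising the vector operators declared above; the expected values in the comments follow directly from the definitions (dot product, sum, squared euclidean distance, sign):

#include "plssvm/detail/operators.hpp"

#include <iostream>
#include <vector>

int main() {
    using namespace plssvm::operators;  // the namespace used internally by kernel_function()

    const std::vector<double> x{ 1.0, 2.0, 3.0 };
    const std::vector<double> y{ 4.0, 5.0, 6.0 };

    std::cout << transposed{ x } * y << '\n';           // dot product: 1*4 + 2*5 + 3*6 = 32
    std::cout << sum(x) << '\n';                        // 1 + 2 + 3 = 6
    std::cout << squared_euclidean_dist(x, y) << '\n';  // 9 + 9 + 9 = 27
    std::cout << sign(-0.5) << '\n';                    // -1
    return 0;
}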
diff --git a/include/plssvm/detail/string_utility.hpp b/include/plssvm/detail/string_utility.hpp
index 1d2f9c37d..0c4f78dc2 100644
--- a/include/plssvm/detail/string_utility.hpp
+++ b/include/plssvm/detail/string_utility.hpp
@@ -6,14 +6,12 @@
  * @license This file is part of the PLSSVM project which is released under the MIT license.
  *          See the LICENSE.md file in the project root for full license information.
  *
- * @brief Implements utility functions for string manipulation and conversion.
+ * @brief Implements utility functions for string manipulation and querying.
  */

 #pragma once

-#include <algorithm>    // std::min, std::transform
-#include <cctype>       // std::tolower
-#include <string>       // std::char_traits, std::string
+#include <string>       // std::string
 #include <string_view>  // std::string_view

 namespace plssvm::detail {
@@ -24,64 +22,61 @@
 * @param[in] sv the string to match against the start of @p str
 * @return `true` if @p str starts with the string @p sv, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool starts_with(const std::string_view str, const std::string_view sv) noexcept {
-    return str.substr(0, sv.size()) == sv;
-}
+[[nodiscard]] bool starts_with(std::string_view str, std::string_view sv) noexcept;
/**
 * @brief Checks if the string @p str starts with the character @p c.
 * @param[in] str the string to check
 * @param[in] c the character to match against the first character of @p str
 * @return `true` if @p str starts with the character @p c, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool starts_with(const std::string_view str, const char c) noexcept {
-    return !str.empty() && std::char_traits<char>::eq(str.front(), c);
-}
+[[nodiscard]] bool starts_with(std::string_view str, char c) noexcept;
/**
 * @brief Checks if the string @p str ends with the suffix @p sv.
 * @param[in] str the string to check
 * @param[in] sv the string to match against the end of @p str
 * @return `true` if @p str ends with the string @p sv, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool ends_with(const std::string_view str, const std::string_view sv) noexcept {
-    return str.size() >= sv.size() && str.compare(str.size() - sv.size(), std::string_view::npos, sv) == 0;
-}
+[[nodiscard]] bool ends_with(std::string_view str, std::string_view sv) noexcept;
/**
 * @brief Checks if the string @p str ends with the character @p c.
 * @param[in] str the string to check
 * @param[in] c the character to match against the last character of @p str
 * @return `true` if @p str ends with the character @p c, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool ends_with(const std::string_view str, const char c) noexcept {
-    return !str.empty() && std::char_traits<char>::eq(str.back(), c);
-}
+[[nodiscard]] bool ends_with(std::string_view str, char c) noexcept;
/**
 * @brief Checks if the string @p str contains the string @p sv.
 * @param[in] str the string to check
 * @param[in] sv the string to find
 * @return `true` if @p str contains the string @p sv, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool contains(const std::string_view str, const std::string_view sv) noexcept {
-    return str.find(sv) != std::string_view::npos;
-}
+[[nodiscard]] bool contains(std::string_view str, std::string_view sv) noexcept;
/**
 * @brief Checks if the string @p str contains the character @p c.
 * @param[in] str the string to check
 * @param[in] c the character to find
 * @return `true` if @p str contains the character @p c, otherwise `false` (`[[nodiscard]]`)
 */
-[[nodiscard]] inline bool contains(const std::string_view str, const char c) noexcept {
-    return str.find(c) != std::string_view::npos;
-}
+[[nodiscard]] bool contains(std::string_view str, char c) noexcept;

/**
- * @brief Returns a new [`std::string_view`](https://en.cppreference.com/w/cpp/string/basic_string_view) equal to @p str where all leding whitespaces are removed.
+ * @brief Returns a new [`std::string_view`](https://en.cppreference.com/w/cpp/string/basic_string_view) equal to @p str where all leading whitespaces are removed.
 * @param[in] str the string to remove the leading whitespaces
 * @return the string @p str without leading whitespace (`[[nodiscard]]`)
 */
-[[nodiscard]] inline std::string_view trim_left(const std::string_view str) noexcept {
-    std::string_view::size_type pos = std::min(str.find_first_not_of(' '), str.size());
-    return str.substr(pos);
-}
+[[nodiscard]] std::string_view trim_left(std::string_view str) noexcept;
+/**
+ * @brief Returns a new [`std::string_view`](https://en.cppreference.com/w/cpp/string/basic_string_view) equal to @p str where all trailing whitespaces are removed.
+ * @param[in] str the string to remove the trailing whitespaces
+ * @return the string @p str without trailing whitespace (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::string_view trim_right(std::string_view str) noexcept;
+/**
+ * @brief Returns a new [`std::string_view`](https://en.cppreference.com/w/cpp/string/basic_string_view) equal to @p str where all leading and trailing whitespaces are removed.
+ * @param[in] str the string to remove the leading and trailing whitespaces
+ * @return the string @p str without leading and trailing whitespace (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::string_view trim(std::string_view str) noexcept;

/**
 * @brief Replaces all occurrences of @p what with @p with in the string @p str.
@@ -89,32 +84,36 @@
 * @param[in] what the string to replace
 * @param[in] with the string to replace with
 */
-inline void replace_all(std::string &str, const std::string_view what, const std::string_view with) {
-    for (std::string::size_type pos = 0; std::string::npos != (pos = str.find(what.data(), pos, what.length())); pos += with.length()) {
-        str.replace(pos, what.length(), with.data(), with.length());
-    }
-}
+void replace_all(std::string &str, std::string_view what, std::string_view with);

/**
 * @brief Convert the string @p str to its all lower case representation.
 * @param[in,out] str the string to transform
 * @return the transformed string
 */
-inline std::string &to_lower_case(std::string &str) {
-    std::transform(str.begin(), str.end(), str.begin(), [](const char c) { return std::tolower(c); });
-    return str;
-}
+std::string &to_lower_case(std::string &str);

/**
 * @brief Return a new string with the same content as @p str but all lower case.
- * @details In contrast to `std::string& to_lower_case(std::string&)` this function does not change the input string @p str.
+ * @details In contrast to to_lower_case(std::string&) this function does not change the input string @p str.
+ * @param[in] str the string to use in the transformation
+ * @return the transformed string (`[[nodiscard]]`)
+ */
+[[nodiscard]] std::string as_lower_case(std::string_view str);
+
+/**
+ * @brief Convert the string @p str to its all upper case representation.
+ * @param[in,out] str the string to transform
+ * @return the transformed string
+ */
+std::string &to_upper_case(std::string &str);
+
+/**
+ * @brief Return a new string with the same content as @p str but all upper case.
+ * @details In contrast to to_upper_case(std::string&) this function does not change the input string @p str.
 * @param[in] str the string to use in the transformation
 * @return the transformed string (`[[nodiscard]]`)
 */
-[[nodiscard]] inline std::string as_lower_case(const std::string_view str) {
-    std::string lowercase_str{ str };
-    std::transform(str.begin(), str.end(), lowercase_str.begin(), [](const char c) { return std::tolower(c); });
-    return lowercase_str;
-}
+[[nodiscard]] std::string as_upper_case(std::string_view str);

}  // namespace plssvm::detail
\ No newline at end of file
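A brief sketch of the string utilities declared above; all inputs are arbitrary literals chosen purely for illustration:

#include "plssvm/detail/string_utility.hpp"

#include <iostream>
#include <string>

int main() {
    std::cout << std::boolalpha;
    std::cout << plssvm::detail::starts_with("kernel_type", "kernel") << '\n';  // true
    std::cout << plssvm::detail::trim("  rbf  ") << '\n';                       // "rbf"
    std::cout << plssvm::detail::as_upper_case("linear") << '\n';               // "LINEAR"

    std::string s{ "C-SVM" };
    plssvm::detail::replace_all(s, "-", "_");
    std::cout << s << '\n';                                                     // "C_SVM"
    return 0;
}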
diff --git a/include/plssvm/detail/utility.hpp b/include/plssvm/detail/utility.hpp
index 28ec9ea01..afc14c922 100644
--- a/include/plssvm/detail/utility.hpp
+++ b/include/plssvm/detail/utility.hpp
@@ -24,11 +24,11 @@ template <typename T>
 constexpr bool always_false_v = false;

 /**
- * @brief Get the @p I-th element of the parameter pack @p ts.
+ * @brief Get the @p I-th element of the parameter pack @p args.
 * @tparam I the index of the element to get
 * @tparam Types the types inside the parameter pack
 * @param[in] args the values of the parameter pack
- * @return the @p I-th element of @p args
+ * @return the @p I-th element of @p args (`[[nodiscard]]`)
 */
 template <std::size_t I, typename... Types>
 [[nodiscard]] constexpr decltype(auto) get(Types &&...args) noexcept {
@@ -40,7 +40,7 @@
 * @brief Converts an enumeration to its underlying type.
 * @tparam Enum the enumeration type
 * @param[in] e enumeration value to convert
- * @return the integer value of the underlying type of `Enum`, converted from @p e
+ * @return the integer value of the underlying type of `Enum`, converted from @p e (`[[nodiscard]]`)
 */
 template <typename Enum>
 [[nodiscard]] constexpr std::underlying_type_t<Enum> to_underlying(const Enum e) noexcept {
diff --git a/include/plssvm/exceptions/exceptions.hpp b/include/plssvm/exceptions/exceptions.hpp
index 034e48222..686a5e55a 100644
--- a/include/plssvm/exceptions/exceptions.hpp
+++ b/include/plssvm/exceptions/exceptions.hpp
@@ -13,8 +13,6 @@

 #include "plssvm/exceptions/source_location.hpp"  // plssvm::source_location

-#include "fmt/core.h"  // fmt::format
-
 #include <stdexcept>    // std::runtime_error
 #include <string>       // std::string
 #include <string_view>  // std::string_view
@@ -23,7 +21,7 @@
 namespace plssvm {

 /**
 * @brief Base class for all custom exception types. Forwards its message to [`std::runtime_error`](https://en.cppreference.com/w/cpp/error/runtime_error)
- *        and saves the call site source location information.
+ *        and saves the exception name and the call site source location information.
 */
 class exception : public std::runtime_error {
   public:
@@ -33,33 +31,20 @@
     * @param[in] class_name the name of the thrown exception class
     * @param[in] loc the exception's call site information
     */
-    explicit exception(const std::string &msg, const std::string_view class_name = "exception", source_location loc = source_location::current()) :
-        std::runtime_error{ msg }, class_name_{ class_name }, loc_{ loc } {}
+    explicit exception(const std::string &msg, std::string_view class_name = "exception", source_location loc = source_location::current());

     /**
     * @brief Returns the information of the call site where the exception was thrown.
     * @return the exception's call site information (`[[nodiscard]]`)
     */
-    [[nodiscard]] const source_location &loc() const noexcept { return loc_; }

     /**
     * @brief Returns a string containing the exception's `what()` message, the name of the thrown exception class and information about the call
     *        site where the exception has been thrown.
     * @return the exception's `what()` message including source location information
     */
-    [[nodiscard]] std::string what_with_loc() const {
-        return fmt::format(
-            "{}\n"
-            "{} thrown:\n"
-            "  in file      {}\n"
-            "  in function  {}\n"
-            "  @ line       {}",
-            this->what(),
-            class_name_,
-            loc_.file_name(),
-            loc_.function_name(),
-            loc_.line());
-    }
+    [[nodiscard]] const source_location &loc() const noexcept;
+    [[nodiscard]] std::string what_with_loc() const;

   private:
     const std::string_view class_name_;
@@ -67,32 +52,30 @@
 };

 /**
- * @brief Exception type thrown if the provided data set file couldn't be found.
+ * @brief Exception type thrown if the provided file couldn't be found.
 */
 class file_not_found_exception : public exception {
   public:
     /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
     * @param[in] msg the exception's `what()` message
     * @param[in] loc the exception's call site information
     */
-    explicit file_not_found_exception(const std::string &msg, source_location loc = source_location::current()) :
-        exception{ msg, "file_not_found_exception", loc } {}
+    explicit file_not_found_exception(const std::string &msg, source_location loc = source_location::current());
 };

 /**
- * @brief Exception type thrown if the provided data set file has an invalid format for the selected parser
- *        (e.g. if the arff parser tries to parse a libsvm file).
+ * @brief Exception type thrown if the provided file has an invalid format for the selected parser
+ *        (e.g. if the arff parser tries to parse a LIBSVM file).
 */
 class invalid_file_format_exception : public exception {
   public:
     /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
     * @param[in] msg the exception's `what()` message
     * @param[in] loc the exception's call site information
     */
-    explicit invalid_file_format_exception(const std::string &msg, source_location loc = source_location::current()) :
-        exception{ msg, "invalid_file_format_exception", loc } {}
+    explicit invalid_file_format_exception(const std::string &msg, source_location loc = source_location::current());
 };

 /**
@@ -101,12 +84,11 @@
 class unsupported_backend_exception : public exception {
   public:
     /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
     * @param[in] msg the exception's `what()` message
     * @param[in] loc the exception's call site information
     */
-    explicit unsupported_backend_exception(const std::string &msg, source_location loc = source_location::current()) :
-        exception{ msg, "unsupported_backend_exception", loc } {}
+    explicit unsupported_backend_exception(const std::string &msg, source_location loc = source_location::current());
 };

 /**
@@ -115,26 +97,11 @@
 class unsupported_kernel_type_exception : public exception {
   public:
     /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
-     * @param[in] msg the exception's `what()` message
-     * @param[in] loc the exception's call side information
-     */
-    explicit unsupported_kernel_type_exception(const std::string &msg, source_location loc = source_location::current()) :
-        exception{ msg, "unsupported_kernel_type_exception", loc } {}
-};
-
-/**
- * @brief Exception type thrown if the requested operation is currently not implemented.
- */
-class not_implemented_exception : public exception {
-  public:
-    /**
-     * @brief Construct a new exception forwarding the exception message and source location to `plssvm::exception`.
+     * @brief Construct a new exception forwarding the exception message and source location to plssvm::exception.
     * @param[in] msg the exception's `what()` message
     * @param[in] loc the exception's call site information
     */
-    explicit not_implemented_exception(const std::string &msg, source_location loc = source_location::current()) :
-        exception{ msg, "not_implemented_exception", loc } {}
+    explicit unsupported_kernel_type_exception(const std::string &msg, source_location loc = source_location::current());
 };

 }  // namespace plssvm
\ No newline at end of file
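A minimal sketch of the intended error-handling pattern: all exception types above derive from plssvm::exception, so a single catch clause combined with what_with_loc() reports the message, the exception class name, and the call site. The parameter setup is the same assumption as in the earlier sketch:

#include "plssvm/core.hpp"

#include <iostream>

int main() {
    try {
        plssvm::parameter<double> params;  // assumption: manually constructible
        const auto svm = plssvm::make_csvm(params);  // may throw plssvm::unsupported_backend_exception
        svm->learn();
    } catch (const plssvm::exception &e) {
        // what_with_loc() additionally reports the exception class name and the source location
        std::cerr << e.what_with_loc() << std::endl;
        return 1;
    }
    return 0;
}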
diff --git a/include/plssvm/exceptions/source_location.hpp b/include/plssvm/exceptions/source_location.hpp
index 5d27fe03e..13c686e98 100644
--- a/include/plssvm/exceptions/source_location.hpp
+++ b/include/plssvm/exceptions/source_location.hpp
@@ -16,7 +16,7 @@
 namespace plssvm {

 /**
- * @brief The `plssvm::source_location` class represents certain information about the source code, such as file names, line numbers or function names.
+ * @brief The plssvm::source_location class represents certain information about the source code, such as file names, line numbers or function names.
 * @details Based on [`std::source_location`](https://en.cppreference.com/w/cpp/utility/source_location).
 */
 class source_location {
@@ -32,45 +32,37 @@
     [[nodiscard]] static source_location current(
        const char *file_name = __builtin_FILE(),
        const char *function_name = __builtin_FUNCTION(),
-        const int line = __builtin_LINE(),
-        const int column = 0) noexcept {
-        source_location loc;
-
-        loc.file_name_ = file_name;
-        loc.function_name_ = function_name;
-        loc.line_ = line;
-        loc.column_ = column;
-
-        return loc;
-    }
+        int line = __builtin_LINE(),
+        int column = 0) noexcept;

     /**
     * @brief Returns the function name without additional signature information (i.e. return type and parameters)
     *        or `"unknown"` if no information could be retrieved.
     * @return the function name (`[[nodiscard]]`)
     */
-    [[nodiscard]] std::string_view function_name() const noexcept { return function_name_; }
+    [[nodiscard]] std::string_view function_name() const noexcept;

     /**
     * @brief Returns the absolute path name of the file or `"unknown"` if no information could be retrieved.
     * @return the file name (`[[nodiscard]]`)
     */
-    [[nodiscard]] std::string_view file_name() const noexcept { return file_name_; }
+    [[nodiscard]] std::string_view file_name() const noexcept;

     /**
     * @brief Returns the line number or `0` if no information could be retrieved.
     * @return the line number (`[[nodiscard]]`)
     */
-    [[nodiscard]] int line() const noexcept { return line_; }
+    [[nodiscard]] int line() const noexcept;

     /**
-     * @brief Returns the column number. Always `0`!
+     * @brief Returns the column number.
+     * @attention Always `0`!
     * @return `0` (`[[nodiscard]]`)
     */
-    [[nodiscard]] int column() const noexcept { return column_; }
+    [[nodiscard]] int column() const noexcept;

   private:
-    std::string_view function_name_ = "unknown";
-    std::string_view file_name_ = "unknown";
-    int line_ = 0;
-    int column_ = 0;
+    std::string_view function_name_{ "unknown" };
+    std::string_view file_name_{ "unknown" };
+    int line_{ 0 };
+    int column_{ 0 };
 };

 }  // namespace plssvm
diff --git a/include/plssvm/kernel_types.hpp b/include/plssvm/kernel_types.hpp
index bdc691272..ce31e64af 100644
--- a/include/plssvm/kernel_types.hpp
+++ b/include/plssvm/kernel_types.hpp
@@ -11,88 +11,57 @@

 #pragma once

-#include "plssvm/detail/assert.hpp"          // PLSSVM_ASSERT
-#include "plssvm/detail/operators.hpp"       // dot product, plssvm::squared_euclidean_dist
-#include "plssvm/detail/string_utility.hpp"  // plssvm::detail::to_lower_case
-#include "plssvm/detail/utility.hpp"         // plssvm::detail::always_false_v
+#include "plssvm/detail/assert.hpp"     // PLSSVM_ASSERT
+#include "plssvm/detail/operators.hpp"  // dot product, plssvm::squared_euclidean_dist
+#include "plssvm/detail/utility.hpp"    // plssvm::detail::always_false_v

-#include "fmt/ostream.h"  // use operator<< to enable fmt::format with custom type
-
-#include <cmath>    // std::pow, std::exp, std::fma
-#include <ios>      // std::ios::failbit
-#include <istream>  // std::istream
-#include <ostream>  // std::ostream
-#include <string>   // std::string
-#include <vector>   // std::vector
+#include <cmath>   // std::pow, std::exp, std::fma
+#include <iosfwd>  // forward declare std::ostream and std::istream
+#include <vector>  // std::vector

 namespace plssvm {

 /**
- * @brief Enum class for the different kernel types.
+ * @brief Enum class for all implemented kernel functions.
 */
 enum class kernel_type {
-    /** \f$\vec{u}^T \cdot \vec{v}\f$ */
+    /** Linear kernel function: \f$\vec{u}^T \cdot \vec{v}\f$. */
    linear = 0,
-    /** \f$(gamma \cdot \vec{u}^T \cdot \vec{v} + coef0)^{degree}\f$ */
+    /** Polynomial kernel function: \f$(gamma \cdot \vec{u}^T \cdot \vec{v} + coef0)^{degree}\f$. */
    polynomial = 1,
-    /** \f$e^{(-gamma \cdot |\vec{u} - \vec{v}|^2)}\f$ */
+    /** Radial basis function: \f$e^{(-gamma \cdot |\vec{u} - \vec{v}|^2)}\f$. */
    rbf = 2
 };

 /**
- * @brief Stream-insertion operator overload for convenient printing of the kernel type @p kernel.
+ * @brief Output the @p kernel type to the given output-stream @p out.
 * @param[in,out] out the output-stream to write the kernel type to
 * @param[in] kernel the kernel type
 * @return the output-stream
 */
-inline std::ostream &operator<<(std::ostream &out, const kernel_type kernel) {
-    switch (kernel) {
-        case kernel_type::linear:
-            return out << "linear";
-        case kernel_type::polynomial:
-            return out << "polynomial";
-        case kernel_type::rbf:
-            return out << "rbf";
-    }
-    return out << "unknown";
-}
+std::ostream &operator<<(std::ostream &out, kernel_type kernel);

 /**
- * @brief Stream-extraction operator overload for convenient converting a string to a kernel type.
+ * @brief Use the input-stream @p in to initialize the @p kernel type.
 * @details The extracted value is matched case-insensitively and can be the integer value of the kernel_type.
 * @param[in,out] in input-stream to extract the kernel type from
 * @param[in] kernel the kernel type
 * @return the input-stream
 */
-inline std::istream &operator>>(std::istream &in, kernel_type &kernel) {
-    std::string str;
-    in >> str;
-    detail::to_lower_case(str);
-
-    if (str == "linear" || str == "0") {
-        kernel = kernel_type::linear;
-    } else if (str == "polynomial" || str == "1") {
-        kernel = kernel_type::polynomial;
-    } else if (str == "rbf" || str == "2") {
-        kernel = kernel_type::rbf;
-    } else {
-        in.setstate(std::ios::failbit);
-    }
-    return in;
-}
+std::istream &operator>>(std::istream &in, kernel_type &kernel);

 /**
- * @brief Computes the value of the two vectors @p xi and @p xj using the kernel function determined at compile-time.
+ * @brief Computes the value of the two vectors @p xi and @p xj using the @p kernel function determined at compile-time.
 * @tparam kernel the type of the kernel
 * @tparam real_type the type of the values
- * @tparam Args additional parameters used in the respective kernel functions
+ * @tparam Args additional parameters used in the respective kernel function
 * @param[in] xi the first vector
 * @param[in] xj the second vector
 * @param[in] args additional parameters
- * @return the value computed by the kernel function
+ * @return the value computed by the @p kernel function (`[[nodiscard]]`)
 */
 template <kernel_type kernel, typename real_type, typename... Args>
-real_type kernel_function(const std::vector<real_type> &xi, const std::vector<real_type> &xj, Args &&...args) {
+[[nodiscard]] inline real_type kernel_function(const std::vector<real_type> &xi, const std::vector<real_type> &xj, Args &&...args) {
    using namespace plssvm::operators;

    PLSSVM_ASSERT(xi.size() == xj.size(), "Sizes mismatch!: {} != {}", xi.size(), xj.size());
@@ -102,10 +71,10 @@ real_type kernel_function(const std::vector<real_type> &xi, const std::vector<real_type> &xj, Args &&...args) {
-        const auto degree = static_cast<real_type>(detail::get<0>(args...));
+        const auto degree = static_cast<int>(detail::get<0>(args...));
        const auto gamma = static_cast<real_type>(detail::get<1>(args...));
        const auto coef0 = static_cast<real_type>(detail::get<2>(args...));
-        return std::pow(std::fma(gamma, (transposed{ xi } * xj), coef0), static_cast<real_type>(degree));
+        return std::pow(std::fma(gamma, (transposed{ xi } * xj), coef0), degree);
    } else if constexpr (kernel == kernel_type::rbf) {
        static_assert(sizeof...(args) == 1, "Illegal number of additional parameters! Must be 1.");
        const auto gamma = static_cast<real_type>(detail::get<0>(args...));
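A small sketch of the compile-time kernel_function template shown above; the input vectors and the gamma value are arbitrary illustration data (the rbf kernel takes exactly one additional parameter, the linear kernel none):

#include "plssvm/kernel_types.hpp"

#include <iostream>
#include <vector>

int main() {
    const std::vector<double> u{ 1.0, 2.0, 3.0 };
    const std::vector<double> v{ 3.0, 2.0, 1.0 };

    // linear kernel: u^T * v = 3 + 4 + 3 = 10
    std::cout << plssvm::kernel_function<plssvm::kernel_type::linear>(u, v) << '\n';
    // rbf kernel: exp(-gamma * |u - v|^2) with gamma = 0.5 and |u - v|^2 = 8
    std::cout << plssvm::kernel_function<plssvm::kernel_type::rbf>(u, v, 0.5) << '\n';
    return 0;
}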
diff --git a/include/plssvm/parameter.hpp b/include/plssvm/parameter.hpp
index 2ef9a4a6f..ccc102124 100644
--- a/include/plssvm/parameter.hpp
+++ b/include/plssvm/parameter.hpp
@@ -6,20 +6,16 @@
 * @license This file is part of the PLSSVM project which is released under the MIT license.
 *          See the LICENSE.md file in the project root for full license information.
 *
- * @brief Implements the base class encapsulating all necessary parameters.
+ * @brief Implements the parameter base class encapsulating all necessary parameters.
 */

 #pragma once

-#include "plssvm/backend_types.hpp"          // plssvm::backend_type
-#include "plssvm/exceptions/exceptions.hpp"  // plssvm::invalid_file_format_exception
-#include "plssvm/kernel_types.hpp"           // plssvm::kernel_type
-#include "plssvm/target_platform.hpp"        // plssvm::target_platform
+#include "plssvm/backend_types.hpp"     // plssvm::backend_type
+#include "plssvm/kernel_types.hpp"      // plssvm::kernel_type
+#include "plssvm/target_platforms.hpp"  // plssvm::target_platform

-#include "fmt/ostream.h"  // use operator<< to enable fmt::format with custom type
-
-#include <cstddef>      // std::size_t
-#include <iosfwd>       // std::ostream (forward declaration only)
+#include <iosfwd>       // forward declare std::ostream
 #include <memory>       // std::shared_ptr
 #include <string>       // std::string
 #include <type_traits>  // std::is_same_v
@@ -39,21 +35,21 @@ class parameter {
  public:
    /// The type of the data. Must be either `float` or `double`.
    using real_type = T;
-    /// Unsigned integer type.
-    using size_type = std::size_t;

-    /// virtual, default destructor.
+    /**
+     * @brief Virtual destructor to enable safe inheritance.
+     */
    virtual ~parameter() = default;

    /**
-     * @brief Parse a file in the [libsvm sparse file format](https://www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html#f303).
-     * @details The sparse libsvm file format saves each data point with its respective class as follows:
+     * @brief Parse a file in the [LIBSVM sparse file format](https://www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html#f303).
+     * @details The sparse LIBSVM file format saves each data point with its respective class as follows:
     * @code
     *