Skip to content

Commit

Permalink
Merge branch 'release-2.3.0-rc2'
Browse files Browse the repository at this point in the history
  • Loading branch information
alazzaro committed May 31, 2022
2 parents 910c210 + 8d75698 commit 2e4f799
Show file tree
Hide file tree
Showing 32 changed files with 588 additions and 553 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ jobs:
-DMPI_EXECUTABLE_SUFFIX=.mpich \
..
- name: Configure git to trust the workspace despite the different owner
run:
git config --global --add safe.directory "$GITHUB_WORKSPACE"

- name: Build Release Asset
run: cmake --build build -- dist

Expand Down
4 changes: 2 additions & 2 deletions VERSION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
MAJOR = 2
MINOR = 3
PATCH = 0-rc1
PATCH = 0-rc2
# A specific DATE (YYYY-MM-DD) marks an official release; if DATE is left
# empty, the version is considered a development version.
DATE = 2022-05-03
DATE =


6 changes: 3 additions & 3 deletions cmake/CompilerConfiguration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@ Please open an issue at https://github.com/cp2k/dbcsr/issues with the reported c
endif ()

if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -funroll-loops -Wall -Werror")
set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage -Wall -Werror")
set(CMAKE_CXX_FLAGS_DEBUG "-O2 -ggdb -Wall -Werror -fsanitize=undefined -fsanitize=address -fsanitize-recover=all")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -funroll-loops -Wall -Wextra -Werror")
set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage -Wall -Wextra -Werror")
set(CMAKE_CXX_FLAGS_DEBUG "-O2 -ggdb -Wall -Wextra -Werror -fsanitize=undefined -fsanitize=address -fsanitize-recover=all")
if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")))
set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_COVERAGE} -fsanitize=leak")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=leak")
Expand Down
7 changes: 6 additions & 1 deletion docs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@ add_custom_target(
COMMENT "Generating API documentation and doc pages"
COMMAND "${FORD_EXE}" "${FORD_PROJECT_FILE}"
VERBATIM)
add_dependencies(doc doc_copy_tests)

if (BUILD_TESTING)
add_dependencies(doc doc_copy_tests)
endif ()

if (WITH_C_API AND WITH_EXAMPLES)
add_dependencies(doc doc_copy_examples)
endif ()

add_dependencies(doc fypp) # only depend on the fypp step to avoid building
# everything just for the docs
2 changes: 2 additions & 0 deletions src/acc/acc.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#define DBCSR_CONCATENATE2(A, B) A##B
#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B)

/** marks a variable or parameter as used (evaluates it as a void expression), e.g., to avoid unused-parameter warnings */
#define DBCSR_MARK_USED(x) (void)(x)

#if defined(__cplusplus)
extern "C" {
Expand Down
61 changes: 37 additions & 24 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,26 @@

#if defined(__LIBXSMM)
# include <libxsmm.h>
# include <libxsmm_sync.h>
# define USE_LIBXSMM
# if !defined(LIBXSMM_VERSION_NUMBER)
# define LIBXSMM_VERSION_NUMBER \
LIBXSMM_VERSION4(LIBXSMM_VERSION_MAJOR, LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE, LIBXSMM_VERSION_PATCH)
# endif
# define USE_LIBXSMM
# if defined(_OPENMP)
# define ACC_BENCH_USEOMP(FUNC) LIBXSMM_USEOMP(FUNC)
# else
# define ACC_BENCH_USEOMP(FUNC) (FUNC)
# endif
# define PRINTF(...) \
do { \
const size_t print_buffer_size = sizeof(print_buffer) - print_offset; \
const int print_buffer_result = LIBXSMM_SNPRINTF(print_buffer + print_offset, print_buffer_size, __VA_ARGS__); \
assert(0 <= print_buffer_result && print_buffer_result < (int)print_buffer_size); \
print_offset += print_buffer_result; \
} while (0)
#else
# define PRINTF(...) printf(__VA_ARGS__)
#endif

#if !defined(ELEM_TYPE)
Expand Down Expand Up @@ -81,17 +91,15 @@ static void parse_params(int argc, char* argv[], FILE** file, const char** snr,
if (NULL == *file) {
for (i = 0; i < argc; ++i) args[i] = argv[i + 1];
}
else {
else { /* input file is specified */
argc = 0;
if (NULL != fgets(buffer, sizeof(buffer), *file)) {
char* arg = strtok(buffer, DELIMS);
while (NULL == arg && NULL != fgets(buffer, sizeof(buffer), *file)) {
arg = strtok(buffer, DELIMS);
}
for (; NULL != arg; arg = strtok(NULL, DELIMS)) {
if (argc * sizeof(*args) < sizeof(args)) {
args[argc++] = arg;
}
if (argc * sizeof(*args) < sizeof(args)) args[argc++] = arg;
else { /* malformed command-line */
fclose(*file);
*file = NULL;
Expand Down Expand Up @@ -237,6 +245,8 @@ int main(int argc, char* argv[]) {
void* stream = NULL;
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
int print_offset = 0;
char print_buffer[1024];
# if defined(__OPENCL)
const char* const env_smm_repeat = getenv("SMM_NREPEAT");
const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
Expand Down Expand Up @@ -293,8 +303,8 @@ int main(int argc, char* argv[]) {
assert(m <= (mn / n) && 0 == (mn % n));
assert(m <= (mk / k) && 0 == (mk % k));
assert(k <= (kn / n) && 0 == (kn % n));
printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k, nc, na, nb);
printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
PRINTF("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k, nc, na, nb);
PRINTF("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n || MAX_KERNEL_DIM < k) {
fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
result = EXIT_FAILURE;
Expand Down Expand Up @@ -344,7 +354,7 @@ int main(int argc, char* argv[]) {
if (NULL != amat_hst && NULL != bmat_hst && NULL != stack_hst) {
const size_t size = sizeof(ELEM_TYPE) * (mk * na + kn * nb) + sizeof(int) * 3 * stack_size;
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
printf("copy-in (%i MB): %.2g ms %.1f GB/s\n", (int)((size + (1 << 19)) >> 20), 1000.0 * duration,
PRINTF("copy-in (%i MB): %.2g ms %.1f GB/s\n", (int)((size + (1 << 19)) >> 20), 1000.0 * duration,
size / (duration * (1ULL << 30)));
}
#endif
Expand All @@ -370,8 +380,8 @@ int main(int argc, char* argv[]) {
#endif
/* warmup execution and prebuild SMM-kernel */
for (r = 0; r < warmup; ++r) {
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3 /*nparams*/, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev,
m, n, k, MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k,
MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
&result);
}
CHECK(c_dbcsr_acc_memset_zero(cmat_dev, 0 /*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
Expand All @@ -381,19 +391,19 @@ int main(int argc, char* argv[]) {
#endif
for (r = 0; r < nrepeat; ++r) {
/* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3 /*nparams*/, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev,
m, n, k, MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k,
MAX_KERNEL_DIM, 1 /*homogeneous*/, stream, stream),
&result);
}
#if defined(USE_LIBXSMM)
CHECK(c_dbcsr_acc_stream_sync(stream), &result);
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
if (EXIT_SUCCESS == result) {
# if defined(TRANSPOSE)
printf("transpose: %.2g ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / (nrepeat * smm_nrepeat),
PRINTF("transpose: %.2g ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / (nrepeat * smm_nrepeat),
1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * smm_nrepeat) / (duration + transpose));
# endif
printf("device: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * smm_nrepeat),
PRINTF("device: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * smm_nrepeat),
1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * smm_nrepeat) / duration);
}
# if defined(VALIDATE)
Expand Down Expand Up @@ -425,7 +435,7 @@ int main(int argc, char* argv[]) {
stack_hst + 2, stack_size);
}
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
printf("host: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * smm_nrepeat),
PRINTF("host: %.2g ms %.1f GFLOPS/s\n", 1000.0 * duration / (nrepeat * smm_nrepeat),
1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat * smm_nrepeat) / duration);
/* validate correctness in case of successful result code/status */
if (EXIT_SUCCESS == result) {
Expand All @@ -439,18 +449,18 @@ int main(int argc, char* argv[]) {
result = libxsmm_matdiff(&diff, LIBXSMM_DATATYPE(ELEM_TYPE), mn, nc, gold_hst, cmat_hst, &mn, &mn);
if (EXIT_SUCCESS == result) {
const double relerror = 1.0 - diff.rsq;
printf("rel.error: %g", relerror);
PRINTF("rel.error: %g", relerror);
if (maxerror < relerror && NULL != file) maxerror = relerror;
if (0 < relerror) {
if (LIBXSMM_NOTNAN(diff.v_tst)) {
printf(" (%g != %g)\n", diff.v_ref, diff.v_tst);
PRINTF(" (%g != %g)\n", diff.v_ref, diff.v_tst);
}
else {
printf(" (%g)\n", diff.v_tst);
PRINTF(" (%g)\n", diff.v_tst);
}
}
else {
printf("\n");
PRINTF("\n");
}
if (0 < check && check < relerror) result = EXIT_FAILURE;
}
Expand All @@ -474,13 +484,16 @@ int main(int argc, char* argv[]) {
CHECK(c_dbcsr_acc_dev_mem_deallocate(cmat_dev), NULL);
CHECK(c_dbcsr_acc_stream_destroy(stream), NULL);
if (EXIT_SUCCESS == result) {
++nok;
parse_params(argc, argv, &file, &snr, &sss, &ssm, &ssn, &ssk, &snc, &sna, &snb);
if (NULL != file) {
printf("\n");
}
else break;
if (NULL != file) PRINTF("\n");
++nok;
}
#if defined(USE_LIBXSMM)
LIBXSMM_STDIO_ACQUIRE();
fputs(print_buffer, stdout);
LIBXSMM_STDIO_RELEASE();
#endif
if (EXIT_SUCCESS == result && NULL == file) break;
}
free(rnd); /* release array of random numbers */
#if !defined(__CUDA)
Expand Down
37 changes: 37 additions & 0 deletions src/acc/acc_bench_smm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env bash
####################################################################################################
# Copyright (C) by the DBCSR developers group - All rights reserved                                #
# This file is part of the DBCSR library.                                                          #
#                                                                                                  #
# For information on the license, see the LICENSE file.                                            #
# For further information please visit https://dbcsr.cp2k.org                                      #
# SPDX-License-Identifier: GPL-2.0+                                                                #
####################################################################################################

# Wrapper around the acc_bench_smm executable located next to this script.
#
# Usage: acc_bench_smm.sh <file-with-one-argument-line-per-run>
#        acc_bench_smm.sh <arguments passed through to acc_bench_smm>
#
# If the first argument is an existing file, LIBXSMM's tool_pexec.sh is executable, and more
# than one device is reported, then one command line per input line is generated (devices are
# assigned round-robin via ACC_OPENCL_DEVICE) and piped into tool_pexec.sh.
# NOTE(review): tool_pexec.sh presumably executes these command lines concurrently - confirm.
# In all other cases, acc_bench_smm is invoked directly with the given arguments.

HERE=$(cd "$(dirname "$0")" && pwd -P)
SED=$(command -v gsed)

# GNU sed is desired (macOS)
if [ ! "${SED}" ]; then
  SED=$(command -v sed)
fi

if [ "${SED}" ] && [ -x "${HERE}/acc_bench_smm" ]; then
  LIBXSMM_PEXEC=${LIBXSMM_ROOT:-${HOME}/libxsmm}/scripts/tool_pexec.sh
  if [ -x "${LIBXSMM_PEXEC}" ] && [ -e "$1" ]; then
    # Only stderr is piped into sed (stdout is discarded); the device count is taken
    # from the "INFO ACC/OpenCL: ndevices=<N> ..." message of a minimal run.
    NDEVICES=$(ACC_OPENCL_VERBOSE=1 CHECK=0 "${HERE}/acc_bench_smm" 1 1 1 2>&1 >/dev/null \
      | "${SED}" -n "s/INFO ACC\/OpenCL: ndevices=\([0-9][0-9]*\) ..*$/\1/p")
  fi
  if [ "${NDEVICES}" ] && [ "0" != "$((1<NDEVICES))" ]; then
    # More than one device: emit one command line per line of the input file.
    NLINES=0
    while read -r LINE; do
      echo "ACC_OPENCL_DEVICE=$((NLINES%NDEVICES)) ${HERE}/acc_bench_smm ${LINE}"
      NLINES=$((NLINES+1))
    done <"$1" | "${LIBXSMM_PEXEC}" "${NDEVICES}"
  else
    # "$@" keeps every argument a separate word ("$*" would join them into a single argument).
    "${HERE}/acc_bench_smm" "$@"
  fi
else
  >&2 echo "ERROR: missing prerequisites!"
  exit 1
fi
6 changes: 3 additions & 3 deletions src/acc/acc_libsmm.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ c_dbcsr_acc_bool_t libsmm_acc_is_thread_safe(void);
int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, void* dev_data, libsmm_acc_data_t datatype, int m,
int n, int max_kernel_dim, void* stream);

int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, int nparams,
libsmm_acc_data_t datatype, const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max,
int max_kernel_dim, c_dbcsr_acc_bool_t def_mnk, void* stack_stream, void* c_stream);
int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size, libsmm_acc_data_t datatype,
const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, int max_kernel_dim,
c_dbcsr_acc_bool_t def_mnk, void* stack_stream, void* c_stream);

static const char libsmm_acc_transpose_routine_name_str[] = "jit_kernel_transpose";
static const char* const libsmm_acc_transpose_routine_name_ptr = libsmm_acc_transpose_routine_name_str;
Expand Down
2 changes: 1 addition & 1 deletion src/acc/acc_triplets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ SED=$(command -v gsed)
CUT=$(command -v cut)

# GNU sed is desired (macOS)
if [ "" = "${SED}" ]; then
if [ ! "${SED}" ]; then
SED=$(command -v sed)
fi

Expand Down
4 changes: 4 additions & 0 deletions src/acc/cuda/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,13 @@ $(DIRSMM)/parameters.h: $(MAKDIR)/Makefile $(DIRSMM)/generate_parameters.py $(PA
$(DIRSMM)/smm_acc_kernels.h: $(GPUSMM) $(MAKDIR)/Makefile $(DIRSMM)/generate_kernels.py $(PARAMS)
@cd $(DIRSMM) && $(PYTHON) ../libsmm_acc/generate_kernels.py ../libsmm_acc/kernels

.PHONY: backend
backend: $(ACCDIR)/dbcsr_acc.a
$(ACCDIR)/dbcsr_acc.a: $(OBJACC) $(DIRSMM)/libsmm_acc_init.o
$(AR) -rs $@ $^

.PHONY: libsmm
libsmm: $(ACCDIR)/dbcsr_acc_smm.a
$(ACCDIR)/dbcsr_acc_smm.a: $(OBJSMM)
$(AR) -rs $@ $^

Expand Down
4 changes: 4 additions & 0 deletions src/acc/cuda_hip/acc_mem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ extern "C" int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) {
extern "C" int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t n, void* stream) {
unsigned int flag = ACC(HostAllocDefault);

DBCSR_MARK_USED(stream);

ACC_API_CALL(HostAlloc, ((void**)host_mem, (size_t)n, flag));
if (host_mem == NULL) return -2;
if (verbose_print) printf("Allocating %zd bytes of host pinned memory at %p\n", n, *host_mem);
Expand All @@ -58,6 +60,8 @@ extern "C" int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t n, void* st

/****************************************************************************/
extern "C" int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) {
DBCSR_MARK_USED(stream);

if (verbose_print) printf("Host pinned deallocation address %p\n", host_mem);
ACC_API_CALL(FreeHost, ((void*)host_mem));

Expand Down
2 changes: 2 additions & 0 deletions src/acc/cuda_hip/acc_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ extern "C" int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int

#if defined(__CUDA_PROFILING)
nvtxNameCudaStreamA(*acc_stream, name);
#else
DBCSR_MARK_USED(name);
#endif

return 0;
Expand Down
8 changes: 4 additions & 4 deletions src/acc/libsmm_acc/libsmm_acc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ kernel_map_iterator add_kernel_handle_to_jitted_kernels(

//===========================================================================
int libsmm_acc_process_blas(const int* param_stack, int stack_size, ACC_DRV(stream) stream, int m, int n, int k, int max_kernel_dim,
const double* a_data, const double* b_data, double* c_data, std::vector<ACC_BLAS(Handle_t) *> handles = acc_blashandles) {
const double* a_data, const double* b_data, double* c_data) {
#if defined _OPENMP
int ithread = omp_get_thread_num();
#else
Expand Down Expand Up @@ -287,9 +287,9 @@ int libsmm_acc_process_d(const int* param_stack, int stack_size, ACC_DRV(stream)
}

//===========================================================================
int libsmm_acc_process(const int* param_stack_host, const int* param_stack_dev, int stack_size, int nparams,
libsmm_acc_data_t datatype, const void* a_data, const void* b_data, void* c_data, int m, int n, int k, int max_kernel_dim,
int def_mnk, void* stack_stream, void* c_stream) {
int libsmm_acc_process(const int* param_stack_host, const int* param_stack_dev, int stack_size, libsmm_acc_data_t datatype,
const void* a_data, const void* b_data, void* c_data, int m, int n, int k, int max_kernel_dim, int def_mnk, void* stack_stream,
void* c_stream) {
if (def_mnk != 1) return -1; // inhomogeneous stacks not supported
if (datatype == dbcsr_type_real_8) {
if (m > max_kernel_dim || n > max_kernel_dim || k > max_kernel_dim)
Expand Down
2 changes: 1 addition & 1 deletion src/acc/libsmm_acc/libsmm_acc.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ typedef std::unordered_map<Triplet, kernel_launcher>::iterator kernel_map_iterat
static std::unordered_map<Triplet, kernel_launcher> kernel_handles;

int libsmm_acc_process_blas(const int* param_stack_host, int stack_size, ACC_DRV(stream) stream, int m, int n, int k,
const double* a_data, const double* b_data, double* c_data, ACC_BLAS(Handle_t) * handle);
const double* a_data, const double* b_data, double* c_data);

int libsmm_acc_process_d(const int* param_stack_dev, int stack_size, ACC_DRV(stream) stream, int m, int n, int k,
const double* a_data, const double* b_data, double* c_data);
Expand Down
Loading

0 comments on commit 2e4f799

Please sign in to comment.