diff --git a/src/tools/perf/api/libperf.h b/src/tools/perf/api/libperf.h index 2a94e031e0a..9b1da56314b 100644 --- a/src/tools/perf/api/libperf.h +++ b/src/tools/perf/api/libperf.h @@ -48,7 +48,8 @@ typedef enum { typedef enum { UCX_PERF_TEST_TYPE_PINGPONG, /* Ping-pong mode */ - UCX_PERF_TEST_TYPE_PINGPONG_WFE, /* Ping-pong mode with wait for event */ + UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM,/* Ping-pong mode with + ucp_worker_wait_mem() */ UCX_PERF_TEST_TYPE_STREAM_UNI, /* Unidirectional stream */ UCX_PERF_TEST_TYPE_STREAM_BI, /* Bidirectional stream */ UCX_PERF_TEST_TYPE_LAST diff --git a/src/tools/perf/lib/libperf.c b/src/tools/perf/lib/libperf.c index 8a5724df85b..71196f0b54d 100644 --- a/src/tools/perf/lib/libperf.c +++ b/src/tools/perf/lib/libperf.c @@ -324,7 +324,7 @@ void ucx_perf_calc_result(ucx_perf_context_t *perf, ucx_perf_result_t *result) double factor; if ((perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG) || - (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG_WFE)) { + (perf->params.test_type == UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM)) { factor = 2.0; } else { factor = 1.0; diff --git a/src/tools/perf/lib/ucp_tests.cc b/src/tools/perf/lib/ucp_tests.cc index e0d73f7c565..fc6bda9e561 100644 --- a/src/tools/perf/lib/ucp_tests.cc +++ b/src/tools/perf/lib/ucp_tests.cc @@ -198,7 +198,7 @@ class ucp_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: - case UCX_PERF_TEST_TYPE_PINGPONG_WFE: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: *((uint8_t*)buffer + length - 1) = sn; break; case UCX_PERF_TEST_TYPE_STREAM_UNI: @@ -287,13 +287,13 @@ class ucp_perf_test_runner { progress_responder(); } return UCS_OK; - case UCX_PERF_TEST_TYPE_PINGPONG_WFE: - ptr = (volatile uint8_t*)buffer + length - 1; - while (*ptr != sn) { - ucp_worker_wait_mem(worker, (void *)ptr); - progress_responder(); - } - return UCS_OK; + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: + ptr = (volatile uint8_t*)buffer + length - 1; + while (*ptr != sn) { + ucp_worker_wait_mem(worker, (void *)ptr); + progress_responder(); + } + return UCS_OK; case UCX_PERF_TEST_TYPE_STREAM_UNI: return UCS_OK; default: @@ -519,7 +519,7 @@ class ucp_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: - case UCX_PERF_TEST_TYPE_PINGPONG_WFE: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: return run_pingpong(); case UCX_PERF_TEST_TYPE_STREAM_UNI: return run_stream_uni(); @@ -635,7 +635,7 @@ ucs_status_t ucp_perf_test_dispatch(ucx_perf_context_t *perf) { UCS_PP_FOREACH(TEST_CASE_ALL_OSD, perf, (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG), - (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG_WFE), + (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM), (UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI), (UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI), (UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI), diff --git a/src/tools/perf/lib/uct_tests.cc b/src/tools/perf/lib/uct_tests.cc index 57fc5cc6af8..96afbb5a53e 100644 --- a/src/tools/perf/lib/uct_tests.cc +++ b/src/tools/perf/lib/uct_tests.cc @@ -265,7 +265,7 @@ class uct_perf_test_runner { } case UCX_PERF_CMD_PUT: if ((TYPE == UCX_PERF_TEST_TYPE_PINGPONG) || - (TYPE == UCX_PERF_TEST_TYPE_PINGPONG_WFE)) { + (TYPE == UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM)) { /* Put the control word at the latest byte of the IOV message */ set_sn(UCS_PTR_BYTE_OFFSET(buffer, uct_perf_get_buffer_extent(&m_perf.params) - 1), @@ -623,7 +623,7 @@ class uct_perf_test_runner { /* coverity[switch_selector_expr_is_constant] */ switch (TYPE) { case UCX_PERF_TEST_TYPE_PINGPONG: - case UCX_PERF_TEST_TYPE_PINGPONG_WFE: + case UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM: return run_pingpong(); case UCX_PERF_TEST_TYPE_STREAM_UNI: /* coverity[switch_selector_expr_is_constant] */ diff --git a/test/gtest/Makefile.am b/test/gtest/Makefile.am index 9c45bf384bb..70899ec5269 100644 --- a/test/gtest/Makefile.am +++ b/test/gtest/Makefile.am @@ -123,7 +123,6 @@ gtest_SOURCES = \ ucp/test_ucp_mmap.cc \ ucp/test_ucp_mem_type.cc \ ucp/test_ucp_perf.cc \ - ucp/test_ucp_wfe.cc \ ucp/test_ucp_proto.cc \ ucp/test_ucp_rma.cc \ ucp/test_ucp_rma_mt.cc \ diff --git a/test/gtest/common/test_perf.cc b/test/gtest/common/test_perf.cc index b457c20549d..00df12174f5 100644 --- a/test/gtest/common/test_perf.cc +++ b/test/gtest/common/test_perf.cc @@ -18,6 +18,11 @@ extern "C" { #include +#define UCP_ARM_PERF_TEST_MULTIPLIER 2 +#define UCT_ARM_PERF_TEST_MULTIPLIER 15 +#define UCT_PERF_TEST_MULTIPLIER 5 + + test_perf::rte_comm::rte_comm() { pthread_mutex_init(&m_mutex, NULL); } @@ -252,9 +257,8 @@ test_perf::test_result test_perf::run_multi_threaded(const test_spec &test, unsi return result; } -void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, - const std::string &tl_name, const std::string &dev_name, - double *perf_value) +double test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, + const std::string &tl_name, const std::string &dev_name) { std::vector cpus = get_affinity(); if (cpus.size() < 2) { @@ -272,7 +276,7 @@ void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, if ((result.status == UCS_ERR_UNSUPPORTED) || (result.status == UCS_ERR_UNREACHABLE)) { - return; /* Skipped */ + return 0.0; /* Skipped */ } ASSERT_UCS_OK(result.status); @@ -292,20 +296,17 @@ void test_perf::run_test(const test_spec& test, unsigned flags, bool check_perf, UCS_TEST_MESSAGE << result_str << " (attempt " << i << ")"; } - if (perf_value != NULL) { - *perf_value = value; - } - if (!check_perf) { - return; /* Skip */ + return value; /* Skip */ } else if ((value >= test.min) && (value <= test.max)) { - return; /* Success */ + return value; /* Success */ } else { ucs::safe_sleep(ucs::perf_retry_interval); } } - ADD_FAILURE() << "Invalid " << test.title << " performance, expected: " << - std::setprecision(3) << test.min << ".." << test.max; -} + ADD_FAILURE() << "Invalid " << test.title << " performance, expected: " + << std::setprecision(3) << test.min << ".." << test.max; + return 0.0; +} diff --git a/test/gtest/common/test_perf.h b/test/gtest/common/test_perf.h index 2780cdc4556..a3c753afb05 100644 --- a/test/gtest/common/test_perf.h +++ b/test/gtest/common/test_perf.h @@ -35,9 +35,8 @@ class test_perf { static std::vector get_affinity(); - void run_test(const test_spec& test, unsigned flags, bool check_perf, const - std::string &tl_name, const std::string &dev_name, double - *perf_value = NULL); + double run_test(const test_spec& test, unsigned flags, bool check_perf, const + std::string &tl_name, const std::string &dev_name); private: class rte_comm { diff --git a/test/gtest/ucp/test_ucp_perf.cc b/test/gtest/ucp/test_ucp_perf.cc index 009b763498b..448eb0f7a67 100644 --- a/test/gtest/ucp/test_ucp_perf.cc +++ b/test/gtest/ucp/test_ucp_perf.cc @@ -1,7 +1,8 @@ /** * Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* * Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. +* * See file LICENSE for terms. */ @@ -11,7 +12,9 @@ #define MB pow(1024.0, -2) -#define UCP_ARM_PERF_TEST_MULTIPLIER 2 +#define UCT_PERF_TEST_MULTIPLIER 5 +#define UCT_ARM_PERF_TEST_MULTIPLIER 15 + class test_ucp_perf : public ucp_test, public test_perf { public: static void get_test_variants(std::vector& variants) { @@ -37,7 +40,7 @@ class test_ucp_perf : public ucp_test, public test_perf { // Ignore errors that transport cannot reach peer if (level == UCS_LOG_LEVEL_ERROR) { std::string err_str = format_message(message, ap); - if (strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNREACHABLE)) || + if (strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNREACHABLE)) || strstr(err_str.c_str(), ucs_status_string(UCS_ERR_UNSUPPORTED))) { UCS_TEST_MESSAGE << err_str; return UCS_LOG_FUNC_RC_STOP; @@ -187,16 +190,62 @@ UCS_TEST_P(test_ucp_perf, envelope) { ucs::scoped_setenv warn_invalid("UCX_WARN_INVALID_CONFIG", "no"); /* Run all tests */ - for (const test_spec *test_iter = tests; test_iter->title != NULL; ++test_iter) { + for (const test_spec *test_iter = tests; test_iter->title != NULL; + ++test_iter) { test_spec test = *test_iter; if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) { - test.max *= UCP_ARM_PERF_TEST_MULTIPLIER; - test.min /= UCP_ARM_PERF_TEST_MULTIPLIER; + test.max *= UCT_ARM_PERF_TEST_MULTIPLIER; + test.min /= UCT_ARM_PERF_TEST_MULTIPLIER; + } else { + test.max *= UCT_PERF_TEST_MULTIPLIER; + test.min /= UCT_PERF_TEST_MULTIPLIER; } test.iters = ucs_min(test.iters, max_iter); + run_test(test, 0, check_perf, "", ""); } } UCP_INSTANTIATE_TEST_CASE(test_ucp_perf) + + +class test_ucp_wait_mem : public test_ucp_perf {}; + +UCS_TEST_P(test_ucp_wait_mem, envelope) { + double perf_avg = 0; + double perf_iter = 0; + const int max_iter = ucs_max(ucs::perf_retry_count, 1); + int i; + + /* Run ping-pong with no WFE and get latency reference values */ + const test_spec test1 = { "put latency reference", "usec", + UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, + UCX_PERF_TEST_TYPE_PINGPONG, + UCP_PERF_DATATYPE_CONTIG, + 0, 1, { 8 }, 1, 1000lu, + ucs_offsetof(ucx_perf_result_t, + latency.total_average), + 1e6, 0.001, 30.0, 0 }; + for (i = 0; i < max_iter; i++) { + perf_iter = run_test(test1, 0, false, "", ""); + perf_avg += perf_iter; + } + perf_avg /= max_iter; + + /* Run ping-pong with WFE while re-using previous run numbers as + * a min/max boundary. The latency of the WFE run should stay nearly + * identical with 200 percent margin. When WFE does not work as expected + * the slow down is typically 10x-100x */ + const test_spec test2 = { "put latency with ucp_worker_wait_mem()", + "usec", UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, + UCX_PERF_TEST_TYPE_PINGPONG_WAIT_MEM, + UCP_PERF_DATATYPE_CONTIG, + 0, 1, { 8 }, 1, 1000lu, + ucs_offsetof(ucx_perf_result_t, + latency.total_average), + 1e6, perf_avg * 0.7, perf_avg * 2, 0 }; + run_test(test2, 0, true, "", ""); +} + +UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wait_mem, shm, "shm") diff --git a/test/gtest/ucp/test_ucp_wfe.cc b/test/gtest/ucp/test_ucp_wfe.cc deleted file mode 100644 index 5a4b8184fbd..00000000000 --- a/test/gtest/ucp/test_ucp_wfe.cc +++ /dev/null @@ -1,105 +0,0 @@ -/** -* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED. -* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED. -* Copyright (C) ARM Ltd. 2020. ALL RIGHTS RESERVED. -* -* See file LICENSE for terms. -*/ - -#include "ucp_test.h" - -#include - - -#define MB pow(1024.0, -2) -#define UCP_ARM_PERF_TEST_MULTIPLIER 2 - - -class test_ucp_wfe : public ucp_test, public test_perf { -public: - static void get_test_variants(std::vector& variants) { - add_variant(variants, 0); - } - -protected: - virtual void init() { - test_base::init(); /* Skip entities creation in ucp_test */ - ucs_log_push_handler(log_handler); - } - - virtual void cleanup() { - ucs_log_pop_handler(); - test_base::cleanup(); - } - - static ucs_log_func_rc_t - log_handler(const char *file, unsigned line, const char *function, - ucs_log_level_t level, - const ucs_log_component_config_t *comp_conf, - const char *message, va_list ap) { - // Ignore errors that transport cannot reach peer - if (level == UCS_LOG_LEVEL_ERROR) { - std::string err_str = format_message(message, ap); - if ((err_str.find(ucs_status_string(UCS_ERR_UNREACHABLE)) != std::string::npos) || - (err_str.find(ucs_status_string(UCS_ERR_UNSUPPORTED)) != std::string::npos)) { - UCS_TEST_MESSAGE << err_str; - return UCS_LOG_FUNC_RC_STOP; - } - } - return UCS_LOG_FUNC_RC_CONTINUE; - } - - const static test_spec tests[]; -}; - - -enum { - UCX_PERF_TEST_LAT_NO_WFE, - UCX_PERF_TEST_LAT_WITH_WFE -}; - - -const test_perf::test_spec test_ucp_wfe::tests[] = -{ - { "put latency", "usec", - UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 1000lu, - ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, - 0 }, - - { "put latency with WFE", "usec", - UCX_PERF_API_UCP, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG_WFE, - UCP_PERF_DATATYPE_CONTIG, 0, 1, { 8 }, 1, 1000lu, - ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.001, 30.0, - 0 } -}; - - -UCS_TEST_P(test_ucp_wfe, envelope) { - double perf_value = 0; - int cache_perf_retry_count; - test_spec test; - - test = tests[UCX_PERF_TEST_LAT_NO_WFE]; - if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) { - test.max *= UCP_ARM_PERF_TEST_MULTIPLIER; - test.min /= UCP_ARM_PERF_TEST_MULTIPLIER; - } - /* Run ping-pong with no WFE and get latency reference values */ - run_test(test, 0, false, "", "", &perf_value); - /* Run ping-pong with WFE while re-using previous run numbers as a min/max - * boundary. The latency of the WFE run should stay nearly identical with 250 - * percent margin. When WFE does not work as expected the slow down is - * typically 10x-100x */ - test = tests[UCX_PERF_TEST_LAT_WITH_WFE]; - test.max = perf_value * 2.5; - test.min = perf_value * 0.7; - cache_perf_retry_count = ucs::perf_retry_count; - ucs::perf_retry_count = 3; /* Have to be set to 1 otherwise the performance - measurement is ignored */ - run_test(test, 0, false, "", ""); - /* restore global value for perf retry */ - ucs::perf_retry_count = cache_perf_retry_count; -} - -UCP_INSTANTIATE_TEST_CASE_TLS(test_ucp_wfe, shm, "shm,")