diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh
index 1b9fc54cec02..c9b2696b9fd2 100755
--- a/contrib/test_jenkins.sh
+++ b/contrib/test_jenkins.sh
@@ -1060,40 +1060,33 @@ test_malloc_hook() {
 		cuda_dynamic_exe=./test/apps/test_cuda_hook_dynamic
 		cuda_static_exe=./test/apps/test_cuda_hook_static
 
-		for mode in reloc bistro
-		do
-			export UCX_MEM_CUDA_HOOK_MODE=${mode}
-
-			# Run cuda memory hooks with dynamic link
-			${cuda_dynamic_exe}
+		# Run cuda memory hooks with dynamic link
+		${cuda_dynamic_exe}
 
-			# Run cuda memory hooks with static link, if exists. If the static
-			# library 'libcudart_static.a' is not present, static test will not
-			# be built.
-			if [ -x ${cuda_static_exe} ]
+		# Run cuda memory hooks with static link, if exists. If the static
+		# library 'libcudart_static.a' is not present, static test will not
+		# be built.
+		if [ -x ${cuda_static_exe} ]
+		then
+			${cuda_static_exe} && status="pass" || status="fail"
+			[ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail"
+			if [ ${status} == ${exp_status} ]
 			then
-				${cuda_static_exe} && status="pass" || status="fail"
-				[ ${mode} == "bistro" ] && exp_status="pass" || exp_status="fail"
-				if [ ${status} == ${exp_status} ]
-				then
-					echo "Static link with cuda ${status}, as expected"
-				else
-					echo "Static link with cuda is expected to ${exp_status}, actual: ${status}"
-					exit 1
-				fi
+				echo "Static link with cuda ${status}, as expected"
+			else
+				echo "Static link with cuda is expected to ${exp_status}, actual: ${status}"
+				exit 1
 			fi
+		fi
 
-			# Test that driver API hooks work in both reloc and bistro modes,
-			# since we call them directly from the test
-			${cuda_dynamic_exe} -d
-			[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d
-
-			# Test hooks in gtest
-			UCX_MEM_LOG_LEVEL=diag \
-				./test/gtest/gtest --gtest_filter='cuda_hooks.*'
+		# Test that driver API hooks work in both reloc and bistro modes,
+		# since we call them directly from the test
+		${cuda_dynamic_exe} -d
+		[ -x ${cuda_static_exe} ] && ${cuda_static_exe} -d
 
-			unset UCX_MEM_CUDA_HOOK_MODE
-		done
+		# Test hooks in gtest
+		UCX_MEM_LOG_LEVEL=diag \
+			./test/gtest/gtest --gtest_filter='cuda_hooks.*'
 	fi
 }
 
diff --git a/src/ucm/cuda/cudamem.c b/src/ucm/cuda/cudamem.c
index 23a9c5d18061..66309a115bcc 100644
--- a/src/ucm/cuda/cudamem.c
+++ b/src/ucm/cuda/cudamem.c
@@ -87,15 +87,6 @@ UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFree_v2, CUresult, -1, CUdeviceptr)
 UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost, CUresult, -1, void*)
 UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cuMemFreeHost_v2, CUresult, -1, void*)
 
-/* Runtime API */
-UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFree, cudaError_t, -1, void*)
-UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaFreeHost, cudaError_t, -1, void*)
-UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMalloc, cudaError_t, -1, void**, size_t)
-UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocManaged, cudaError_t, -1, void**,
-                                  size_t, unsigned int)
-UCM_DEFINE_REPLACE_DLSYM_PTR_FUNC(cudaMallocPitch, cudaError_t, -1, void**,
-                                  size_t*, size_t, size_t)
-
 static void ucm_cuda_dispatch_mem_alloc(CUdeviceptr ptr, size_t length,
                                         ucs_memory_type_t mem_type)
 {
@@ -160,100 +151,55 @@ UCM_CUDA_FREE_FUNC(cuMemFree_v2, CUresult, CUdeviceptr, UCS_MEMORY_TYPE_CUDA)
 UCM_CUDA_FREE_FUNC(cuMemFreeHost, CUresult, void*, UCS_MEMORY_TYPE_HOST)
 UCM_CUDA_FREE_FUNC(cuMemFreeHost_v2, CUresult, void*, UCS_MEMORY_TYPE_HOST)
 
-static ucm_cuda_func_t ucm_cuda_driver_funcs[] = {
-    UCM_CUDA_FUNC_ENTRY(cuMemAlloc),
-    UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2),
-    UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
-    UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
-    UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
-    UCM_CUDA_FUNC_ENTRY(cuMemFree),
-    UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
-    UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
-    UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
-    {{NULL}, NULL}
-};
-
-/* Runtime API replacements */
-UCM_CUDA_ALLOC_FUNC(cudaMalloc, UCS_MEMORY_TYPE_CUDA, cudaError_t, cudaSuccess,
-                    arg0, void*, "size=%zu", size_t)
-UCM_CUDA_ALLOC_FUNC(cudaMallocManaged, UCS_MEMORY_TYPE_CUDA_MANAGED,
-                    cudaError_t, cudaSuccess, arg0, void*,
-                    "size=%zu flags=0x%x", size_t, unsigned)
-UCM_CUDA_ALLOC_FUNC(cudaMallocPitch, UCS_MEMORY_TYPE_CUDA, cudaError_t,
-                    cudaSuccess, (size_t)arg1 * arg2, void*,
-                    "pitch=%p width=%zu height=%zu", size_t*, size_t, size_t)
-UCM_CUDA_FREE_FUNC(cudaFree, cudaError_t, void*, UCS_MEMORY_TYPE_CUDA)
-UCM_CUDA_FREE_FUNC(cudaFreeHost, cudaError_t, void*, UCS_MEMORY_TYPE_HOST)
-
-static ucm_cuda_func_t ucm_cuda_runtime_funcs[] = {
-    UCM_CUDA_FUNC_ENTRY(cudaFree),
-    UCM_CUDA_FUNC_ENTRY(cudaFreeHost),
-    UCM_CUDA_FUNC_ENTRY(cudaMalloc),
-    UCM_CUDA_FUNC_ENTRY(cudaMallocManaged),
-    UCM_CUDA_FUNC_ENTRY(cudaMallocPitch),
-    {{NULL}, NULL}
-};
-
 static int ucm_cuda_allow_hook_mode(ucm_mmap_hook_mode_t mode)
 {
     return (ucm_global_opts.cuda_hook_modes & UCS_BIT(mode)) &&
            (ucm_get_hook_mode(mode) == mode);
 }
 
-static ucs_status_t
-ucm_cuda_install_hooks(ucm_cuda_func_t *funcs, int *used_reloc,
-                       const char *name)
+static ucs_status_t ucm_cuda_install_hooks()
 {
+    static ucm_cuda_func_t funcs[] = {
+        UCM_CUDA_FUNC_ENTRY(cuMemAlloc),
+        UCM_CUDA_FUNC_ENTRY(cuMemAlloc_v2),
+        UCM_CUDA_FUNC_ENTRY(cuMemAllocManaged),
+        UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch),
+        UCM_CUDA_FUNC_ENTRY(cuMemAllocPitch_v2),
+        UCM_CUDA_FUNC_ENTRY(cuMemFree),
+        UCM_CUDA_FUNC_ENTRY(cuMemFree_v2),
+        UCM_CUDA_FUNC_ENTRY(cuMemFreeHost),
+        UCM_CUDA_FUNC_ENTRY(cuMemFreeHost_v2),
+        {{NULL}, NULL}
+    };
     const char UCS_V_UNUSED *hook_mode;
-    unsigned num_bistro, num_reloc;
     ucm_cuda_func_t *func;
     ucs_status_t status;
+    unsigned num_hooks;
     void *func_ptr;
 
-    num_bistro  = 0;
-    num_reloc   = 0;
+    num_hooks = 0;
     for (func = funcs; func->patch.symbol != NULL; ++func) {
         func_ptr = ucm_reloc_get_orig(func->patch.symbol, func->patch.value);
         if (func_ptr == NULL) {
             continue;
         }
 
-        status = UCS_ERR_UNSUPPORTED;
-
-        if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
-            status = ucm_bistro_patch(func_ptr, func->patch.value,
-                                      func->patch.symbol, func->orig_func_ptr,
-                                      NULL);
-            if (status == UCS_OK) {
-                ucm_debug("installed bistro hook for '%s': %d",
-                          func->patch.symbol, status);
-                ++num_bistro;
-                continue;
-            }
-
+        status = ucm_bistro_patch(func_ptr, func->patch.value,
+                                  func->patch.symbol, func->orig_func_ptr,
+                                  NULL);
+        if (status != UCS_OK) {
             ucm_debug("failed to install bistro hook for '%s'",
                       func->patch.symbol);
+            return status;
         }
 
-        if (ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_RELOC)) {
-            status = ucm_reloc_modify(&func->patch);
-            if (status == UCS_OK) {
-                ++num_reloc;
-                ucm_debug("installed reloc hook on '%s'", func->patch.symbol);
-                continue;
-            }
-
-            ucm_debug("failed to install relocation table hook for '%s'",
-                      func->patch.symbol);
-        }
-
-        ucm_diag("failed to install hook for '%s'", func->patch.symbol);
-        return status;
+        ucm_debug("installed bistro hook for '%s': %d", func->patch.symbol,
+                  status);
+        ++num_hooks;
     }
 
-    *used_reloc = num_reloc > 0;
-    ucm_info("cuda memory hooks on %s API: installed %u bistro and %u reloc",
-             name, num_bistro, num_reloc);
+    ucm_info("installed %u/%zu cuda memory hooks", num_hooks,
+             ucs_static_array_size(funcs) - 1);
     return UCS_OK;
 }
 
@@ -262,7 +208,6 @@ static ucs_status_t ucm_cudamem_install(int events)
     static int ucm_cudamem_installed     = 0;
     static pthread_mutex_t install_mutex = PTHREAD_MUTEX_INITIALIZER;
     ucs_status_t status                  = UCS_OK;
-    int used_reloc;
 
     if (!(events & (UCM_EVENT_MEM_TYPE_ALLOC | UCM_EVENT_MEM_TYPE_FREE))) {
         goto out;
@@ -274,30 +219,26 @@ static ucs_status_t ucm_cudamem_install(int events)
         goto out;
     }
 
+    if (!ucm_cuda_allow_hook_mode(UCM_MMAP_HOOK_BISTRO)) {
+        ucm_diag("cuda hooks require bistro mode enabled");
+        status = UCS_ERR_UNSUPPORTED;
+        goto out;
+    }
+
     pthread_mutex_lock(&install_mutex);
 
     if (ucm_cudamem_installed) {
         goto out_unlock;
     }
 
-    status = ucm_cuda_install_hooks(ucm_cuda_driver_funcs, &used_reloc,
-                                    "driver");
+    status = ucm_cuda_install_hooks();
     if (status != UCS_OK) {
-        ucm_warn("failed to install cuda memory hooks on driver API");
-    } else if (!used_reloc) {
-        ucm_cudamem_installed = 1;
-    } else if (status == UCS_OK) {
-        /* Failed to install bistro hooks on all driver APIs, so need to install
-           hooks on runtime APIs. */
-        status = ucm_cuda_install_hooks(ucm_cuda_runtime_funcs, &used_reloc,
-                                        "runtime");
-        if (status == UCS_OK) {
-            ucm_cudamem_installed = 1;
-        } else {
-            ucm_warn("failed to install cuda memory hooks on runtime API")
-        }
+        ucm_diag("failed to install cuda memory hooks on driver API");
+        goto out_unlock;
     }
 
+    ucm_cudamem_installed = 1;
+
 out_unlock:
     pthread_mutex_unlock(&install_mutex);
 out:
diff --git a/src/ucm/cuda/cudamem.h b/src/ucm/cuda/cudamem.h
index cd5a4087e27e..0939d201a8c7 100644
--- a/src/ucm/cuda/cudamem.h
+++ b/src/ucm/cuda/cudamem.h
@@ -7,7 +7,6 @@
 #ifndef UCM_CUDAMEM_H_
 #define UCM_CUDAMEM_H_
 
-#include <cuda_runtime.h>
 #include <cuda.h>
 
 
@@ -25,11 +24,4 @@ CUresult ucm_cuMemFree_v2(CUdeviceptr dptr);
 CUresult ucm_cuMemFreeHost(void *p);
 CUresult ucm_cuMemFreeHost_v2(void *p);
 
-cudaError_t ucm_cudaFree(void *devPtr);
-cudaError_t ucm_cudaFreeHost(void *ptr);
-cudaError_t ucm_cudaMalloc(void **devPtr, size_t size);
-cudaError_t ucm_cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
-cudaError_t ucm_cudaMallocPitch(void **devPtr, size_t *pitch,
-                                size_t width, size_t height);
-
 #endif
diff --git a/src/ucm/util/reloc.c b/src/ucm/util/reloc.c
index bee825b95b76..ab4404f716c3 100644
--- a/src/ucm/util/reloc.c
+++ b/src/ucm/util/reloc.c
@@ -28,6 +28,7 @@
 #include <fcntl.h>
 #include <link.h>
 #include <limits.h>
+#include <sys/auxv.h>
 
 /* Ensure this macro is defined (from <link.h>) - otherwise, cppcheck might
    fail with an "unknown macro" warning */
@@ -116,6 +117,13 @@ static ucs_status_t ucm_reloc_get_aux_phsize(int *phsize_p)
         return UCS_OK;
     }
 
+    phsize = getauxval(AT_PHENT);
+    if (phsize > 0) {
+        ucm_debug("phsize=%d from api", phsize);
+        *phsize_p = phsize;
+        return UCS_OK;
+    }
+
     fd = open(proc_auxv_filename, O_RDONLY);
     if (fd < 0) {
         ucm_error("failed to open '%s' for reading: %m", proc_auxv_filename);
diff --git a/src/ucs/config/global_opts.c b/src/ucs/config/global_opts.c
index d0c32af2fed7..49794804364b 100644
--- a/src/ucs/config/global_opts.c
+++ b/src/ucs/config/global_opts.c
@@ -37,7 +37,7 @@ ucs_global_opts_t ucs_global_opts = {
     .debug_signo           = SIGHUP,
     .log_level_trigger     = UCS_LOG_LEVEL_FATAL,
     .warn_unused_env_vars  = 1,
-    .enable_memtype_cache  = 1,
+    .enable_memtype_cache  = UCS_TRY,
     .async_max_events      = 64,
     .async_signo           = SIGALRM,
     .stats_dest            = "",
@@ -145,9 +145,9 @@ static ucs_config_field_t ucs_global_opts_table[] = {
   "configuration parser.",
   ucs_offsetof(ucs_global_opts_t, warn_unused_env_vars), UCS_CONFIG_TYPE_BOOL},
 
-  {"MEMTYPE_CACHE", "y",
+  {"MEMTYPE_CACHE", "try",
    "Enable memory type (cuda/rocm) cache",
-   ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_BOOL},
+   ucs_offsetof(ucs_global_opts_t, enable_memtype_cache), UCS_CONFIG_TYPE_TERNARY},
 
  {"ASYNC_MAX_EVENTS", "1024", /* TODO remove this; resize mpmc */
   "Maximal number of events which can be handled from one context",
diff --git a/src/ucs/config/global_opts.h b/src/ucs/config/global_opts.h
index 6f7a00658b63..1a55c0603038 100644
--- a/src/ucs/config/global_opts.h
+++ b/src/ucs/config/global_opts.h
@@ -80,7 +80,7 @@ typedef struct {
     unsigned                   async_max_events;
 
     /** Memtype cache */
-    int                        enable_memtype_cache;
+    ucs_ternary_auto_value_t   enable_memtype_cache;
 
     /* Destination for statistics: udp:host:port / file:path / stdout
      */
diff --git a/src/ucs/memory/memtype_cache.c b/src/ucs/memory/memtype_cache.c
index bc3eacf7ec2c..ebec3ffd47e2 100644
--- a/src/ucs/memory/memtype_cache.c
+++ b/src/ucs/memory/memtype_cache.c
@@ -24,6 +24,7 @@
 
 
 static ucs_spinlock_t ucs_memtype_cache_global_instance_lock;
+static int ucs_memtype_cache_failed                    = 0;
 ucs_memtype_cache_t *ucs_memtype_cache_global_instance = NULL;
 
 
@@ -42,16 +43,23 @@ static UCS_F_ALWAYS_INLINE ucs_memtype_cache_t *ucs_memtype_cache_get_global()
     ucs_memtype_cache_t *memtype_cache = NULL;
     ucs_status_t status;
 
-    if (!ucs_global_opts.enable_memtype_cache) {
+    if (ucs_global_opts.enable_memtype_cache == UCS_NO) {
         return NULL;
     }
 
     /* Double-check lock scheme */
-    if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL)) {
+    if (ucs_unlikely(ucs_memtype_cache_global_instance == NULL) &&
+        !ucs_memtype_cache_failed) {
         /* Create the memtype cache outside the lock, to avoid a Coverity error
            of lock inversion with UCS_INIT_ONCE from ucm_set_event_handler() */
         status = UCS_CLASS_NEW(ucs_memtype_cache_t, &memtype_cache);
         if (status != UCS_OK) {
+            /* If we failed to create the memtype cache once, do not try again */
+            ucs_memtype_cache_failed = 1;
+            if (ucs_global_opts.enable_memtype_cache == UCS_YES) {
+                ucs_warn("failed to create memtype cache: %s",
+                         ucs_status_string(status));
+            }
             return NULL;
         }
 
@@ -377,9 +385,9 @@ static UCS_CLASS_INIT_FUNC(ucs_memtype_cache_t)
                                    UCM_EVENT_FLAG_EXISTING_ALLOC,
                                    1000, ucs_memtype_cache_event_callback,
                                    self);
-    if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) {
-        ucs_error("failed to set UCM memtype event handler: %s",
-                  ucs_status_string(status));
+    if (status != UCS_OK) {
+        ucs_diag("failed to set UCM memtype event handler: %s",
+                 ucs_status_string(status));
         goto err_cleanup_pgtable;
     }
 
diff --git a/src/ucs/memory/rcache.c b/src/ucs/memory/rcache.c
index 302b457a0a92..80d5d3820353 100644
--- a/src/ucs/memory/rcache.c
+++ b/src/ucs/memory/rcache.c
@@ -1285,6 +1285,8 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
     status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority,
                                    ucs_rcache_unmapped_callback, self);
     if (status != UCS_OK) {
+        ucs_diag("rcache failed to install UCM event handler: %s",
+                 ucs_status_string(status));
         goto err_remove_vfs;
     }