Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

CudnnFind() usage improvements #12804

Merged
merged 14 commits into from
Oct 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,4 @@ List of Contributors
* [LuckyPigeon](https://github.com/LuckyPigeon)
* [Anton Chernov](https://github.com/lebeg)
* [Denisa Roberts](https://github.com/D-Roberts)
* [Dick Carter](https://github.com/DickJC123)
4 changes: 4 additions & 0 deletions docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
* MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF
- Values: Int ```(default=24)```
- The cutoff threshold that decides the rounding strategy. Let's denote the threshold as T. If the memory size is smaller than `2 ** T` (by default, it's 2 ** 24 = 16MB), it rounds to the smallest `2 ** n` that is larger than the requested memory size; if the memory size is larger than `2 ** T`, it rounds to the next k * 2 ** T.
* MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE
- Values: Int ```(default=2097152)```
- When using the naive pool type, memory allocations larger than this threshold are rounded up to a multiple of this value.
- The default was chosen to minimize global memory fragmentation within the GPU driver. Set this to 1 to disable.

## Engine Type

Expand Down
14 changes: 7 additions & 7 deletions include/mxnet/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,11 @@ struct Context {
/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return No return value
*/
inline static void GetGPUMemoryInformation(int dev, int *free, int *total);
inline static void GetGPUMemoryInformation(int dev, uint64_t *free, uint64_t *total);
/*!
* Create a pinned CPU context.
* \param dev_id the device id for corresponding GPU.
Expand Down Expand Up @@ -334,8 +334,8 @@ inline int32_t Context::GetGPUCount() {
#endif
}

inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
int *total_mem) {
inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem,
uint64_t *total_mem) {
#if MXNET_USE_CUDA

size_t memF, memT;
Expand All @@ -354,8 +354,8 @@ inline void Context::GetGPUMemoryInformation(int dev, int *free_mem,
e = cudaSetDevice(curDevice);
CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);

*free_mem = static_cast<int>(memF);
*total_mem = static_cast<int>(memT);
*free_mem = static_cast<uint64_t>(memF);
*total_mem = static_cast<uint64_t>(memT);

#else
LOG(FATAL)
Expand Down
10 changes: 10 additions & 0 deletions include/mxnet/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -441,13 +441,23 @@ MXNET_DLL int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* Note: Deprecated, use MXGetGPUMemoryInformation64 instead.
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem);

/*!
* \brief get the MXNet library version as an integer
* \param pointer to the integer holding the version number
Expand Down
10 changes: 10 additions & 0 deletions perl-package/AI-MXNetCAPI/mxnet.i
Original file line number Diff line number Diff line change
Expand Up @@ -344,13 +344,23 @@ int MXGetGPUCount(int* out);

/*!
* \brief get the free and total available memory on a GPU
* Note: deprecated, use MXGetGPUMemoryInformation64().
* \param dev the GPU number to query
* \param free_mem pointer to the integer holding free GPU memory
* \param total_mem pointer to the integer holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
int MXGetGPUMemoryInformation(int dev, int *out, int *out);

/*!
* \brief get the free and total available memory on a GPU
* \param dev the GPU number to query
* \param free_mem pointer to the uint64_t holding free GPU memory
* \param total_mem pointer to the uint64_t holding total GPU memory
* \return 0 when success, -1 when failure happens
*/
int MXGetGPUMemoryInformation64(int dev, uint64_t *out, uint64_t *out);


//-------------------------------------
// Part 1: NDArray creation and deletion
Expand Down
24 changes: 24 additions & 0 deletions python/mxnet/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,30 @@ def num_gpus():
check_call(_LIB.MXGetGPUCount(ctypes.byref(count)))
return count.value

def gpu_memory_info(device_id=0):
    """Query CUDA for the free and total bytes of GPU global memory.

    Wraps ``MXGetGPUMemoryInformation64``, which reports 64-bit byte counts
    (GPU memory sizes routinely exceed the 32-bit int range).

    Parameters
    ----------
    device_id : int, optional
        The device id of the GPU device.

    Raises
    ------
    Will raise an exception on any CUDA error.

    Returns
    -------
    (free, total) : (int, int)
        The number of free bytes and the total number of bytes of GPU
        global memory available for allocation on the queried device.
        Note that 'total' may be less than the GPU's published memory size,
        since the driver reserves some memory for its own control structures.
    """
    free = ctypes.c_uint64()
    total = ctypes.c_uint64()
    dev_id = ctypes.c_int(device_id)
    check_call(_LIB.MXGetGPUMemoryInformation64(dev_id, ctypes.byref(free), ctypes.byref(total)))
    return (free.value, total.value)

def current_context():
"""Returns the current context.

Expand Down
11 changes: 11 additions & 0 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,18 @@ int MXGetGPUCount(int* out) {
API_END();
}

// Deprecated: use MXGetGPUMemoryInformation64() instead.
// The int-sized outputs of this legacy API cannot represent memory sizes of
// 2 GB or more.  Such values are clamped to INT_MAX here rather than narrowed,
// since converting an out-of-range uint64_t to int yields an
// implementation-defined (typically negative) result.
int MXGetGPUMemoryInformation(int dev, int *free_mem, int *total_mem) {
  API_BEGIN();
  uint64_t free_mem64 = 0UL;
  uint64_t total_mem64 = 0UL;
  Context::GetGPUMemoryInformation(dev, &free_mem64, &total_mem64);
  // Largest value representable in a 32-bit signed int (avoids needing <climits>).
  const uint64_t int_max = 0x7FFFFFFFUL;
  *free_mem = static_cast<int>(free_mem64 < int_max ? free_mem64 : int_max);
  *total_mem = static_cast<int>(total_mem64 < int_max ? total_mem64 : int_max);
  API_END();
}

int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t *total_mem) {
API_BEGIN();
Context::GetGPUMemoryInformation(dev, free_mem, total_mem);
API_END();
Expand Down
66 changes: 28 additions & 38 deletions src/operator/nn/cudnn/cudnn_algoreg-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <mutex>
#include <string>
#include <vector>
#include <functional>
#include <utility>
#include "../../../common/cuda_utils.h"
#include "../convolution-inl.h"
#include "../deconvolution-inl.h"
Expand Down Expand Up @@ -65,7 +67,11 @@ class CuDNNAlgo {
template<typename ParamType>
class CuDNNAlgoReg {
public:
bool Find(const ParamType &param,
using AlgoSetter_t = std::function<void(CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *)>;

void FindOrElseRegister(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
Expand All @@ -75,7 +81,8 @@ class CuDNNAlgoReg {
bool add_to_weight,
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt,
const AlgoSetter_t &algo_setter) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
Expand All @@ -85,45 +92,28 @@ class CuDNNAlgoReg {
*fwd = i->second.fwd;
*bwd = i->second.bwd;
*flt = i->second.flt;
return true;
}
return false;
}

void Register(const ParamType &param,
const std::vector<TShape> &in_shape,
const std::vector<TShape> &out_shape,
cudnnDataType_t cudnn_data_type,
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
} else {
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
"algorithm, "
"this can take a while... (setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)";
if (reg_.size() >= 1000) {
// Many people are very concerned about this warning, so change the warning once.
if (!is_warning_autotune_) {
LOG(INFO)
<< "If you see this message in the middle of training, you are "
"probably using bucketing. Consider setting env variable "
"MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable cudnn tuning.";
is_warning_autotune_ = true;
}
}
}
// Call provided function to determine the algos- likely uses cudnnFind() or cudnnGet()
algo_setter(fwd, bwd, flt);
// Save result so future lookups hit in this registry
reg_.insert(std::pair<ParamKey, CudnnAlgorithms>(key, CudnnAlgorithms{*fwd, *bwd, *flt}));
}
reg_[key].fwd = fwd;
reg_[key].bwd = bwd;
reg_[key].flt = flt;
}

static CuDNNAlgoReg *Get();
Expand Down
Loading