Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Histogram refactor #1003

Merged
merged 6 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 3 additions & 58 deletions src/cunumeric/stat/histogram.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,65 +55,10 @@ struct HistogramImplBody<VariantKind::CPU, CODE> {
const AccessorRD<SumReduction<WeightType>, true, 1>& result,
const Rect<1>& result_rect) const
{
namespace det_acc = detail::accessors;
auto exe_pol = thrust::host;

auto exe_pol = thrust::host;
auto&& [src_size, src_copy, src_ptr] = det_acc::make_accessor_copy(exe_pol, src, src_rect);

auto&& [weights_size, weights_copy, weights_ptr] =
det_acc::make_accessor_copy(exe_pol, weights, weights_rect);

assert(weights_size == src_size);

auto&& [bins_size, bins_ptr] = det_acc::get_accessor_ptr(bins, bins_rect);

auto num_intervals = bins_size - 1;
Buffer<WeightType> local_result = create_buffer<WeightType>(num_intervals);

WeightType* local_result_ptr = local_result.ptr(0);

auto&& [global_result_size, global_result_ptr] = det_acc::get_accessor_ptr(result, result_rect);

#ifdef _DEBUG
std::cout << "echo src, bins, weights:\n";

std::copy_n(src_copy.ptr(0), src_size, std::ostream_iterator<VAL>{std::cout, ", "});
std::cout << "\n";

std::copy_n(bins_ptr, num_intervals + 1, std::ostream_iterator<BinType>{std::cout, ", "});
std::cout << "\n";

std::copy_n(weights_copy.ptr(0), src_size, std::ostream_iterator<WeightType>{std::cout, ", "});
std::cout << "\n";

#endif
detail::histogram_weights(exe_pol,
src_copy.ptr(0),
src_size,
bins_ptr,
num_intervals,
local_result_ptr,
weights_copy.ptr(0));

// fold into RD result:
//
assert(num_intervals == global_result_size);

#ifdef _DEBUG
std::cout << "result:\n";

std::copy_n(
local_result_ptr, num_intervals, std::ostream_iterator<WeightType>{std::cout, ", "});
std::cout << "\n";
#endif

thrust::transform(
exe_pol,
local_result_ptr,
local_result_ptr + num_intervals,
global_result_ptr,
global_result_ptr,
[](auto local_value, auto global_value) { return local_value + global_value; });
detail::histogram_wrapper(
exe_pol, src, src_rect, bins, bins_rect, weights, weights_rect, result, result_rect);
}
};

Expand Down
107 changes: 2 additions & 105 deletions src/cunumeric/stat/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,115 +60,12 @@ struct HistogramImplBody<VariantKind::GPU, CODE> {
const AccessorRD<SumReduction<WeightType>, true, 1>& result,
const Rect<1>& result_rect) const
{
namespace det_acc = detail::accessors;

auto stream = get_cached_stream();
cudaStream_t stream_ = static_cast<cudaStream_t>(stream);
auto exe_pol = DEFAULT_POLICY.on(stream);

auto&& [src_size, src_copy, src_ptr] = det_acc::make_accessor_copy(exe_pol, src, src_rect);

auto&& [weights_size, weights_copy, weights_ptr] =
det_acc::make_accessor_copy(exe_pol, weights, weights_rect);

assert(weights_size == src_size);

auto&& [bins_size, bins_ptr] = det_acc::get_accessor_ptr(bins, bins_rect);

auto num_intervals = bins_size - 1;
Buffer<WeightType> local_result = create_buffer<WeightType>(num_intervals);

WeightType* local_result_ptr = local_result.ptr(0);

auto&& [global_result_size, global_result_ptr] = det_acc::get_accessor_ptr(result, result_rect);

CHECK_CUDA_STREAM(stream);

#ifdef _DEBUG
{
// std::vector<bool>: proxy issues; use thrust::host_vector, instead
//
thrust::host_vector<VAL> v_src(src_size, 0);
VAL* v_src_ptr = v_src.data();

CHECK_CUDA(cudaMemcpyAsync(
v_src_ptr, src_ptr, src_size * sizeof(VAL), cudaMemcpyDeviceToHost, stream));

thrust::host_vector<WeightType> v_weights(weights_size, 0);
CHECK_CUDA(cudaMemcpyAsync(&v_weights[0],
weights_ptr,
weights_size * sizeof(WeightType),
cudaMemcpyDeviceToHost,
stream));

thrust::host_vector<BinType> v_bins(bins_size, 0);
CHECK_CUDA(cudaMemcpyAsync(
&v_bins[0], bins_ptr, bins_size * sizeof(BinType), cudaMemcpyDeviceToHost, stream));

CHECK_CUDA(cudaStreamSynchronize(stream));

std::cout << "echo src, bins, weights:\n";

// doesn't compile with __half:
//
// using alias_val_t = typename decltype(v_src)::value_type;
// std::copy(v_src.begin(), v_src.end(), std::ostream_iterator<alias_val_t>{std::cout, ", "});

for (auto&& src_val : v_src) { std::cout << static_cast<double>(src_val) << ", "; }
std::cout << "\n";

std::copy(v_bins.begin(), v_bins.end(), std::ostream_iterator<BinType>{std::cout, ", "});
std::cout << "\n";

std::copy(
v_weights.begin(), v_weights.end(), std::ostream_iterator<WeightType>{std::cout, ", "});
std::cout << "\n";
}
#endif

detail::histogram_weights(exe_pol,
src_copy.ptr(0),
src_size,
bins_ptr,
num_intervals,
local_result_ptr,
weights_copy.ptr(0),
stream_);

CHECK_CUDA_STREAM(stream);

// fold into RD result:
//
assert(num_intervals == global_result_size);

#ifdef _DEBUG
{
std::cout << "local result:\n";

thrust::host_vector<WeightType> v_result(num_intervals, 0);
CHECK_CUDA(cudaMemcpyAsync(&v_result[0],
local_result_ptr,
num_intervals * sizeof(WeightType),
cudaMemcpyDeviceToHost,
stream));

CHECK_CUDA(cudaStreamSynchronize(stream));

std::copy(
v_result.begin(), v_result.end(), std::ostream_iterator<WeightType>{std::cout, ", "});
std::cout << "\n";
}
#endif

thrust::transform(
exe_pol,
local_result_ptr,
local_result_ptr + num_intervals,
global_result_ptr,
global_result_ptr,
[] __device__(auto local_value, auto global_value) { return local_value + global_value; });

CHECK_CUDA_STREAM(stream);
detail::histogram_wrapper(
exe_pol, src, src_rect, bins, bins_rect, weights, weights_rect, result, result_rect, stream_);
}
};

Expand Down
17 changes: 11 additions & 6 deletions src/cunumeric/stat/histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,10 @@ namespace detail {
// device specialization:
//
template <typename exe_policy_t, typename weight_t, typename offset_t>
struct segmented_sum_t<
exe_policy_t,
weight_t,
offset_t,
std::enable_if_t<!std::is_same_v<exe_policy_t, thrust::detail::host_t> &&
!std::is_same_v<exe_policy_t, thrust::system::omp::detail::par_t>>> {
struct segmented_sum_t<exe_policy_t,
weight_t,
offset_t,
std::enable_if_t<!is_host_policy_v<exe_policy_t>>> {
segmented_sum_t(exe_policy_t exe_pol,
weight_t const* p_weights,
size_t n_samples,
Expand Down Expand Up @@ -106,5 +104,12 @@ struct segmented_sum_t<
allocator_t<unsigned char, exe_policy_t> alloc_scratch_{};
};

template <typename exe_policy_t>
struct sync_policy_t<exe_policy_t, std::enable_if_t<!is_host_policy_v<exe_policy_t>>> {
sync_policy_t() {}

void operator()(cudaStream_t stream) { CHECK_CUDA_STREAM(stream); }
};

} // namespace detail
} // namespace cunumeric
20 changes: 14 additions & 6 deletions src/cunumeric/stat/histogram_cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,10 @@ namespace detail {
// host specialization:
//
template <typename exe_policy_t, typename weight_t, typename offset_t>
struct segmented_sum_t<
exe_policy_t,
weight_t,
offset_t,
std::enable_if_t<std::is_same_v<exe_policy_t, thrust::detail::host_t> ||
std::is_same_v<exe_policy_t, thrust::system::omp::detail::par_t>>> {
struct segmented_sum_t<exe_policy_t,
weight_t,
offset_t,
std::enable_if_t<is_host_policy_v<exe_policy_t>>> {
segmented_sum_t(exe_policy_t exe_pol,
weight_t const* p_weights,
size_t n_samples,
Expand Down Expand Up @@ -83,5 +81,15 @@ struct segmented_sum_t<
offset_t* ptr_offsets_{nullptr};
};

template <typename exe_policy_t>
struct sync_policy_t<exe_policy_t, std::enable_if_t<is_host_policy_v<exe_policy_t>>> {
sync_policy_t(void) {}

void operator()(cudaStream_t stream)
{
// purposely empty: there's nothing to sync on host
}
};

} // namespace detail
} // namespace cunumeric
8 changes: 8 additions & 0 deletions src/cunumeric/stat/histogram_gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ namespace detail {
template <typename exe_policy_t, typename weight_t, typename offset_t, typename = void>
struct segmented_sum_t;

template <typename exe_policy_t>
inline constexpr bool is_host_policy_v =
std::is_same_v<exe_policy_t, std::remove_cv_t<decltype(thrust::host)>> ||
std::is_same_v<exe_policy_t, std::remove_cv_t<decltype(thrust::omp::par)>>;

template <typename exe_policy_t, typename = void>
struct sync_policy_t;

namespace accessors {

template <typename element_t>
Expand Down
65 changes: 65 additions & 0 deletions src/cunumeric/stat/histogram_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ struct lower_bound_op_t {
size_t n_intervs_;
};

template <typename weight_t>
struct reduction_op_t {
__host__ __device__ weight_t operator()(weight_t local_value, weight_t global_value)
{
return local_value + global_value;
}
};

template <typename exe_policy_t,
typename elem_t,
typename bin_t,
Expand Down Expand Up @@ -106,5 +114,62 @@ void histogram_weights(exe_policy_t exe_pol,
segsum();
}

template <typename exe_policy_t, typename elem_t, typename bin_t, typename weight_t>
void histogram_wrapper(exe_policy_t exe_pol,
const AccessorRO<elem_t, 1>& src,
const Rect<1>& src_rect,
const AccessorRO<bin_t, 1>& bins,
const Rect<1>& bins_rect,
const AccessorRO<weight_t, 1>& weights,
const Rect<1>& weights_rect,
const AccessorRD<SumReduction<weight_t>, true, 1>& result,
const Rect<1>& result_rect,
cudaStream_t stream = nullptr)
{
auto&& [src_size, src_copy, src_ptr] = accessors::make_accessor_copy(exe_pol, src, src_rect);

auto&& [weights_size, weights_copy, weights_ptr] =
accessors::make_accessor_copy(exe_pol, weights, weights_rect);

assert(weights_size == src_size);

auto&& [bins_size, bins_ptr] = accessors::get_accessor_ptr(bins, bins_rect);

auto num_intervals = bins_size - 1;
Buffer<weight_t> local_result = create_buffer<weight_t>(num_intervals);

weight_t* local_result_ptr = local_result.ptr(0);

auto&& [global_result_size, global_result_ptr] = accessors::get_accessor_ptr(result, result_rect);

sync_policy_t<exe_policy_t> synchronizer;

synchronizer(stream);

histogram_weights(exe_pol,
src_copy.ptr(0),
src_size,
bins_ptr,
num_intervals,
local_result_ptr,
weights_copy.ptr(0),
stream);

synchronizer(stream);

// fold into RD result:
//
assert(num_intervals == global_result_size);

thrust::transform(exe_pol,
local_result_ptr,
local_result_ptr + num_intervals,
global_result_ptr,
global_result_ptr,
reduction_op_t<weight_t>{});

synchronizer(stream);
}

} // namespace detail
} // namespace cunumeric
46 changes: 6 additions & 40 deletions src/cunumeric/stat/histogram_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -55,48 +55,14 @@ struct HistogramImplBody<VariantKind::OMP, CODE> {
const AccessorRD<SumReduction<WeightType>, true, 1>& result,
const Rect<1>& result_rect) const
{
namespace det_acc = detail::accessors;

auto exe_pol = thrust::omp::par;
auto&& [global_result_size, global_result_ptr] = det_acc::get_accessor_ptr(result, result_rect);

auto exe_pol = thrust::omp::par;
#ifdef _USE_THRUST_
auto&& [src_size, src_copy, src_ptr] = det_acc::make_accessor_copy(exe_pol, src, src_rect);

auto&& [weights_size, weights_copy, weights_ptr] =
det_acc::make_accessor_copy(exe_pol, weights, weights_rect);

assert(weights_size == src_size);

auto&& [bins_size, bins_ptr] = det_acc::get_accessor_ptr(bins, bins_rect);

auto num_intervals = bins_size - 1;
Buffer<WeightType> local_result = create_buffer<WeightType>(num_intervals);

WeightType* local_result_ptr = local_result.ptr(0);

detail::histogram_weights(exe_pol,
src_copy.ptr(0),
src_size,
bins_ptr,
num_intervals,
local_result_ptr,
weights_copy.ptr(0),
nullptr);

// fold into RD result:
//
assert(num_intervals == global_result_size);

thrust::transform(
exe_pol,
local_result_ptr,
local_result_ptr + num_intervals,
global_result_ptr,
global_result_ptr,
[](auto local_value, auto global_value) { return local_value + global_value; });
detail::histogram_wrapper(
exe_pol, src, src_rect, bins, bins_rect, weights, weights_rect, result, result_rect);
#else
auto&& [src_size, src_ptr] = det_acc::get_accessor_ptr(src, src_rect);
namespace det_acc = detail::accessors;
auto&& [global_result_size, global_result_ptr] = det_acc::get_accessor_ptr(result, result_rect);
auto&& [src_size, src_ptr] = det_acc::get_accessor_ptr(src, src_rect);

auto&& [weights_size, weights_ptr] = det_acc::get_accessor_ptr(weights, weights_rect);

Expand Down