Skip to content

Commit

Permalink
more port
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn committed May 7, 2022
1 parent 59bcb2b commit 29df6e2
Show file tree
Hide file tree
Showing 12 changed files with 101 additions and 105 deletions.
13 changes: 8 additions & 5 deletions CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@
#include <cuda_runtime.h>

#include "DataFormats/BeamSpot/interface/BeamSpotPOD.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

class BeamSpotCUDA {
public:

using buffer = memoryPool::buffer<BeamSpotPOD>;

// default constructor, required by cms::cuda::Product<BeamSpotCUDA>
BeamSpotCUDA() = default;

// constructor that allocates cached device memory on the given CUDA stream
BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique<BeamSpotPOD>(stream); }
BeamSpotCUDA(cudaStream_t stream) { data_d_ = memoryPool::cuda::make_buffer<BeamSpotPOD>(1,stream, memoryPool::onDevice); }

// movable, non-copiable
BeamSpotCUDA(BeamSpotCUDA const&) = delete;
Expand All @@ -23,11 +26,11 @@ class BeamSpotCUDA {
BeamSpotPOD* data() { return data_d_.get(); }
BeamSpotPOD const* data() const { return data_d_.get(); }

cms::cuda::device::unique_ptr<BeamSpotPOD>& ptr() { return data_d_; }
cms::cuda::device::unique_ptr<BeamSpotPOD> const& ptr() const { return data_d_; }
buffer & ptr() { return data_d_; }
buffer const& ptr() const { return data_d_; }

private:
cms::cuda::device::unique_ptr<BeamSpotPOD> data_d_;
buffer data_d_;
};

#endif // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
66 changes: 57 additions & 9 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@

#include "DataFormats/SiPixelRawData/interface/SiPixelErrorCompact.h"
#include "DataFormats/SiPixelRawData/interface/SiPixelFormatterErrors.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"


class SiPixelDigiErrorsCUDA {
public:
using SiPixelErrorCompactVector = cms::cuda::SimpleVector<SiPixelErrorCompact>;

SiPixelDigiErrorsCUDA() = default;
explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
inline SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
~SiPixelDigiErrorsCUDA() = default;

SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
Expand All @@ -27,18 +27,66 @@ class SiPixelDigiErrorsCUDA {
SiPixelErrorCompactVector* error() { return error_d.get(); }
SiPixelErrorCompactVector const* error() const { return error_d.get(); }

using HostDataError = std::pair<SiPixelErrorCompactVector, cms::cuda::host::unique_ptr<SiPixelErrorCompact[]>>;
HostDataError dataErrorToHostAsync(cudaStream_t stream) const;
using HostDataError = std::pair<SiPixelErrorCompactVector, memoryPool::buffer<SiPixelErrorCompact>>;
inline HostDataError dataErrorToHostAsync(cudaStream_t stream) const;

void copyErrorToHostAsync(cudaStream_t stream);
inline void copyErrorToHostAsync(cudaStream_t stream);
int nErrorWords() const { return nErrorWords_; }

private:
cms::cuda::device::unique_ptr<SiPixelErrorCompact[]> data_d;
cms::cuda::device::unique_ptr<SiPixelErrorCompactVector> error_d;
cms::cuda::host::unique_ptr<SiPixelErrorCompactVector> error_h;
memoryPool::buffer<SiPixelErrorCompact> data_d;
memoryPool::buffer<SiPixelErrorCompactVector> error_d;
memoryPool::buffer<SiPixelErrorCompactVector> error_h;
SiPixelFormatterErrors formatterErrors_h;
int nErrorWords_ = 0;
};


#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream) :
formatterErrors_h(std::move(errors)),
nErrorWords_(maxFedWords) {
assert(maxFedWords != 0);

memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
assert(deleter.pool());

data_d = memoryPool::cuda::make_buffer<SiPixelErrorCompact>(maxFedWords, deleter);
error_d = memoryPool::cuda::make_buffer<SiPixelErrorCompactVector>(1,deleter);
error_h = memoryPool::cuda::make_buffer<SiPixelErrorCompactVector>(1,stream,memoryPool::onHost);


cudaMemsetAsync(data_d.get(), 0x00, maxFedWords, stream);

cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
assert(error_h->empty());
assert(error_h->capacity() == static_cast<int>(maxFedWords));

cudaCheck(memoryPool::cuda::copy(error_d, error_h, 1,stream));
}

void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
cudaCheck(memoryPool::cuda::copy(error_h, error_d, 1,stream));
}

SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
// On one hand size() could be sufficient. On the other hand, if
// someone copies the SimpleVector<>, (s)he might expect the data
// buffer to actually have space for capacity() elements.
auto data = memoryPool::cuda::make_buffer<SiPixelErrorCompact>(error_h->capacity(), stream, memoryPool::onHost);

// but transfer only the required amount
if (not error_h->empty()) {
cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream));
}
auto err = *error_h;
err.set_data(data.get());
return HostDataError(err, std::move(data));
}



#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h
12 changes: 7 additions & 5 deletions CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,15 @@

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"

class SiPixelDigisCUDA {
public:
using StoreType = uint16_t;
SiPixelDigisCUDA() = default;
explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
inline SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
~SiPixelDigisCUDA() = default;

SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete;
Expand All @@ -28,19 +27,22 @@ class SiPixelDigisCUDA {
uint32_t nModules() const { return nModules_h; }
uint32_t nDigis() const { return nDigis_h; }

cms::cuda::host::unique_ptr<StoreType[]> copyAllToHostAsync(cudaStream_t stream) const;
inline memoryPool::buffer<StoreType> copyAllToHostAsync(cudaStream_t stream) const;

SiPixelDigisCUDASOAView view() { return m_view; }
SiPixelDigisCUDASOAView const view() const { return m_view; }

private:
// These are consumed by downstream device code
cms::cuda::device::unique_ptr<StoreType[]> m_store;
memoryPool::buffer<StoreType> m_store;

SiPixelDigisCUDASOAView m_view;

uint32_t nModules_h = 0;
uint32_t nDigis_h = 0;
};


#include "SiPixelDigisCUDAImpl.h"

#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"

#include <cstdint>
Expand Down Expand Up @@ -96,11 +94,11 @@ class SiPixelDigisCUDASOAView {
uint32_t* rawIdArr_;

template <typename ReturnType, typename StoreType, typename LocationType>
ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
static ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
return reinterpret_cast<ReturnType*>(store.get() + static_cast<int>(column) * roundFor128ByteAlignment(size));
}

static int roundFor128ByteAlignment(int size) {
static constexpr int roundFor128ByteAlignment(int size) {
constexpr int mul = 128 / sizeof(uint16_t);
return ((size + mul - 1) / mul) * mul;
};
Expand Down
41 changes: 0 additions & 41 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
Original file line number Diff line number Diff line change
@@ -1,42 +1 @@
#include <cassert>

#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"

SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream)
: data_d(cms::cuda::make_device_unique<SiPixelErrorCompact[]>(maxFedWords, stream)),
error_d(cms::cuda::make_device_unique<SiPixelErrorCompactVector>(stream)),
error_h(cms::cuda::make_host_unique<SiPixelErrorCompactVector>(stream)),
formatterErrors_h(std::move(errors)),
nErrorWords_(maxFedWords) {
assert(maxFedWords != 0);
cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream);

cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
assert(error_h->empty());
assert(error_h->capacity() == static_cast<int>(maxFedWords));

cms::cuda::copyAsync(error_d, error_h, stream);
}

void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
cms::cuda::copyAsync(error_h, error_d, stream);
}

SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
// On one hand size() could be sufficient. On the other hand, if
// someone copies the SimpleVector<>, (s)he might expect the data
// buffer to actually have space for capacity() elements.
auto data = cms::cuda::make_host_unique<SiPixelErrorCompact[]>(error_h->capacity(), stream);

// but transfer only the required amount
if (not error_h->empty()) {
cms::cuda::copyAsync(data, data_d, error_h->size(), stream);
}
auto err = *error_h;
err.set_data(data.get());
return HostDataError(err, std::move(data));
}
28 changes: 0 additions & 28 deletions CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
Original file line number Diff line number Diff line change
@@ -1,29 +1 @@
#include <cassert>

#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
: m_store(cms::cuda::make_device_unique<SiPixelDigisCUDA::StoreType[]>(
SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) *
static_cast<int>(SiPixelDigisCUDASOAView::StorageLocation::kMAX),
stream)),
m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) {
assert(maxFedWords != 0);
}

cms::cuda::host::unique_ptr<SiPixelDigisCUDA::StoreType[]> SiPixelDigisCUDA::copyAllToHostAsync(
cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<StoreType[]>(
m_view.roundFor128ByteAlignment(nDigis()) * static_cast<int>(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX),
stream);
cudaCheck(cudaMemcpyAsync(ret.get(),
m_view.clus(),
m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) *
static_cast<int>(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX),
cudaMemcpyDeviceToHost,
stream));
return ret;
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"


#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

class TrackingRecHit2DHeterogeneous {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
Expand All @@ -27,7 +26,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::Externa
edm::EDGetTokenT<cms::cuda::Product<SiPixelDigiErrorsCUDA>> digiErrorGetToken_;
edm::EDPutTokenT<SiPixelErrorsSoA> digiErrorPutToken_;

cms::cuda::host::unique_ptr<SiPixelErrorCompact[]> data_;
memoryPool::buffer<SiPixelErrorCompact> data_;
cms::cuda::SimpleVector<SiPixelErrorCompact> error_;
const SiPixelFormatterErrors* formatterErrors_ = nullptr;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"

class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
Expand All @@ -27,7 +26,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork
edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;

cms::cuda::host::unique_ptr<uint16_t[]> store_;
memoryPool::buffer<uint16_t> store_;

int nDigis_;
};
Expand Down
2 changes: 2 additions & 0 deletions HeterogeneousCore/CUDACore/src/ScopedContext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ namespace {
try {
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
std::cout << "Callback of CUDA stream " << streamId << " in device " << device
<< " error " << error << ": " << message << std::endl;
throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
<< " error " << error << ": " << message;
} catch (cms::Exception&) {
Expand Down
26 changes: 19 additions & 7 deletions HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,15 @@
namespace {

// free callback
void CUDART_CB freeCallback(void *p) {
std::cout << "free callaback" << std::endl;
void CUDART_CB freeCallback(cudaStream_t streamId, cudaError_t status, void* p) {
//void CUDART_CB freeCallback(void *p) {
if (status != cudaSuccess) {
std::cout << "Error in free callaback in stream " << streamId << std::endl;
auto error = cudaGetErrorName(status);
auto message = cudaGetErrorString(status);
std::cout << " error " << error << ": " << message << std::endl;
}
// std::cout << "free callaback" << std::endl;
auto payload = (memoryPool::Payload *)(p);
memoryPool::scheduleFree(payload);
}
Expand All @@ -22,9 +29,10 @@ namespace {

struct CudaAlloc {
static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) {
std::cout << "schedule free" << std::endl;
// std::cout << "schedule free" << std::endl;
if (stream)
cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
cudaCheck(cudaStreamAddCallback(stream, freeCallback, payload,0));
// cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload));
else
memoryPool::scheduleFree(payload);
}
Expand All @@ -36,10 +44,14 @@ struct CudaDeviceAlloc : public CudaAlloc {
static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMalloc(&p, size);
std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
// std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { auto err = cudaFree(ptr); std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <<std::endl;}
static void free(Pointer ptr) {
auto err = cudaFree(ptr);
// std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <<std::endl;
if (err != cudaSuccess) std::cout << " error in cudaFree??" << std::endl;
}
};

struct CudaHostAlloc : public CudaAlloc {
Expand All @@ -48,7 +60,7 @@ struct CudaHostAlloc : public CudaAlloc {
static Pointer alloc(size_t size) {
Pointer p = nullptr;
auto err = cudaMallocHost(&p, size);
std::cout << "alloc H " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
// std::cout << "alloc H " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl;
return err == cudaSuccess ? p : nullptr;
}
static void free(Pointer ptr) { cudaFreeHost(ptr); }
Expand Down
4 changes: 2 additions & 2 deletions RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ namespace {
BeamSpotPOD* data() { return data_h_.get(); }
BeamSpotPOD const* data() const { return data_h_.get(); }

cms::cuda::host::noncached::unique_ptr<BeamSpotPOD>& ptr() { return data_h_; }
auto & ptr() { return data_h_; }
cms::cuda::host::noncached::unique_ptr<BeamSpotPOD> const& ptr() const { return data_h_; }

private:
Expand Down Expand Up @@ -93,7 +93,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e
bsHost->betaStar = bs.betaStar();

BeamSpotCUDA bsDevice(ctx.stream());
cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream());
cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream());

ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice));
}
Expand Down

0 comments on commit 29df6e2

Please sign in to comment.