From 29df6e20a122c8328831f9d2594e8630d9f43a45 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 7 May 2022 17:35:07 +0200 Subject: [PATCH] more port --- .../BeamSpot/interface/BeamSpotCUDA.h | 13 ++-- .../interface/SiPixelDigiErrorsCUDA.h | 66 ++++++++++++++++--- .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 12 ++-- .../interface/SiPixelDigisCUDASOAView.h | 6 +- .../SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc | 41 ------------ .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 28 -------- .../interface/TrackingRecHit2DHeterogeneous.h | 2 + .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 3 +- .../plugins/SiPixelDigisSoAFromCUDA.cc | 3 +- .../CUDACore/src/ScopedContext.cc | 2 + .../interface/cudaMemoryPoolImpl.h | 26 ++++++-- .../plugins/BeamSpotToCUDA.cc | 4 +- 12 files changed, 101 insertions(+), 105 deletions(-) diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h index 7b04fac67b9f1..f5e493edf3f9b 100644 --- a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h +++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h @@ -4,15 +4,18 @@ #include #include "DataFormats/BeamSpot/interface/BeamSpotPOD.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" class BeamSpotCUDA { public: + + using buffer = memoryPool::buffer; + // default constructor, required by cms::cuda::Product BeamSpotCUDA() = default; // constructor that allocates cached device memory on the given CUDA stream - BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique(stream); } + BeamSpotCUDA(cudaStream_t stream) { data_d_ = memoryPool::cuda::make_buffer(1,stream, memoryPool::onDevice); } // movable, non-copiable BeamSpotCUDA(BeamSpotCUDA const&) = delete; @@ -23,11 +26,11 @@ class BeamSpotCUDA { BeamSpotPOD* data() { return data_d_.get(); } BeamSpotPOD const* data() const { return data_d_.get(); } - cms::cuda::device::unique_ptr& ptr() { return data_d_; } - cms::cuda::device::unique_ptr const& ptr() const { return data_d_; } + buffer & ptr() { return data_d_; } + buffer const& ptr() const { return data_d_; } private: - cms::cuda::device::unique_ptr data_d_; + buffer data_d_; }; #endif // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h index eff550feeb22e..ddabd531e7602 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -5,16 +5,16 @@ #include "DataFormats/SiPixelRawData/interface/SiPixelErrorCompact.h" #include "DataFormats/SiPixelRawData/interface/SiPixelFormatterErrors.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + class SiPixelDigiErrorsCUDA { public: using SiPixelErrorCompactVector = cms::cuda::SimpleVector; SiPixelDigiErrorsCUDA() = default; - explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream); + inline SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream); ~SiPixelDigiErrorsCUDA() = default; SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; @@ -27,18 +27,66 @@ class SiPixelDigiErrorsCUDA { SiPixelErrorCompactVector* error() { return error_d.get(); } SiPixelErrorCompactVector const* error() const { return error_d.get(); } - using HostDataError = std::pair>; - HostDataError dataErrorToHostAsync(cudaStream_t stream) const; + using HostDataError = std::pair>; + inline HostDataError dataErrorToHostAsync(cudaStream_t stream) const; - void copyErrorToHostAsync(cudaStream_t stream); + inline void copyErrorToHostAsync(cudaStream_t stream); int nErrorWords() const { return nErrorWords_; } private: - cms::cuda::device::unique_ptr data_d; - cms::cuda::device::unique_ptr error_d; - cms::cuda::host::unique_ptr error_h; + memoryPool::buffer data_d; + memoryPool::buffer error_d; + memoryPool::buffer error_h; SiPixelFormatterErrors formatterErrors_h; int nErrorWords_ = 0; }; + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream) : + formatterErrors_h(std::move(errors)), + nErrorWords_(maxFedWords) { + assert(maxFedWords != 0); + + memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream, memoryPool::onDevice)); + assert(deleter.pool()); + + data_d = memoryPool::cuda::make_buffer(maxFedWords, deleter); + error_d = memoryPool::cuda::make_buffer(1,deleter); + error_h = memoryPool::cuda::make_buffer(1,stream,memoryPool::onHost); + + + cudaMemsetAsync(data_d.get(), 0x00, maxFedWords, stream); + + cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->empty()); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudaCheck(memoryPool::cuda::copy(error_d, error_h, 1,stream)); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { + cudaCheck(memoryPool::cuda::copy(error_h, error_d, 1,stream)); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. + auto data = memoryPool::cuda::make_buffer(error_h->capacity(), stream, memoryPool::onHost); + + // but transfer only the required amount + if (not error_h->empty()) { + cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream)); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(err, std::move(data)); +} + + + #endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index cf6b51687982f..6f3ceb17f5a28 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -3,8 +3,7 @@ #include -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" @@ -12,7 +11,7 @@ class SiPixelDigisCUDA { public: using StoreType = uint16_t; SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + inline SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; @@ -28,14 +27,14 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const; + inline memoryPool::buffer copyAllToHostAsync(cudaStream_t stream) const; SiPixelDigisCUDASOAView view() { return m_view; } SiPixelDigisCUDASOAView const view() const { return m_view; } private: // These are consumed by downstream device code - cms::cuda::device::unique_ptr m_store; + memoryPool::buffer m_store; SiPixelDigisCUDASOAView m_view; @@ -43,4 +42,7 @@ class SiPixelDigisCUDA { uint32_t nDigis_h = 0; }; + +#include "SiPixelDigisCUDAImpl.h" + #endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h index 70d00ae584279..f1efeaad2e2f3 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h @@ -3,8 +3,6 @@ #include -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include @@ -96,11 +94,11 @@ class SiPixelDigisCUDASOAView { uint32_t* rawIdArr_; template - ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { + static ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { return reinterpret_cast(store.get() + static_cast(column) * roundFor128ByteAlignment(size)); } - static int roundFor128ByteAlignment(int size) { + static constexpr int roundFor128ByteAlignment(int size) { constexpr int mul = 128 / sizeof(uint16_t); return ((size + mul - 1) / mul) * mul; }; diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc index e81b1b2b592af..fb2d7618d6eea 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -1,42 +1 @@ -#include - #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" - -SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream) - : data_d(cms::cuda::make_device_unique(maxFedWords, stream)), - error_d(cms::cuda::make_device_unique(stream)), - error_h(cms::cuda::make_host_unique(stream)), - formatterErrors_h(std::move(errors)), - nErrorWords_(maxFedWords) { - assert(maxFedWords != 0); - cms::cuda::memsetAsync(data_d, 0x00, maxFedWords, stream); - - cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); - assert(error_h->empty()); - assert(error_h->capacity() == static_cast(maxFedWords)); - - cms::cuda::copyAsync(error_d, error_h, stream); -} - -void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { - cms::cuda::copyAsync(error_h, error_d, stream); -} - -SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { - // On one hand size() could be sufficient. On the other hand, if - // someone copies the SimpleVector<>, (s)he might expect the data - // buffer to actually have space for capacity() elements. - auto data = cms::cuda::make_host_unique(error_h->capacity(), stream); - - // but transfer only the required amount - if (not error_h->empty()) { - cms::cuda::copyAsync(data, data_d, error_h->size(), stream); - } - auto err = *error_h; - err.set_data(data.get()); - return HostDataError(err, std::move(data)); -} diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 9a7f8ae8bdad5..2b230d26c9b4b 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -1,29 +1 @@ -#include - #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : m_store(cms::cuda::make_device_unique( - SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) * - static_cast(SiPixelDigisCUDASOAView::StorageLocation::kMAX), - stream)), - m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) { - assert(maxFedWords != 0); -} - -cms::cuda::host::unique_ptr SiPixelDigisCUDA::copyAllToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique( - m_view.roundFor128ByteAlignment(nDigis()) * static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - m_view.clus(), - m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) * - static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - cudaMemcpyDeviceToHost, - stream)); - return ret; -} diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index 42bd4d32872e3..251c0b0edc933 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -4,6 +4,8 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" class TrackingRecHit2DHeterogeneous { diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index 4037b4d50612c..6a1630b2441db 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -9,7 +9,6 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer { public: @@ -27,7 +26,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer> digiErrorGetToken_; edm::EDPutTokenT digiErrorPutToken_; - cms::cuda::host::unique_ptr data_; + memoryPool::buffer data_; cms::cuda::SimpleVector error_; const SiPixelFormatterErrors* formatterErrors_ = nullptr; }; diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 0702bc4830c7c..c73d446039895 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -9,7 +9,6 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { public: @@ -27,7 +26,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + memoryPool::buffer store_; int nDigis_; }; diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc index ccf7995a20061..5f809dcc506db 100644 --- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc @@ -27,6 +27,8 @@ namespace { try { auto error = cudaGetErrorName(status); auto message = cudaGetErrorString(status); + std::cout << "Callback of CUDA stream " << streamId << " in device " << device + << " error " << error << ": " << message << std::endl; throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message; } catch (cms::Exception&) { diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h index 31cba94c56b1d..47aa0f13f7395 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h +++ b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPoolImpl.h @@ -12,8 +12,15 @@ namespace { // free callback - void CUDART_CB freeCallback(void *p) { - std::cout << "free callaback" << std::endl; + void CUDART_CB freeCallback(cudaStream_t streamId, cudaError_t status, void* p) { + //void CUDART_CB freeCallback(void *p) { + if (status != cudaSuccess) { + std::cout << "Error in free callaback in stream " << streamId << std::endl; + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + std::cout << " error " << error << ": " << message << std::endl; + } + // std::cout << "free callaback" << std::endl; auto payload = (memoryPool::Payload *)(p); memoryPool::scheduleFree(payload); } @@ -22,9 +29,10 @@ namespace { struct CudaAlloc { static void scheduleFree(memoryPool::Payload *payload, cudaStream_t stream) { - std::cout << "schedule free" << std::endl; + // std::cout << "schedule free" << std::endl; if (stream) - cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload)); + cudaCheck(cudaStreamAddCallback(stream, freeCallback, payload,0)); + // cudaCheck(cudaLaunchHostFunc(stream, freeCallback, payload)); else memoryPool::scheduleFree(payload); } @@ -36,10 +44,14 @@ struct CudaDeviceAlloc : public CudaAlloc { static Pointer alloc(size_t size) { Pointer p = nullptr; auto err = cudaMalloc(&p, size); - std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl; + // std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl; return err == cudaSuccess ? p : nullptr; } - static void free(Pointer ptr) { auto err = cudaFree(ptr); std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <& ptr() { return data_h_; } + auto & ptr() { return data_h_; } cms::cuda::host::noncached::unique_ptr const& ptr() const { return data_h_; } private: @@ -93,7 +93,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e bsHost->betaStar = bs.betaStar(); BeamSpotCUDA bsDevice(ctx.stream()); - cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream()); + cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream()); ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice)); }