diff --git a/RecoTracker/LSTCore/BuildFile.xml b/RecoTracker/LSTCore/BuildFile.xml new file mode 100644 index 0000000000000..69c03f883986e --- /dev/null +++ b/RecoTracker/LSTCore/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/RecoTracker/LSTCore/README.md b/RecoTracker/LSTCore/README.md new file mode 100644 index 0000000000000..cc313ba97ddf1 --- /dev/null +++ b/RecoTracker/LSTCore/README.md @@ -0,0 +1,42 @@ +# LSTCore proof of concept + +**This is a proof of concept for how I think we could continue working towards the CMSSW integration while keeping the standalone version alive.** + +This branch of CMSSW contains all of the relevant LST code and can be built entirely within CMSSW. The setup process is what you would expect. + +```bash +export CMSSW_VERSION=CMSSW_14_1_0_pre3 +export CMSSW_BRANCH=${CMSSW_VERSION}_LST_X_LSTCore +source /cvmfs/cms.cern.ch/cmsset_default.sh +cmsrel $CMSSW_VERSION +cd $CMSSW_VERSION/src +cmsenv +git cms-init +git remote add SegLink https://github.com/SegmentLinking/cmssw.git +git fetch SegLink ${CMSSW_BRANCH}:SegLink_cmssw +git checkout SegLink_cmssw +git cms-addpkg RecoTracker/LST RecoTracker/LSTCore Configuration/ProcessModifiers RecoTracker/ConversionSeedGenerators RecoTracker/FinalTrackSelectors RecoTracker/IterativeTracking +git submodule update --init --recursive +scram b -j 8 +``` + +## How it works + +The [TrackLooper repository](https://github.com/SegmentLinking/TrackLooper) is included as a git submodule in `RecoTracker/LSTCore` and the rest of the structure is set up using symlinks. Since we have made a lot of progress getting the code ready for CMSSW, it was just a matter of writing a simple `BuildFile.xml` file. + +## Benefits + +- It would make it easier to work towards the full integration if we have a self-contained thing. It would probably be easier to slowly adapt more of the "proper" CMSSW conventions instead of having to switch them all at once. 
+- We can keep the standalone version alive for as long as needed. +- Our CI can start running the checks that are done by the `cms-bot` for CMSSW PRs. + +## Disadvantages + +- It might be better to work towards having a single CMSSW package instead of having them separated in `LST` and `LSTCore`. However, I think we could use a similar approach in that case. +- I couldn't think of anything else, but there are likely other disadvantages. + +## Things to do + +- There are a few minor changes that need to be made to the current LST package to get it to work with LSTCore. +- At some point we'll have to figure out how to properly integrate the `data` directory. + diff --git a/RecoTracker/LSTCore/interface/alpaka/Constants.h b/RecoTracker/LSTCore/interface/alpaka/Constants.h new file mode 100644 index 0000000000000..d4f023631af1d --- /dev/null +++ b/RecoTracker/LSTCore/interface/alpaka/Constants.h @@ -0,0 +1,157 @@ +#ifndef Constants_cuh +#define Constants_cuh + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +#ifdef CACHE_ALLOC +#include "HeterogeneousCore/AlpakaInterface/interface/CachedBufAlloc.h" +#endif + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#include +#endif + +namespace SDL { +// Half precision wrapper functions. 
+#if defined(FP16_Base) +#define __F2H __float2half +#define __H2F __half2float + typedef __half float FPX; +#else +#define __F2H +#define __H2F + typedef float FPX; +#endif + + using Idx = alpaka_common::Idx; + using Dim = alpaka_common::Dim3D; + using Dim1d = alpaka_common::Dim1D; + using Vec = alpaka_common::Vec3D; + using Vec1d = alpaka_common::Vec1D; + using WorkDiv = alpaka_common::WorkDiv3D; + + using Acc = ALPAKA_ACCELERATOR_NAMESPACE::Acc3D; + using Dev = ALPAKA_ACCELERATOR_NAMESPACE::Device; + using DevHost = ALPAKA_ACCELERATOR_NAMESPACE::DevHost; + using QueueAcc = ALPAKA_ACCELERATOR_NAMESPACE::Queue; + + Vec const elementsPerThread(Vec::all(static_cast(1))); + +// Needed for files that are compiled by g++ to not throw an error. +// uint4 is defined only for CUDA, so we will have to revisit this soon when running on other backends. +#if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) + struct uint4 { + unsigned int x; + unsigned int y; + unsigned int z; + unsigned int w; + }; +#endif + + // Buffer type for allocations where auto type can't be used. + template + using Buf = alpaka::Buf; + + // Allocation wrapper function to make integration of the caching allocator easier and reduce code boilerplate. + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf, T> allocBufWrapper(TAcc const& devAccIn, + TSize nElements, + TQueue queue) { +#ifdef CACHE_ALLOC + return cms::alpakatools::allocCachedBuf(devAccIn, queue, Vec1d(static_cast(nElements))); +#else + return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); +#endif + } + + // Second allocation wrapper function when queue is not given. Reduces code boilerplate. + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf, T> allocBufWrapper(TAcc const& devAccIn, TSize nElements) { + return alpaka::allocBuf(devAccIn, Vec1d(static_cast(nElements))); + } + + // Wrapper function to reduce code boilerplate for defining grid/block sizes. 
+ ALPAKA_FN_HOST ALPAKA_FN_INLINE Vec createVec(int x, int y, int z) { + return Vec(static_cast(x), static_cast(y), static_cast(z)); + } + + // Adjust grid and block sizes based on backend configuration + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE WorkDiv createWorkDiv(const Vec& blocksPerGrid, + const Vec& threadsPerBlock, + const Vec& elementsPerThreadArg) { + Vec adjustedBlocks = blocksPerGrid; + Vec adjustedThreads = threadsPerBlock; + + // Serial execution, so all launch parameters set to 1. +#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + adjustedBlocks = Vec::all(static_cast(1)); + adjustedThreads = Vec::all(static_cast(1)); +#endif + + // Threads enabled, set number of blocks to 1. +#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) + adjustedBlocks = Vec::all(static_cast(1)); +#endif + + return WorkDiv(adjustedBlocks, adjustedThreads, elementsPerThreadArg); + } + +// If a compile time flag does not define PT_CUT, default to 0.8 (GeV) +#ifndef PT_CUT + constexpr float PT_CUT = 0.8f; +#endif + + const unsigned int MAX_BLOCKS = 80; + const unsigned int MAX_CONNECTED_MODULES = 40; + + const unsigned int N_MAX_PIXEL_SEGMENTS_PER_MODULE = 50000; + + const unsigned int N_MAX_PIXEL_MD_PER_MODULES = 2 * N_MAX_PIXEL_SEGMENTS_PER_MODULE; + + const unsigned int N_MAX_PIXEL_TRIPLETS = 5000; + const unsigned int N_MAX_PIXEL_QUINTUPLETS = 15000; + + const unsigned int N_MAX_PIXEL_TRACK_CANDIDATES = 30000; + const unsigned int N_MAX_NONPIXEL_TRACK_CANDIDATES = 1000; + + const unsigned int size_superbins = 45000; + + //defining the constant host device variables right up here + ALPAKA_STATIC_ACC_MEM_GLOBAL const float miniMulsPtScaleBarrel[6] = {0.0052, 0.0038, 0.0034, 0.0034, 0.0032, 0.0034}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float miniMulsPtScaleEndcap[5] = {0.006, 0.006, 0.006, 0.006, 0.006}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float miniRminMeanBarrel[6] = { + 25.007152356, 37.2186993757, 52.3104270826, 68.6658656666, 85.9770373007, 108.301772384}; + 
ALPAKA_STATIC_ACC_MEM_GLOBAL const float miniRminMeanEndcap[5] = { + 130.992832231, 154.813883559, 185.352604327, 221.635123002, 265.022076742}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float k2Rinv1GeVf = (2.99792458e-3 * 3.8) / 2; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float kR1GeVf = 1. / (2.99792458e-3 * 3.8); + ALPAKA_STATIC_ACC_MEM_GLOBAL const float sinAlphaMax = 0.95; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float ptCut = PT_CUT; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float deltaZLum = 15.0; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float pixelPSZpitch = 0.15; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float strip2SZpitch = 5.0; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float pt_betaMax = 7.0; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float magnetic_field = 3.8112; + // Since C++ can't represent infinity, SDL_INF = 123456789 was used to represent infinity in the data table + ALPAKA_STATIC_ACC_MEM_GLOBAL const float SDL_INF = 123456789.0; +} //namespace SDL + +namespace T5DNN { + // Working points matching LST fake rate (43.9%) or signal acceptance (82.0%) + ALPAKA_STATIC_ACC_MEM_GLOBAL const float LSTWP1 = 0.3418833f; // 94.0% TPR, 43.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float LSTWP2 = 0.6177366f; // 82.0% TPR, 20.0% FPR + // Other working points + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP70 = 0.7776195f; // 70.0% TPR, 10.0% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP75 = 0.7181118f; // 75.0% TPR, 13.5% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP80 = 0.6492643f; // 80.0% TPR, 17.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP85 = 0.5655319f; // 85.0% TPR, 23.8% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP90 = 0.4592205f; // 90.0% TPR, 32.6% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP95 = 0.3073708f; // 95.0% TPR, 47.7% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP97p5 = 0.2001348f; // 97.5% TPR, 61.2% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP99 = 0.1120605f; // 99.0% TPR, 75.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL const float WP99p9 = 
0.0218196f; // 99.9% TPR, 95.4% FPR +} // namespace T5DNN +#endif diff --git a/RecoTracker/LSTCore/interface/alpaka/LST.h b/RecoTracker/LSTCore/interface/alpaka/LST.h new file mode 100644 index 0000000000000..7d9d11745ab3c --- /dev/null +++ b/RecoTracker/LSTCore/interface/alpaka/LST.h @@ -0,0 +1,115 @@ +#ifndef LST_H +#define LST_H + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/LSTESData.h" +#else +#include "Constants.h" +#include "LSTESData.h" +#endif + +#include +#include +#include +#include + +namespace SDL { + template + class Event; + + template + class LST; + + template <> + class LST { + public: + LST() = default; + + void run(QueueAcc& queue, + bool verbose, + const LSTESDeviceData* deviceESData, + const std::vector see_px, + const std::vector see_py, + const std::vector see_pz, + const std::vector see_dxy, + const std::vector see_dz, + const std::vector see_ptErr, + const std::vector see_etaErr, + const std::vector see_stateTrajGlbX, + const std::vector see_stateTrajGlbY, + const std::vector see_stateTrajGlbZ, + const std::vector see_stateTrajGlbPx, + const std::vector see_stateTrajGlbPy, + const std::vector see_stateTrajGlbPz, + const std::vector see_q, + const std::vector> see_hitIdx, + const std::vector ph2_detId, + const std::vector ph2_x, + const std::vector ph2_y, + const std::vector ph2_z); + std::vector> hits() { return out_tc_hitIdxs_; } + std::vector len() { return out_tc_len_; } + std::vector seedIdx() { return out_tc_seedIdx_; } + std::vector trackCandidateType() { return out_tc_trackCandidateType_; } + + private: + void prepareInput(const std::vector see_px, + const std::vector see_py, + const std::vector see_pz, + const std::vector see_dxy, + const std::vector see_dz, + const std::vector see_ptErr, + const std::vector see_etaErr, + const std::vector see_stateTrajGlbX, + const std::vector see_stateTrajGlbY, + const std::vector see_stateTrajGlbZ, + 
const std::vector see_stateTrajGlbPx, + const std::vector see_stateTrajGlbPy, + const std::vector see_stateTrajGlbPz, + const std::vector see_q, + const std::vector> see_hitIdx, + const std::vector ph2_detId, + const std::vector ph2_x, + const std::vector ph2_y, + const std::vector ph2_z); + + void getOutput(SDL::Event& event); + std::vector getHitIdxs(const short trackCandidateType, + const unsigned int TCIdx, + const unsigned int* TCHitIndices, + const unsigned int* hitIndices); + + // Input and output vectors + std::vector in_trkX_; + std::vector in_trkY_; + std::vector in_trkZ_; + std::vector in_hitId_; + std::vector in_hitIdxs_; + std::vector in_hitIndices_vec0_; + std::vector in_hitIndices_vec1_; + std::vector in_hitIndices_vec2_; + std::vector in_hitIndices_vec3_; + std::vector in_deltaPhi_vec_; + std::vector in_ptIn_vec_; + std::vector in_ptErr_vec_; + std::vector in_px_vec_; + std::vector in_py_vec_; + std::vector in_pz_vec_; + std::vector in_eta_vec_; + std::vector in_etaErr_vec_; + std::vector in_phi_vec_; + std::vector in_charge_vec_; + std::vector in_seedIdx_vec_; + std::vector in_superbin_vec_; + std::vector in_pixelType_vec_; + std::vector in_isQuad_vec_; + std::vector> out_tc_hitIdxs_; + std::vector out_tc_len_; + std::vector out_tc_seedIdx_; + std::vector out_tc_trackCandidateType_; + }; + +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/interface/alpaka/LSTESData.h b/RecoTracker/LSTCore/interface/alpaka/LSTESData.h new file mode 100644 index 0000000000000..29ae19a5484e8 --- /dev/null +++ b/RecoTracker/LSTCore/interface/alpaka/LSTESData.h @@ -0,0 +1,94 @@ +#ifndef LSTESData_H +#define LSTESData_H + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h" + +#include +#include + +namespace SDL { + + struct pixelMap; + + template + class TiltedGeometry; + + template + class ModuleConnectionMap; 
+ using MapPLStoLayer = std::array, 4>, 3>; + + template + struct modulesBuffer; + + template + class EndcapGeometryHost; + + template + class EndcapGeometry; + + template + struct LSTESHostData; + + // FIXME: This shouldn't be a templated struct + template <> + struct LSTESHostData { + std::shared_ptr mapPLStoLayer; + std::shared_ptr> endcapGeometry; + std::shared_ptr> tiltedGeometry; + std::shared_ptr> moduleConnectionMap; + + LSTESHostData(std::shared_ptr mapPLStoLayerIn, + std::shared_ptr> endcapGeometryIn, + std::shared_ptr> tiltedGeometryIn, + std::shared_ptr> moduleConnectionMapIn) + : mapPLStoLayer(mapPLStoLayerIn), + endcapGeometry(endcapGeometryIn), + tiltedGeometry(tiltedGeometryIn), + moduleConnectionMap(moduleConnectionMapIn) {} + }; + + template + struct LSTESDeviceData { + uint16_t nModules; + uint16_t nLowerModules; + unsigned int nPixels; + std::shared_ptr> modulesBuffers; + std::shared_ptr> endcapGeometry; + std::shared_ptr pixelMapping; + + LSTESDeviceData(uint16_t nModulesIn, + uint16_t nLowerModulesIn, + unsigned int nPixelsIn, + std::shared_ptr> modulesBuffersIn, + std::shared_ptr> endcapGeometryIn, + std::shared_ptr pixelMappingIn) + : nModules(nModulesIn), + nLowerModules(nLowerModulesIn), + nPixels(nPixelsIn), + modulesBuffers(modulesBuffersIn), + endcapGeometry(endcapGeometryIn), + pixelMapping(pixelMappingIn) {} + }; + + std::unique_ptr> loadAndFillESHost(); + std::unique_ptr> loadAndFillESDevice(SDL::QueueAcc& queue, const LSTESHostData* hostData); + +} // namespace SDL + +namespace cms::alpakatools { + template <> + struct CopyToDevice> { + template + static auto copyAsync(TQueue& queue, SDL::LSTESHostData const& hostData) { + return std::make_unique>(hostData); + } + }; +} // namespace cms::alpakatools + +#endif diff --git a/RecoTracker/LSTCore/interface/alpaka/Module.h b/RecoTracker/LSTCore/interface/alpaka/Module.h new file mode 100644 index 0000000000000..0a269eaa6e16a --- /dev/null +++ 
b/RecoTracker/LSTCore/interface/alpaka/Module.h @@ -0,0 +1,372 @@ +#ifndef Module_cuh +#define Module_cuh + +#include + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +namespace SDL { + enum SubDet { InnerPixel = 0, Barrel = 5, Endcap = 4 }; + + enum Side { NegZ = 1, PosZ = 2, Center = 3 }; + + enum ModuleType { PS, TwoS, PixelModule }; + + enum ModuleLayerType { Pixel, Strip, InnerPixelLayer }; + + struct objectRanges { + int* hitRanges; + int* hitRangesLower; + int* hitRangesUpper; + int8_t* hitRangesnLower; + int8_t* hitRangesnUpper; + int* mdRanges; + int* segmentRanges; + int* trackletRanges; + int* tripletRanges; + int* trackCandidateRanges; + // Others will be added later + int* quintupletRanges; + + // This number is just nEligibleModules - 1, but still we want this to be independent of the TC kernel + uint16_t* nEligibleT5Modules; + // Will be allocated in createQuintuplets kernel! + uint16_t* indicesOfEligibleT5Modules; + // To store different starting points for variable occupancy stuff + int* quintupletModuleIndices; + int* quintupletModuleOccupancy; + int* miniDoubletModuleIndices; + int* miniDoubletModuleOccupancy; + int* segmentModuleIndices; + int* segmentModuleOccupancy; + int* tripletModuleIndices; + int* tripletModuleOccupancy; + + unsigned int* device_nTotalMDs; + unsigned int* device_nTotalSegs; + unsigned int* device_nTotalTrips; + unsigned int* device_nTotalQuints; + + template + void setData(TBuff& objectRangesbuf) { + hitRanges = alpaka::getPtrNative(objectRangesbuf.hitRanges_buf); + hitRangesLower = alpaka::getPtrNative(objectRangesbuf.hitRangesLower_buf); + hitRangesUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesUpper_buf); + hitRangesnLower = alpaka::getPtrNative(objectRangesbuf.hitRangesnLower_buf); + hitRangesnUpper = alpaka::getPtrNative(objectRangesbuf.hitRangesnUpper_buf); + mdRanges = alpaka::getPtrNative(objectRangesbuf.mdRanges_buf); + 
segmentRanges = alpaka::getPtrNative(objectRangesbuf.segmentRanges_buf); + trackletRanges = alpaka::getPtrNative(objectRangesbuf.trackletRanges_buf); + tripletRanges = alpaka::getPtrNative(objectRangesbuf.tripletRanges_buf); + trackCandidateRanges = alpaka::getPtrNative(objectRangesbuf.trackCandidateRanges_buf); + quintupletRanges = alpaka::getPtrNative(objectRangesbuf.quintupletRanges_buf); + + nEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.nEligibleT5Modules_buf); + indicesOfEligibleT5Modules = alpaka::getPtrNative(objectRangesbuf.indicesOfEligibleT5Modules_buf); + + quintupletModuleIndices = alpaka::getPtrNative(objectRangesbuf.quintupletModuleIndices_buf); + quintupletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.quintupletModuleOccupancy_buf); + miniDoubletModuleIndices = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleIndices_buf); + miniDoubletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.miniDoubletModuleOccupancy_buf); + segmentModuleIndices = alpaka::getPtrNative(objectRangesbuf.segmentModuleIndices_buf); + segmentModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.segmentModuleOccupancy_buf); + tripletModuleIndices = alpaka::getPtrNative(objectRangesbuf.tripletModuleIndices_buf); + tripletModuleOccupancy = alpaka::getPtrNative(objectRangesbuf.tripletModuleOccupancy_buf); + + device_nTotalMDs = alpaka::getPtrNative(objectRangesbuf.device_nTotalMDs_buf); + device_nTotalSegs = alpaka::getPtrNative(objectRangesbuf.device_nTotalSegs_buf); + device_nTotalTrips = alpaka::getPtrNative(objectRangesbuf.device_nTotalTrips_buf); + device_nTotalQuints = alpaka::getPtrNative(objectRangesbuf.device_nTotalQuints_buf); + } + }; + + template + struct objectRangesBuffer : objectRanges { + Buf hitRanges_buf; + Buf hitRangesLower_buf; + Buf hitRangesUpper_buf; + Buf hitRangesnLower_buf; + Buf hitRangesnUpper_buf; + Buf mdRanges_buf; + Buf segmentRanges_buf; + Buf trackletRanges_buf; + Buf tripletRanges_buf; + Buf 
trackCandidateRanges_buf; + Buf quintupletRanges_buf; + + Buf nEligibleT5Modules_buf; + Buf indicesOfEligibleT5Modules_buf; + + Buf quintupletModuleIndices_buf; + Buf quintupletModuleOccupancy_buf; + Buf miniDoubletModuleIndices_buf; + Buf miniDoubletModuleOccupancy_buf; + Buf segmentModuleIndices_buf; + Buf segmentModuleOccupancy_buf; + Buf tripletModuleIndices_buf; + Buf tripletModuleOccupancy_buf; + + Buf device_nTotalMDs_buf; + Buf device_nTotalSegs_buf; + Buf device_nTotalTrips_buf; + Buf device_nTotalQuints_buf; + + template + objectRangesBuffer(unsigned int nMod, unsigned int nLowerMod, TDevAcc const& devAccIn, TQueue& queue) + : hitRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nMod, queue)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nMod, queue)), + mdRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + segmentRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + trackletRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + tripletRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + trackCandidateRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + quintupletRanges_buf(allocBufWrapper(devAccIn, nMod * 2, queue)), + nEligibleT5Modules_buf(allocBufWrapper(devAccIn, 1, queue)), + indicesOfEligibleT5Modules_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + quintupletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + quintupletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + miniDoubletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod + 1, queue)), + miniDoubletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod + 1, queue)), + segmentModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod + 1, queue)), + segmentModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod + 1, queue)), + 
tripletModuleIndices_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + tripletModuleOccupancy_buf(allocBufWrapper(devAccIn, nLowerMod, queue)), + device_nTotalMDs_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalSegs_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalTrips_buf(allocBufWrapper(devAccIn, 1, queue)), + device_nTotalQuints_buf(allocBufWrapper(devAccIn, 1, queue)) { + alpaka::memset(queue, hitRanges_buf, 0xff); + alpaka::memset(queue, hitRangesLower_buf, 0xff); + alpaka::memset(queue, hitRangesUpper_buf, 0xff); + alpaka::memset(queue, hitRangesnLower_buf, 0xff); + alpaka::memset(queue, hitRangesnUpper_buf, 0xff); + alpaka::memset(queue, mdRanges_buf, 0xff); + alpaka::memset(queue, segmentRanges_buf, 0xff); + alpaka::memset(queue, trackletRanges_buf, 0xff); + alpaka::memset(queue, tripletRanges_buf, 0xff); + alpaka::memset(queue, trackCandidateRanges_buf, 0xff); + alpaka::memset(queue, quintupletRanges_buf, 0xff); + alpaka::memset(queue, quintupletModuleIndices_buf, 0xff); + alpaka::wait(queue); + } + }; + + struct modules { + const unsigned int* detIds; + const uint16_t* moduleMap; + const unsigned int* mapdetId; + const uint16_t* mapIdx; + const uint16_t* nConnectedModules; + const float* drdzs; + const float* dxdys; + const uint16_t* nModules; + const uint16_t* nLowerModules; + const uint16_t* partnerModuleIndices; + + const short* layers; + const short* rings; + const short* modules; + const short* rods; + const short* subdets; + const short* sides; + const float* eta; + const float* r; + const bool* isInverted; + const bool* isLower; + const bool* isAnchor; + const ModuleType* moduleType; + const ModuleLayerType* moduleLayerType; + const int* sdlLayers; + const unsigned int* connectedPixels; + + static bool parseIsInverted(short subdet, short side, short module, short layer) { + if (subdet == Endcap) { + if (side == NegZ) { + return module % 2 == 1; + } else if (side == PosZ) { + return module % 2 == 0; + } else { + return false; + } 
+ } else if (subdet == Barrel) { + if (side == Center) { + if (layer <= 3) { + return module % 2 == 1; + } else if (layer >= 4) { + return module % 2 == 0; + } else { + return false; + } + } else if (side == NegZ or side == PosZ) { + if (layer <= 2) { + return module % 2 == 1; + } else if (layer == 3) { + return module % 2 == 0; + } else { + return false; + } + } else { + return false; + } + } else { + return false; + } + }; + + static bool parseIsLower(bool isInvertedx, unsigned int detId) { + return (isInvertedx) ? !(detId & 1) : (detId & 1); + }; + + static unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx) { + return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? detId + 1 : detId - 1); + }; + + template + void setData(const TBuff& modulesbuf) { + detIds = alpaka::getPtrNative(modulesbuf.detIds_buf); + moduleMap = alpaka::getPtrNative(modulesbuf.moduleMap_buf); + mapdetId = alpaka::getPtrNative(modulesbuf.mapdetId_buf); + mapIdx = alpaka::getPtrNative(modulesbuf.mapIdx_buf); + nConnectedModules = alpaka::getPtrNative(modulesbuf.nConnectedModules_buf); + drdzs = alpaka::getPtrNative(modulesbuf.drdzs_buf); + dxdys = alpaka::getPtrNative(modulesbuf.dxdys_buf); + nModules = alpaka::getPtrNative(modulesbuf.nModules_buf); + nLowerModules = alpaka::getPtrNative(modulesbuf.nLowerModules_buf); + partnerModuleIndices = alpaka::getPtrNative(modulesbuf.partnerModuleIndices_buf); + + layers = alpaka::getPtrNative(modulesbuf.layers_buf); + rings = alpaka::getPtrNative(modulesbuf.rings_buf); + modules = alpaka::getPtrNative(modulesbuf.modules_buf); + rods = alpaka::getPtrNative(modulesbuf.rods_buf); + subdets = alpaka::getPtrNative(modulesbuf.subdets_buf); + sides = alpaka::getPtrNative(modulesbuf.sides_buf); + eta = alpaka::getPtrNative(modulesbuf.eta_buf); + r = alpaka::getPtrNative(modulesbuf.r_buf); + isInverted = alpaka::getPtrNative(modulesbuf.isInverted_buf); + isLower = 
alpaka::getPtrNative(modulesbuf.isLower_buf); + isAnchor = alpaka::getPtrNative(modulesbuf.isAnchor_buf); + moduleType = alpaka::getPtrNative(modulesbuf.moduleType_buf); + moduleLayerType = alpaka::getPtrNative(modulesbuf.moduleLayerType_buf); + sdlLayers = alpaka::getPtrNative(modulesbuf.sdlLayers_buf); + connectedPixels = alpaka::getPtrNative(modulesbuf.connectedPixels_buf); + } + }; + + template + struct modulesBuffer : modules { + Buf detIds_buf; + Buf moduleMap_buf; + Buf mapdetId_buf; + Buf mapIdx_buf; + Buf nConnectedModules_buf; + Buf drdzs_buf; + Buf dxdys_buf; + Buf nModules_buf; + Buf nLowerModules_buf; + Buf partnerModuleIndices_buf; + + Buf layers_buf; + Buf rings_buf; + Buf modules_buf; + Buf rods_buf; + Buf subdets_buf; + Buf sides_buf; + Buf eta_buf; + Buf r_buf; + Buf isInverted_buf; + Buf isLower_buf; + Buf isAnchor_buf; + Buf moduleType_buf; + Buf moduleLayerType_buf; + Buf sdlLayers_buf; + Buf connectedPixels_buf; + + modulesBuffer(TDev const& dev, unsigned int nMod, unsigned int nPixs) + : detIds_buf(allocBufWrapper(dev, nMod)), + moduleMap_buf(allocBufWrapper(dev, nMod * MAX_CONNECTED_MODULES)), + mapdetId_buf(allocBufWrapper(dev, nMod)), + mapIdx_buf(allocBufWrapper(dev, nMod)), + nConnectedModules_buf(allocBufWrapper(dev, nMod)), + drdzs_buf(allocBufWrapper(dev, nMod)), + dxdys_buf(allocBufWrapper(dev, nMod)), + nModules_buf(allocBufWrapper(dev, 1)), + nLowerModules_buf(allocBufWrapper(dev, 1)), + partnerModuleIndices_buf(allocBufWrapper(dev, nMod)), + + layers_buf(allocBufWrapper(dev, nMod)), + rings_buf(allocBufWrapper(dev, nMod)), + modules_buf(allocBufWrapper(dev, nMod)), + rods_buf(allocBufWrapper(dev, nMod)), + subdets_buf(allocBufWrapper(dev, nMod)), + sides_buf(allocBufWrapper(dev, nMod)), + eta_buf(allocBufWrapper(dev, nMod)), + r_buf(allocBufWrapper(dev, nMod)), + isInverted_buf(allocBufWrapper(dev, nMod)), + isLower_buf(allocBufWrapper(dev, nMod)), + isAnchor_buf(allocBufWrapper(dev, nMod)), + moduleType_buf(allocBufWrapper(dev, 
nMod)), + moduleLayerType_buf(allocBufWrapper(dev, nMod)), + sdlLayers_buf(allocBufWrapper(dev, nMod)), + connectedPixels_buf(allocBufWrapper(dev, nPixs)) { + setData(*this); + } + + template + inline void copyFromSrc(TQueue queue, const modulesBuffer& src, bool isFull = true) { + alpaka::memcpy(queue, detIds_buf, src.detIds_buf); + if (isFull) { + alpaka::memcpy(queue, moduleMap_buf, src.moduleMap_buf); + alpaka::memcpy(queue, mapdetId_buf, src.mapdetId_buf); + alpaka::memcpy(queue, mapIdx_buf, src.mapIdx_buf); + alpaka::memcpy(queue, nConnectedModules_buf, src.nConnectedModules_buf); + alpaka::memcpy(queue, drdzs_buf, src.drdzs_buf); + alpaka::memcpy(queue, dxdys_buf, src.dxdys_buf); + } + alpaka::memcpy(queue, nModules_buf, src.nModules_buf); + alpaka::memcpy(queue, nLowerModules_buf, src.nLowerModules_buf); + if (isFull) { + alpaka::memcpy(queue, partnerModuleIndices_buf, src.partnerModuleIndices_buf); + } + + alpaka::memcpy(queue, layers_buf, src.layers_buf); + alpaka::memcpy(queue, rings_buf, src.rings_buf); + alpaka::memcpy(queue, modules_buf, src.modules_buf); + alpaka::memcpy(queue, rods_buf, src.rods_buf); + alpaka::memcpy(queue, subdets_buf, src.subdets_buf); + alpaka::memcpy(queue, sides_buf, src.sides_buf); + alpaka::memcpy(queue, eta_buf, src.eta_buf); + alpaka::memcpy(queue, r_buf, src.r_buf); + if (isFull) { + alpaka::memcpy(queue, isInverted_buf, src.isInverted_buf); + } + alpaka::memcpy(queue, isLower_buf, src.isLower_buf); + if (isFull) { + alpaka::memcpy(queue, isAnchor_buf, src.isAnchor_buf); + } + alpaka::memcpy(queue, moduleType_buf, src.moduleType_buf); + if (isFull) { + alpaka::memcpy(queue, moduleLayerType_buf, src.moduleLayerType_buf); + alpaka::memcpy(queue, sdlLayers_buf, src.sdlLayers_buf); + alpaka::memcpy(queue, connectedPixels_buf, src.connectedPixels_buf); + } + alpaka::wait(queue); + } + + template + modulesBuffer(TQueue queue, const modulesBuffer& src, unsigned int nMod, unsigned int nPixs) + : 
modulesBuffer(alpaka::getDev(queue), nMod, nPixs) { + copyFromSrc(queue, src); + } + + inline SDL::modules const* data() const { return this; } + }; + +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.dev.cc b/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.dev.cc new file mode 100644 index 0000000000000..2b5be62ec94bc --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.dev.cc @@ -0,0 +1,85 @@ +#include "EndcapGeometry.h" + +SDL::EndcapGeometry::EndcapGeometry(SDL::Dev const& devAccIn, + SDL::QueueAcc& queue, + SDL::EndcapGeometryHost const& endcapGeometryIn) + : geoMapDetId_buf(allocBufWrapper(devAccIn, endcapGeometryIn.centroid_phis_.size())), + geoMapPhi_buf(allocBufWrapper(devAccIn, endcapGeometryIn.centroid_phis_.size())) { + dxdy_slope_ = endcapGeometryIn.dxdy_slope_; + centroid_phis_ = endcapGeometryIn.centroid_phis_; + fillGeoMapArraysExplicit(queue); +} + +void SDL::EndcapGeometryHost::load(std::string filename) { + dxdy_slope_.clear(); + centroid_phis_.clear(); + + std::ifstream ifile(filename, std::ios::binary); + if (!ifile.is_open()) { + throw std::runtime_error("Unable to open file: " + filename); + } + + while (!ifile.eof()) { + unsigned int detid; + float dxdy_slope, centroid_phi; + + // Read the detid, dxdy_slope, and centroid_phi from binary file + ifile.read(reinterpret_cast(&detid), sizeof(detid)); + ifile.read(reinterpret_cast(&dxdy_slope), sizeof(dxdy_slope)); + ifile.read(reinterpret_cast(¢roid_phi), sizeof(centroid_phi)); + + if (ifile) { + dxdy_slope_[detid] = dxdy_slope; + centroid_phis_[detid] = centroid_phi; + } else { + // End of file or read failed + if (!ifile.eof()) { + throw std::runtime_error("Failed to read Endcap Geometry binary data."); + } + } + } +} + +void SDL::EndcapGeometry::fillGeoMapArraysExplicit(SDL::QueueAcc& queue) { + unsigned int phi_size = centroid_phis_.size(); + + // Allocate buffers on host + SDL::DevHost const& devHost = cms::alpakatools::host(); + auto 
mapPhi_host_buf = allocBufWrapper(devHost, phi_size); + auto mapDetId_host_buf = allocBufWrapper(devHost, phi_size); + + // Access the raw pointers of the buffers + float* mapPhi = alpaka::getPtrNative(mapPhi_host_buf); + unsigned int* mapDetId = alpaka::getPtrNative(mapDetId_host_buf); + + unsigned int counter = 0; + for (auto it = centroid_phis_.begin(); it != centroid_phis_.end(); ++it) { + unsigned int detId = it->first; + float Phi = it->second; + mapPhi[counter] = Phi; + mapDetId[counter] = detId; + counter++; + } + + nEndCapMap = counter; + + // Copy data from host to device buffers + alpaka::memcpy(queue, geoMapPhi_buf, mapPhi_host_buf); + alpaka::memcpy(queue, geoMapDetId_buf, mapDetId_host_buf); + alpaka::wait(queue); +} + +float SDL::EndcapGeometry::getdxdy_slope(unsigned int detid) const { + if (dxdy_slope_.find(detid) != dxdy_slope_.end()) { + return dxdy_slope_.at(detid); + } else { + return 0; + } +} +float SDL::EndcapGeometryHost::getdxdy_slope(unsigned int detid) const { + if (dxdy_slope_.find(detid) != dxdy_slope_.end()) { + return dxdy_slope_.at(detid); + } else { + return 0; + } +} diff --git a/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.h b/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.h new file mode 100644 index 0000000000000..93da945c00b33 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/EndcapGeometry.h @@ -0,0 +1,63 @@ +#ifndef EndcapGeometry_h +#define EndcapGeometry_h + +#include +#include +#include +#include +#include +#include +#include + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" + +namespace SDL { + + // FIXME: Need to separate this better into host and device classes + // This is only needed for host, but we template it to avoid symbol conflicts + template + class EndcapGeometryHost; + + template <> + class EndcapGeometryHost { + public: + std::map dxdy_slope_; // dx/dy slope + 
std::map centroid_phis_; // centroid phi + + EndcapGeometryHost() = default; + ~EndcapGeometryHost() = default; + + void load(std::string); + float getdxdy_slope(unsigned int detid) const; + }; + + template + class EndcapGeometry; + + template <> + class EndcapGeometry { + private: + std::map dxdy_slope_; // dx/dy slope + std::map centroid_phis_; // centroid phi + + public: + Buf geoMapDetId_buf; + Buf geoMapPhi_buf; + + unsigned int nEndCapMap; + + EndcapGeometry(Dev const& devAccIn, QueueAcc& queue, SDL::EndcapGeometryHost const& endcapGeometryIn); + ~EndcapGeometry() = default; + + void fillGeoMapArraysExplicit(QueueAcc& queue); + float getdxdy_slope(unsigned int detid) const; + }; +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Event.dev.cc b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc new file mode 100644 index 0000000000000..d539a02b80bf5 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc @@ -0,0 +1,1814 @@ +#include "Event.h" + +void SDL::Event::init(bool verbose) { + addObjects = verbose; + hitsInGPU = nullptr; + mdsInGPU = nullptr; + segmentsInGPU = nullptr; + tripletsInGPU = nullptr; + quintupletsInGPU = nullptr; + trackCandidatesInGPU = nullptr; + pixelTripletsInGPU = nullptr; + pixelQuintupletsInGPU = nullptr; + rangesInGPU = nullptr; + + hitsInCPU = nullptr; + rangesInCPU = nullptr; + mdsInCPU = nullptr; + segmentsInCPU = nullptr; + tripletsInCPU = nullptr; + trackCandidatesInCPU = nullptr; + modulesInCPU = nullptr; + quintupletsInCPU = nullptr; + pixelTripletsInCPU = nullptr; + pixelQuintupletsInCPU = nullptr; + + //reset the arrays + for (int i = 0; i < 6; i++) { + n_hits_by_layer_barrel_[i] = 0; + n_minidoublets_by_layer_barrel_[i] = 0; + n_segments_by_layer_barrel_[i] = 0; + n_triplets_by_layer_barrel_[i] = 0; + n_trackCandidates_by_layer_barrel_[i] = 0; + n_quintuplets_by_layer_barrel_[i] = 0; + if (i < 5) { + n_hits_by_layer_endcap_[i] = 0; + n_minidoublets_by_layer_endcap_[i] = 0; + 
n_segments_by_layer_endcap_[i] = 0; + n_triplets_by_layer_endcap_[i] = 0; + n_trackCandidates_by_layer_endcap_[i] = 0; + n_quintuplets_by_layer_endcap_[i] = 0; + } + } +} + +void SDL::Event::resetEvent() { + //reset the arrays + for (int i = 0; i < 6; i++) { + n_hits_by_layer_barrel_[i] = 0; + n_minidoublets_by_layer_barrel_[i] = 0; + n_segments_by_layer_barrel_[i] = 0; + n_triplets_by_layer_barrel_[i] = 0; + n_trackCandidates_by_layer_barrel_[i] = 0; + n_quintuplets_by_layer_barrel_[i] = 0; + if (i < 5) { + n_hits_by_layer_endcap_[i] = 0; + n_minidoublets_by_layer_endcap_[i] = 0; + n_segments_by_layer_endcap_[i] = 0; + n_triplets_by_layer_endcap_[i] = 0; + n_trackCandidates_by_layer_endcap_[i] = 0; + n_quintuplets_by_layer_endcap_[i] = 0; + } + } + if (hitsInGPU) { + delete hitsInGPU; + delete hitsBuffers; + hitsInGPU = nullptr; + } + if (mdsInGPU) { + delete mdsInGPU; + delete miniDoubletsBuffers; + mdsInGPU = nullptr; + } + if (rangesInGPU) { + delete rangesInGPU; + delete rangesBuffers; + rangesInGPU = nullptr; + } + if (segmentsInGPU) { + delete segmentsInGPU; + delete segmentsBuffers; + segmentsInGPU = nullptr; + } + if (tripletsInGPU) { + delete tripletsInGPU; + delete tripletsBuffers; + tripletsInGPU = nullptr; + } + if (quintupletsInGPU) { + delete quintupletsInGPU; + delete quintupletsBuffers; + quintupletsInGPU = nullptr; + } + if (trackCandidatesInGPU) { + delete trackCandidatesInGPU; + delete trackCandidatesBuffers; + trackCandidatesInGPU = nullptr; + } + if (pixelTripletsInGPU) { + delete pixelTripletsInGPU; + delete pixelTripletsBuffers; + pixelTripletsInGPU = nullptr; + } + if (pixelQuintupletsInGPU) { + delete pixelQuintupletsInGPU; + delete pixelQuintupletsBuffers; + pixelQuintupletsInGPU = nullptr; + } + + if (hitsInCPU != nullptr) { + delete hitsInCPU; + hitsInCPU = nullptr; + } + if (rangesInCPU != nullptr) { + delete rangesInCPU; + rangesInCPU = nullptr; + } + if (mdsInCPU != nullptr) { + delete mdsInCPU; + mdsInCPU = nullptr; + } + if 
(segmentsInCPU != nullptr) { + delete segmentsInCPU; + segmentsInCPU = nullptr; + } + if (tripletsInCPU != nullptr) { + delete tripletsInCPU; + tripletsInCPU = nullptr; + } + if (quintupletsInCPU != nullptr) { + delete quintupletsInCPU; + quintupletsInCPU = nullptr; + } + if (pixelTripletsInCPU != nullptr) { + delete pixelTripletsInCPU; + pixelTripletsInCPU = nullptr; + } + if (pixelQuintupletsInCPU != nullptr) { + delete pixelQuintupletsInCPU; + pixelQuintupletsInCPU = nullptr; + } + if (trackCandidatesInCPU != nullptr) { + delete trackCandidatesInCPU; + trackCandidatesInCPU = nullptr; + } + if (modulesInCPU != nullptr) { + delete modulesInCPU; + modulesInCPU = nullptr; + } +} + +void SDL::Event::addHitToEvent(std::vector x, + std::vector y, + std::vector z, + std::vector detId, + std::vector idxInNtuple) { + // Use the actual number of hits instead of a max. + unsigned int nHits = x.size(); + + // Initialize space on device/host for next event. + if (hitsInGPU == nullptr) { + hitsInGPU = new SDL::hits(); + hitsBuffers = new SDL::hitsBuffer(nModules_, nHits, devAcc, queue); + hitsInGPU->setData(*hitsBuffers); + } + + if (rangesInGPU == nullptr) { + rangesInGPU = new SDL::objectRanges(); + rangesBuffers = new SDL::objectRangesBuffer(nModules_, nLowerModules_, devAcc, queue); + rangesInGPU->setData(*rangesBuffers); + } + + // Need a view here before transferring to the device. + auto nHits_view = alpaka::createView(devHost, &nHits, (Idx)1u); + + // Copy the host arrays to the GPU. 
+ alpaka::memcpy(queue, hitsBuffers->xs_buf, x, nHits); + alpaka::memcpy(queue, hitsBuffers->ys_buf, y, nHits); + alpaka::memcpy(queue, hitsBuffers->zs_buf, z, nHits); + alpaka::memcpy(queue, hitsBuffers->detid_buf, detId, nHits); + alpaka::memcpy(queue, hitsBuffers->idxs_buf, idxInNtuple, nHits); + alpaka::memcpy(queue, hitsBuffers->nHits_buf, nHits_view); + alpaka::wait(queue); + + Vec const threadsPerBlock1 = createVec(1, 1, 256); + Vec const blocksPerGrid1 = createVec(1, 1, MAX_BLOCKS); + WorkDiv const hit_loop_workdiv = createWorkDiv(blocksPerGrid1, threadsPerBlock1, elementsPerThread); + + hitLoopKernel hit_loop_kernel; + auto const hit_loop_task(alpaka::createTaskKernel(hit_loop_workdiv, + hit_loop_kernel, + Endcap, + TwoS, + nModules_, + endcapGeometry_->nEndCapMap, + alpaka::getPtrNative(endcapGeometry_->geoMapDetId_buf), + alpaka::getPtrNative(endcapGeometry_->geoMapPhi_buf), + *modulesBuffers_->data(), + *hitsInGPU, + nHits)); + + alpaka::enqueue(queue, hit_loop_task); + + Vec const threadsPerBlock2 = createVec(1, 1, 256); + Vec const blocksPerGrid2 = createVec(1, 1, MAX_BLOCKS); + WorkDiv const module_ranges_workdiv = createWorkDiv(blocksPerGrid2, threadsPerBlock2, elementsPerThread); + + moduleRangesKernel module_ranges_kernel; + auto const module_ranges_task(alpaka::createTaskKernel( + module_ranges_workdiv, module_ranges_kernel, *modulesBuffers_->data(), *hitsInGPU, nLowerModules_)); + + // Waiting isn't needed after second kernel call. Saves ~100 us. + // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsBuffers->hitrange variables. + // Also, modulesInGPU->partnerModuleIndices is not alterned in addPixelSegmentToEvent. 
+ alpaka::enqueue(queue, module_ranges_task); +} + +void SDL::Event::addPixelSegmentToEvent(std::vector hitIndices0, + std::vector hitIndices1, + std::vector hitIndices2, + std::vector hitIndices3, + std::vector dPhiChange, + std::vector ptIn, + std::vector ptErr, + std::vector px, + std::vector py, + std::vector pz, + std::vector eta, + std::vector etaErr, + std::vector phi, + std::vector charge, + std::vector seedIdx, + std::vector superbin, + std::vector pixelType, + std::vector isQuad) { + unsigned int size = ptIn.size(); + + if (size > N_MAX_PIXEL_SEGMENTS_PER_MODULE) { + printf( + "*********************************************************\n" + "* Warning: Pixel line segments will be truncated. *\n" + "* You need to increase N_MAX_PIXEL_SEGMENTS_PER_MODULE. *\n" + "*********************************************************\n"); + size = N_MAX_PIXEL_SEGMENTS_PER_MODULE; + } + + unsigned int mdSize = 2 * size; + uint16_t pixelModuleIndex = pixelMapping_->pixelModuleIndex; + + if (mdsInGPU == nullptr) { + // Create a view for the element nLowerModules_ inside rangesBuffers->miniDoubletModuleOccupancy + auto dst_view_miniDoubletModuleOccupancy = + alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx)1u, (Idx)nLowerModules_); + + // Create a source view for the value to be set + int value = N_MAX_PIXEL_MD_PER_MODULES; + auto src_view_value = alpaka::createView(devHost, &value, (Idx)1u); + + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); + alpaka::wait(queue); + + Vec const threadsPerBlockCreateMD = createVec(1, 1, 1024); + Vec const blocksPerGridCreateMD = createVec(1, 1, 1); + WorkDiv const createMDArrayRangesGPU_workDiv = + createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + + SDL::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; + auto const createMDArrayRangesGPUTask(alpaka::createTaskKernel( + createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, 
*modulesBuffers_->data(), *rangesInGPU)); + + alpaka::enqueue(queue, createMDArrayRangesGPUTask); + alpaka::wait(queue); + + unsigned int nTotalMDs; + auto nTotalMDs_view = alpaka::createView(devHost, &nTotalMDs, (Idx)1u); + + alpaka::memcpy(queue, nTotalMDs_view, rangesBuffers->device_nTotalMDs_buf); + alpaka::wait(queue); + + nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; + + mdsInGPU = new SDL::miniDoublets(); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); + mdsInGPU->setData(*miniDoubletsBuffers); + + alpaka::memcpy(queue, miniDoubletsBuffers->nMemoryLocations_buf, nTotalMDs_view); + alpaka::wait(queue); + } + if (segmentsInGPU == nullptr) { + // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously. + // If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them + + Vec const threadsPerBlockCreateSeg = createVec(1, 1, 1024); + Vec const blocksPerGridCreateSeg = createVec(1, 1, 1); + WorkDiv const createSegmentArrayRanges_workDiv = + createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); + + SDL::createSegmentArrayRanges createSegmentArrayRanges_kernel; + auto const createSegmentArrayRangesTask(alpaka::createTaskKernel(createSegmentArrayRanges_workDiv, + createSegmentArrayRanges_kernel, + *modulesBuffers_->data(), + *rangesInGPU, + *mdsInGPU)); + + alpaka::enqueue(queue, createSegmentArrayRangesTask); + alpaka::wait(queue); + + auto nTotalSegments_view = alpaka::createView(devHost, &nTotalSegments, (Idx)1u); + + alpaka::memcpy(queue, nTotalSegments_view, rangesBuffers->device_nTotalSegs_buf); + alpaka::wait(queue); + + nTotalSegments += N_MAX_PIXEL_SEGMENTS_PER_MODULE; + + segmentsInGPU = new SDL::segments(); + segmentsBuffers = + new SDL::segmentsBuffer(nTotalSegments, nLowerModules_, N_MAX_PIXEL_SEGMENTS_PER_MODULE, 
devAcc, queue); + segmentsInGPU->setData(*segmentsBuffers); + + alpaka::memcpy(queue, segmentsBuffers->nMemoryLocations_buf, nTotalSegments_view); + alpaka::wait(queue); + } + + auto hitIndices0_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices1_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices2_dev = allocBufWrapper(devAcc, size, queue); + auto hitIndices3_dev = allocBufWrapper(devAcc, size, queue); + auto dPhiChange_dev = allocBufWrapper(devAcc, size, queue); + + alpaka::memcpy(queue, hitIndices0_dev, hitIndices0, size); + alpaka::memcpy(queue, hitIndices1_dev, hitIndices1, size); + alpaka::memcpy(queue, hitIndices2_dev, hitIndices2, size); + alpaka::memcpy(queue, hitIndices3_dev, hitIndices3, size); + alpaka::memcpy(queue, dPhiChange_dev, dPhiChange, size); + + alpaka::memcpy(queue, segmentsBuffers->ptIn_buf, ptIn, size); + alpaka::memcpy(queue, segmentsBuffers->ptErr_buf, ptErr, size); + alpaka::memcpy(queue, segmentsBuffers->px_buf, px, size); + alpaka::memcpy(queue, segmentsBuffers->py_buf, py, size); + alpaka::memcpy(queue, segmentsBuffers->pz_buf, pz, size); + alpaka::memcpy(queue, segmentsBuffers->etaErr_buf, etaErr, size); + alpaka::memcpy(queue, segmentsBuffers->isQuad_buf, isQuad, size); + alpaka::memcpy(queue, segmentsBuffers->eta_buf, eta, size); + alpaka::memcpy(queue, segmentsBuffers->phi_buf, phi, size); + alpaka::memcpy(queue, segmentsBuffers->charge_buf, charge, size); + alpaka::memcpy(queue, segmentsBuffers->seedIdx_buf, seedIdx, size); + alpaka::memcpy(queue, segmentsBuffers->superbin_buf, superbin, size); + alpaka::memcpy(queue, segmentsBuffers->pixelType_buf, pixelType, size); + + // Create source views for size and mdSize + auto src_view_size = alpaka::createView(devHost, &size, (Idx)1u); + auto src_view_mdSize = alpaka::createView(devHost, &mdSize, (Idx)1u); + + auto dst_view_segments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx)1u, (Idx)pixelModuleIndex); + alpaka::memcpy(queue, dst_view_segments, 
src_view_size); + + auto dst_view_totOccupancySegments = + alpaka::createSubView(segmentsBuffers->totOccupancySegments_buf, (Idx)1u, (Idx)pixelModuleIndex); + alpaka::memcpy(queue, dst_view_totOccupancySegments, src_view_size); + + auto dst_view_nMDs = alpaka::createSubView(miniDoubletsBuffers->nMDs_buf, (Idx)1u, (Idx)pixelModuleIndex); + alpaka::memcpy(queue, dst_view_nMDs, src_view_mdSize); + + auto dst_view_totOccupancyMDs = + alpaka::createSubView(miniDoubletsBuffers->totOccupancyMDs_buf, (Idx)1u, (Idx)pixelModuleIndex); + alpaka::memcpy(queue, dst_view_totOccupancyMDs, src_view_mdSize); + + alpaka::wait(queue); + + Vec const threadsPerBlock = createVec(1, 1, 256); + Vec const blocksPerGrid = createVec(1, 1, MAX_BLOCKS); + WorkDiv const addPixelSegmentToEvent_workdiv = createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); + + addPixelSegmentToEventKernel addPixelSegmentToEvent_kernel; + auto const addPixelSegmentToEvent_task(alpaka::createTaskKernel(addPixelSegmentToEvent_workdiv, + addPixelSegmentToEvent_kernel, + *modulesBuffers_->data(), + *rangesInGPU, + *hitsInGPU, + *mdsInGPU, + *segmentsInGPU, + alpaka::getPtrNative(hitIndices0_dev), + alpaka::getPtrNative(hitIndices1_dev), + alpaka::getPtrNative(hitIndices2_dev), + alpaka::getPtrNative(hitIndices3_dev), + alpaka::getPtrNative(dPhiChange_dev), + pixelModuleIndex, + size)); + + alpaka::enqueue(queue, addPixelSegmentToEvent_task); + alpaka::wait(queue); +} + +void SDL::Event::createMiniDoublets() { + // Create a view for the element nLowerModules_ inside rangesBuffers->miniDoubletModuleOccupancy + auto dst_view_miniDoubletModuleOccupancy = + alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx)1u, (Idx)nLowerModules_); + + // Create a source view for the value to be set + int value = N_MAX_PIXEL_MD_PER_MODULES; + auto src_view_value = alpaka::createView(devHost, &value, (Idx)1u); + + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); + 
alpaka::wait(queue); + + Vec const threadsPerBlockCreateMD = createVec(1, 1, 1024); + Vec const blocksPerGridCreateMD = createVec(1, 1, 1); + WorkDiv const createMDArrayRangesGPU_workDiv = + createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + + SDL::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; + auto const createMDArrayRangesGPUTask(alpaka::createTaskKernel( + createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_->data(), *rangesInGPU)); + + alpaka::enqueue(queue, createMDArrayRangesGPUTask); + alpaka::wait(queue); + + auto nTotalMDs_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf); + alpaka::wait(queue); + + unsigned int nTotalMDs = *alpaka::getPtrNative(nTotalMDs_buf); + + nTotalMDs += N_MAX_PIXEL_MD_PER_MODULES; + + if (mdsInGPU == nullptr) { + mdsInGPU = new SDL::miniDoublets(); + miniDoubletsBuffers = new SDL::miniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); + mdsInGPU->setData(*miniDoubletsBuffers); + } + + Vec const threadsPerBlockCreateMDInGPU = createVec(1, 16, 32); + Vec const blocksPerGridCreateMDInGPU = createVec(1, nLowerModules_ / threadsPerBlockCreateMDInGPU[1], 1); + WorkDiv const createMiniDoubletsInGPUv2_workDiv = + createWorkDiv(blocksPerGridCreateMDInGPU, threadsPerBlockCreateMDInGPU, elementsPerThread); + + SDL::createMiniDoubletsInGPUv2 createMiniDoubletsInGPUv2_kernel; + auto const createMiniDoubletsInGPUv2Task(alpaka::createTaskKernel(createMiniDoubletsInGPUv2_workDiv, + createMiniDoubletsInGPUv2_kernel, + *modulesBuffers_->data(), + *hitsInGPU, + *mdsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, createMiniDoubletsInGPUv2Task); + + Vec const threadsPerBlockAddMD = createVec(1, 1, 1024); + Vec const blocksPerGridAddMD = createVec(1, 1, 1); + WorkDiv const addMiniDoubletRangesToEventExplicit_workDiv = + createWorkDiv(blocksPerGridAddMD, threadsPerBlockAddMD, elementsPerThread); + + 
SDL::addMiniDoubletRangesToEventExplicit addMiniDoubletRangesToEventExplicit_kernel; + auto const addMiniDoubletRangesToEventExplicitTask( + alpaka::createTaskKernel(addMiniDoubletRangesToEventExplicit_workDiv, + addMiniDoubletRangesToEventExplicit_kernel, + *modulesBuffers_->data(), + *mdsInGPU, + *rangesInGPU, + *hitsInGPU)); + + alpaka::enqueue(queue, addMiniDoubletRangesToEventExplicitTask); + alpaka::wait(queue); + + if (addObjects) { + addMiniDoubletsToEventExplicit(); + } +} + +void SDL::Event::createSegmentsWithModuleMap() { + if (segmentsInGPU == nullptr) { + segmentsInGPU = new SDL::segments(); + segmentsBuffers = + new SDL::segmentsBuffer(nTotalSegments, nLowerModules_, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devAcc, queue); + segmentsInGPU->setData(*segmentsBuffers); + } + + Vec const threadsPerBlockCreateSeg = createVec(1, 1, 64); + Vec const blocksPerGridCreateSeg = createVec(1, 1, nLowerModules_); + WorkDiv const createSegmentsInGPUv2_workDiv = + createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); + + SDL::createSegmentsInGPUv2 createSegmentsInGPUv2_kernel; + auto const createSegmentsInGPUv2Task(alpaka::createTaskKernel(createSegmentsInGPUv2_workDiv, + createSegmentsInGPUv2_kernel, + *modulesBuffers_->data(), + *mdsInGPU, + *segmentsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, createSegmentsInGPUv2Task); + + Vec const threadsPerBlockAddSeg = createVec(1, 1, 1024); + Vec const blocksPerGridAddSeg = createVec(1, 1, 1); + WorkDiv const addSegmentRangesToEventExplicit_workDiv = + createWorkDiv(blocksPerGridAddSeg, threadsPerBlockAddSeg, elementsPerThread); + + SDL::addSegmentRangesToEventExplicit addSegmentRangesToEventExplicit_kernel; + auto const addSegmentRangesToEventExplicitTask(alpaka::createTaskKernel(addSegmentRangesToEventExplicit_workDiv, + addSegmentRangesToEventExplicit_kernel, + *modulesBuffers_->data(), + *segmentsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, addSegmentRangesToEventExplicitTask); + 
alpaka::wait(queue); + + if (addObjects) { + addSegmentsToEventExplicit(); + } +} + +void SDL::Event::createTriplets() { + if (tripletsInGPU == nullptr) { + Vec const threadsPerBlockCreateTrip = createVec(1, 1, 1024); + Vec const blocksPerGridCreateTrip = createVec(1, 1, 1); + WorkDiv const createTripletArrayRanges_workDiv = + createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); + + SDL::createTripletArrayRanges createTripletArrayRanges_kernel; + auto const createTripletArrayRangesTask(alpaka::createTaskKernel(createTripletArrayRanges_workDiv, + createTripletArrayRanges_kernel, + *modulesBuffers_->data(), + *rangesInGPU, + *segmentsInGPU)); + + alpaka::enqueue(queue, createTripletArrayRangesTask); + alpaka::wait(queue); + + // TODO: Why are we pulling this back down only to put it back on the device in a new struct? + auto maxTriplets_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf); + alpaka::wait(queue); + + tripletsInGPU = new SDL::triplets(); + tripletsBuffers = + new SDL::tripletsBuffer(*alpaka::getPtrNative(maxTriplets_buf), nLowerModules_, devAcc, queue); + tripletsInGPU->setData(*tripletsBuffers); + + alpaka::memcpy(queue, tripletsBuffers->nMemoryLocations_buf, maxTriplets_buf); + alpaka::wait(queue); + } + + uint16_t nonZeroModules = 0; + unsigned int max_InnerSeg = 0; + + // Allocate host index + auto index_buf = allocBufWrapper(devHost, nLowerModules_, queue); + uint16_t* index = alpaka::getPtrNative(index_buf); + + // Allocate device index + auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules_, queue); + + // Allocate and copy nSegments from device to host + auto nSegments_buf = allocBufWrapper(devHost, nLowerModules_, queue); + alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules_); + alpaka::wait(queue); + + unsigned int* nSegments = alpaka::getPtrNative(nSegments_buf); + + // Allocate and copy 
module_nConnectedModules from device to host + auto module_nConnectedModules_buf = allocBufWrapper(devHost, nLowerModules_, queue); + alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers_->nConnectedModules_buf, nLowerModules_); + alpaka::wait(queue); + + uint16_t* module_nConnectedModules = alpaka::getPtrNative(module_nConnectedModules_buf); + + for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex < nLowerModules_; innerLowerModuleIndex++) { + uint16_t nConnectedModules = module_nConnectedModules[innerLowerModuleIndex]; + unsigned int nInnerSegments = nSegments[innerLowerModuleIndex]; + if (nConnectedModules != 0 and nInnerSegments != 0) { + index[nonZeroModules] = innerLowerModuleIndex; + nonZeroModules++; + } + max_InnerSeg = std::max(max_InnerSeg, nInnerSegments); + } + + // Copy index from host to device + alpaka::memcpy(queue, index_gpu_buf, index_buf, nonZeroModules); + alpaka::wait(queue); + + Vec const threadsPerBlockCreateTrip = createVec(1, 16, 16); + Vec const blocksPerGridCreateTrip = createVec(MAX_BLOCKS, 1, 1); + WorkDiv const createTripletsInGPUv2_workDiv = + createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); + + SDL::createTripletsInGPUv2 createTripletsInGPUv2_kernel; + auto const createTripletsInGPUv2Task(alpaka::createTaskKernel(createTripletsInGPUv2_workDiv, + createTripletsInGPUv2_kernel, + *modulesBuffers_->data(), + *mdsInGPU, + *segmentsInGPU, + *tripletsInGPU, + *rangesInGPU, + alpaka::getPtrNative(index_gpu_buf), + nonZeroModules)); + + alpaka::enqueue(queue, createTripletsInGPUv2Task); + + Vec const threadsPerBlockAddTrip = createVec(1, 1, 1024); + Vec const blocksPerGridAddTrip = createVec(1, 1, 1); + WorkDiv const addTripletRangesToEventExplicit_workDiv = + createWorkDiv(blocksPerGridAddTrip, threadsPerBlockAddTrip, elementsPerThread); + + SDL::addTripletRangesToEventExplicit addTripletRangesToEventExplicit_kernel; + auto const 
addTripletRangesToEventExplicitTask(alpaka::createTaskKernel(addTripletRangesToEventExplicit_workDiv, + addTripletRangesToEventExplicit_kernel, + *modulesBuffers_->data(), + *tripletsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, addTripletRangesToEventExplicitTask); + alpaka::wait(queue); + + if (addObjects) { + addTripletsToEventExplicit(); + } +} + +void SDL::Event::createTrackCandidates() { + if (trackCandidatesInGPU == nullptr) { + trackCandidatesInGPU = new SDL::trackCandidates(); + trackCandidatesBuffers = new SDL::trackCandidatesBuffer( + N_MAX_NONPIXEL_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue); + trackCandidatesInGPU->setData(*trackCandidatesBuffers); + } + + // Pull nEligibleT5Modules from the device. + auto nEligibleModules_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf); + alpaka::wait(queue); + uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf); + + Vec const threadsPerBlock_crossCleanpT3 = createVec(1, 16, 64); + Vec const blocksPerGrid_crossCleanpT3 = createVec(1, 4, 20); + WorkDiv const crossCleanpT3_workDiv = + createWorkDiv(blocksPerGrid_crossCleanpT3, threadsPerBlock_crossCleanpT3, elementsPerThread); + + SDL::crossCleanpT3 crossCleanpT3_kernel; + auto const crossCleanpT3Task(alpaka::createTaskKernel(crossCleanpT3_workDiv, + crossCleanpT3_kernel, + *modulesBuffers_->data(), + *rangesInGPU, + *pixelTripletsInGPU, + *segmentsInGPU, + *pixelQuintupletsInGPU)); + + alpaka::enqueue(queue, crossCleanpT3Task); + + Vec const threadsPerBlock_addpT3asTrackCandidatesInGPU = createVec(1, 1, 512); + Vec const blocksPerGrid_addpT3asTrackCandidatesInGPU = createVec(1, 1, 1); + WorkDiv const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv( + blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread); + + SDL::addpT3asTrackCandidatesInGPU addpT3asTrackCandidatesInGPU_kernel; + auto 
const addpT3asTrackCandidatesInGPUTask(alpaka::createTaskKernel(addpT3asTrackCandidatesInGPU_workDiv, + addpT3asTrackCandidatesInGPU_kernel, + nLowerModules_, + *pixelTripletsInGPU, + *trackCandidatesInGPU, + *segmentsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask); + + Vec const threadsPerBlockRemoveDupQuints = createVec(1, 16, 32); + Vec const blocksPerGridRemoveDupQuints = + createVec(1, std::max(nEligibleModules / 16, 1), std::max(nEligibleModules / 32, 1)); + WorkDiv const removeDupQuintupletsInGPUBeforeTC_workDiv = + createWorkDiv(blocksPerGridRemoveDupQuints, threadsPerBlockRemoveDupQuints, elementsPerThread); + + SDL::removeDupQuintupletsInGPUBeforeTC removeDupQuintupletsInGPUBeforeTC_kernel; + auto const removeDupQuintupletsInGPUBeforeTCTask( + alpaka::createTaskKernel(removeDupQuintupletsInGPUBeforeTC_workDiv, + removeDupQuintupletsInGPUBeforeTC_kernel, + *quintupletsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, removeDupQuintupletsInGPUBeforeTCTask); + + Vec const threadsPerBlock_crossCleanT5 = createVec(32, 1, 32); + Vec const blocksPerGrid_crossCleanT5 = createVec((13296 / 32) + 1, 1, MAX_BLOCKS); + WorkDiv const crossCleanT5_workDiv = + createWorkDiv(blocksPerGrid_crossCleanT5, threadsPerBlock_crossCleanT5, elementsPerThread); + + SDL::crossCleanT5 crossCleanT5_kernel; + auto const crossCleanT5Task(alpaka::createTaskKernel(crossCleanT5_workDiv, + crossCleanT5_kernel, + *modulesBuffers_->data(), + *quintupletsInGPU, + *pixelQuintupletsInGPU, + *pixelTripletsInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, crossCleanT5Task); + + Vec const threadsPerBlock_addT5asTrackCandidateInGPU = createVec(1, 8, 128); + Vec const blocksPerGrid_addT5asTrackCandidateInGPU = createVec(1, 8, 10); + WorkDiv const addT5asTrackCandidateInGPU_workDiv = createWorkDiv( + blocksPerGrid_addT5asTrackCandidateInGPU, threadsPerBlock_addT5asTrackCandidateInGPU, elementsPerThread); + + SDL::addT5asTrackCandidateInGPU 
addT5asTrackCandidateInGPU_kernel; + auto const addT5asTrackCandidateInGPUTask(alpaka::createTaskKernel(addT5asTrackCandidateInGPU_workDiv, + addT5asTrackCandidateInGPU_kernel, + nLowerModules_, + *quintupletsInGPU, + *trackCandidatesInGPU, + *rangesInGPU)); + + alpaka::enqueue(queue, addT5asTrackCandidateInGPUTask); + +#ifndef NOPLSDUPCLEAN + Vec const threadsPerBlockCheckHitspLS = createVec(1, 16, 16); + Vec const blocksPerGridCheckHitspLS = createVec(1, MAX_BLOCKS * 4, MAX_BLOCKS / 4); + WorkDiv const checkHitspLS_workDiv = + createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread); + + SDL::checkHitspLS checkHitspLS_kernel; + auto const checkHitspLSTask(alpaka::createTaskKernel( + checkHitspLS_workDiv, checkHitspLS_kernel, *modulesBuffers_->data(), *segmentsInGPU, true)); + + alpaka::enqueue(queue, checkHitspLSTask); +#endif + + Vec const threadsPerBlock_crossCleanpLS = createVec(1, 16, 32); + Vec const blocksPerGrid_crossCleanpLS = createVec(1, 4, 20); + WorkDiv const crossCleanpLS_workDiv = + createWorkDiv(blocksPerGrid_crossCleanpLS, threadsPerBlock_crossCleanpLS, elementsPerThread); + + SDL::crossCleanpLS crossCleanpLS_kernel; + auto const crossCleanpLSTask(alpaka::createTaskKernel(crossCleanpLS_workDiv, + crossCleanpLS_kernel, + *modulesBuffers_->data(), + *rangesInGPU, + *pixelTripletsInGPU, + *trackCandidatesInGPU, + *segmentsInGPU, + *mdsInGPU, + *hitsInGPU, + *quintupletsInGPU)); + + alpaka::enqueue(queue, crossCleanpLSTask); + + Vec const threadsPerBlock_addpLSasTrackCandidateInGPU = createVec(1, 1, 384); + Vec const blocksPerGrid_addpLSasTrackCandidateInGPU = createVec(1, 1, MAX_BLOCKS); + WorkDiv const addpLSasTrackCandidateInGPU_workDiv = createWorkDiv( + blocksPerGrid_addpLSasTrackCandidateInGPU, threadsPerBlock_addpLSasTrackCandidateInGPU, elementsPerThread); + + SDL::addpLSasTrackCandidateInGPU addpLSasTrackCandidateInGPU_kernel; + auto const 
addpLSasTrackCandidateInGPUTask(alpaka::createTaskKernel(addpLSasTrackCandidateInGPU_workDiv, + addpLSasTrackCandidateInGPU_kernel, + nLowerModules_, + *trackCandidatesInGPU, + *segmentsInGPU)); + + alpaka::enqueue(queue, addpLSasTrackCandidateInGPUTask); + + // Check if either N_MAX_PIXEL_TRACK_CANDIDATES or N_MAX_NONPIXEL_TRACK_CANDIDATES was reached + auto nTrackCanpT5Host_buf = allocBufWrapper(devHost, 1, queue); + auto nTrackCanpT3Host_buf = allocBufWrapper(devHost, 1, queue); + auto nTrackCanpLSHost_buf = allocBufWrapper(devHost, 1, queue); + auto nTrackCanT5Host_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nTrackCanpT5Host_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf); + alpaka::memcpy(queue, nTrackCanpT3Host_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf); + alpaka::memcpy(queue, nTrackCanpLSHost_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf); + alpaka::memcpy(queue, nTrackCanT5Host_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); + alpaka::wait(queue); + + int nTrackCandidatespT5 = *alpaka::getPtrNative(nTrackCanpT5Host_buf); + int nTrackCandidatespT3 = *alpaka::getPtrNative(nTrackCanpT3Host_buf); + int nTrackCandidatespLS = *alpaka::getPtrNative(nTrackCanpLSHost_buf); + int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCanT5Host_buf); + if ((nTrackCandidatespT5 + nTrackCandidatespT3 + nTrackCandidatespLS == N_MAX_PIXEL_TRACK_CANDIDATES) || + (nTrackCandidatesT5 == N_MAX_NONPIXEL_TRACK_CANDIDATES)) { + printf( + "****************************************************************************************************\n" + "* Warning: Track candidates were possibly truncated. *\n" + "* You may need to increase either N_MAX_PIXEL_TRACK_CANDIDATES or N_MAX_NONPIXEL_TRACK_CANDIDATES. *\n" + "* Run the code with the Warnings flag activated for more details. 
*\n"
        "****************************************************************************************************\n");
  }
}

// Build pixel triplets (pT3): match each inner pixel line segment (pLS) to outer-tracker
// triplets through the pixel->module connection map, run the pT3 creation kernel, then
// remove duplicate pT3s on the device.
void SDL::Event::createPixelTriplets() {
  // Lazily allocate the pT3 container on first use.
  if (pixelTripletsInGPU == nullptr) {
    pixelTripletsInGPU = new SDL::pixelTriplets();
    pixelTripletsBuffers = new SDL::pixelTripletsBuffer(N_MAX_PIXEL_TRIPLETS, devAcc, queue);
    pixelTripletsInGPU->setData(*pixelTripletsBuffers);
  }

  // pLS count lives at index nLowerModules_ of the device nSegments array;
  // copy that single element back to the host.
  unsigned int nInnerSegments;
  auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t)1u);

  auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx)1u, (Idx)nLowerModules_);

  alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
  alpaka::wait(queue);

  // Copy the per-pLS superbin and pixel-type arrays to the host for the map lookup below.
  auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
  auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);

  alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf);
  alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf);
  alpaka::wait(queue);

  auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue);
  auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue);
  auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue);
  auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue);

  int* superbins = alpaka::getPtrNative(superbins_buf);
  int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
  unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf);
  unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf);
  alpaka::wait(queue);

  // Offsets into the flattened connection map where the pixelType==1 (pos) and
  // pixelType==2 (neg) sections begin, derived from the last superbin of each section.
  int pixelIndexOffsetPos =
      pixelMapping_->connectedPixelsIndex[size_superbins - 1] + pixelMapping_->connectedPixelsSizes[size_superbins - 1];
  int pixelIndexOffsetNeg = pixelMapping_->connectedPixelsIndexPos[size_superbins - 1] +
                            pixelMapping_->connectedPixelsSizesPos[size_superbins - 1] + pixelIndexOffsetPos;

  // TODO: check if a map/reduction to just eligible pLSs would speed up the kernel
  // the current selection still leaves a significant fraction of unmatchable pLSs
  for (unsigned int i = 0; i < nInnerSegments; i++) {  // loop over # pLS
    int8_t pixelType = pixelTypes[i];                  // Get pixel type for this pLS
    int superbin = superbins[i];                       // Get superbin for this pixel
    // Out-of-range superbin or pixel type: this pLS connects to no modules.
    if ((superbin < 0) or (superbin >= (int)size_superbins) or (pixelType > 2) or (pixelType < 0)) {
      connectedPixelSize_host[i] = 0;
      connectedPixelIndex_host[i] = 0;
      continue;
    }

    // Used pixel type to select correct size-index arrays
    if (pixelType == 0) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizes[superbin];  // number of connected modules to this pixel
      auto connectedIdxBase = pixelMapping_->connectedPixelsIndex[superbin];
      connectedPixelIndex_host[i] =
          connectedIdxBase;  // index to get start of connected modules for this superbin in map
    } else if (pixelType == 1) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizesPos[superbin];  // number of pixel connected modules
      auto connectedIdxBase = pixelMapping_->connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos;
      connectedPixelIndex_host[i] = connectedIdxBase;  // index to get start of connected pixel modules
    } else if (pixelType == 2) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizesNeg[superbin];  // number of pixel connected modules
      auto connectedIdxBase = pixelMapping_->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg;
      connectedPixelIndex_host[i] = connectedIdxBase;  // index to get start of connected pixel modules
    }
  }

  // Ship the per-pLS connection lookups to the device for the creation kernel.
  alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments);
  alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments);
  alpaka::wait(queue);

  Vec const threadsPerBlock = createVec(1, 4, 32);
  Vec const blocksPerGrid = createVec(16 /* above median of connected modules*/, 4096, 1);
  WorkDiv const createPixelTripletsInGPUFromMapv2_workDiv =
      createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread);

  SDL::createPixelTripletsInGPUFromMapv2 createPixelTripletsInGPUFromMapv2_kernel;
  auto const createPixelTripletsInGPUFromMapv2Task(
      alpaka::createTaskKernel(createPixelTripletsInGPUFromMapv2_workDiv,
                               createPixelTripletsInGPUFromMapv2_kernel,
                               *modulesBuffers_->data(),
                               *rangesInGPU,
                               *mdsInGPU,
                               *segmentsInGPU,
                               *tripletsInGPU,
                               *pixelTripletsInGPU,
                               alpaka::getPtrNative(connectedPixelSize_dev_buf),
                               alpaka::getPtrNative(connectedPixelIndex_dev_buf),
                               nInnerSegments));

  alpaka::enqueue(queue, createPixelTripletsInGPUFromMapv2Task);
  alpaka::wait(queue);

#ifdef Warnings
  auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue);

  alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf);
  alpaka::wait(queue);

  std::cout << "number of pixel triplets = " << *alpaka::getPtrNative(nPixelTriplets_buf) << std::endl;
#endif

  //pT3s can be cleaned here because they're not used in making pT5s!
  Vec const threadsPerBlockDupPixTrip = createVec(1, 16, 16);
  //seems like more blocks lead to conflicting writes
  Vec const blocksPerGridDupPixTrip = createVec(1, 40, 1);
  WorkDiv const removeDupPixelTripletsInGPUFromMap_workDiv =
      createWorkDiv(blocksPerGridDupPixTrip, threadsPerBlockDupPixTrip, elementsPerThread);

  SDL::removeDupPixelTripletsInGPUFromMap removeDupPixelTripletsInGPUFromMap_kernel;
  auto const removeDupPixelTripletsInGPUFromMapTask(alpaka::createTaskKernel(
      removeDupPixelTripletsInGPUFromMap_workDiv, removeDupPixelTripletsInGPUFromMap_kernel, *pixelTripletsInGPU));

  alpaka::enqueue(queue, removeDupPixelTripletsInGPUFromMapTask);
  alpaka::wait(queue);
}

// Build quintuplets (T5): select eligible modules, size the T5 buffers from the
// occupancy totals computed on the device, create the T5s, remove duplicates, and
// record per-module index ranges. Updates host-side counters when addObjects is set.
void SDL::Event::createQuintuplets() {
  Vec const threadsPerBlockCreateQuints = createVec(1, 1, 1024);
  Vec const blocksPerGridCreateQuints = createVec(1, 1, 1);
  WorkDiv const createEligibleModulesListForQuintupletsGPU_workDiv =
      createWorkDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread);

  SDL::createEligibleModulesListForQuintupletsGPU createEligibleModulesListForQuintupletsGPU_kernel;
  auto const createEligibleModulesListForQuintupletsGPUTask(
      alpaka::createTaskKernel(createEligibleModulesListForQuintupletsGPU_workDiv,
                               createEligibleModulesListForQuintupletsGPU_kernel,
                               *modulesBuffers_->data(),
                               *tripletsInGPU,
                               *rangesInGPU));

  alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask);
  alpaka::wait(queue);

  // Fetch the eligible-module count and total T5 capacity just computed on the device.
  auto nEligibleT5Modules_buf = allocBufWrapper(devHost, 1, queue);
  auto nTotalQuintuplets_buf = allocBufWrapper(devHost, 1, queue);

  alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf);
  alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf);
  alpaka::wait(queue);

  uint16_t nEligibleT5Modules = *alpaka::getPtrNative(nEligibleT5Modules_buf);
  unsigned int nTotalQuintuplets = *alpaka::getPtrNative(nTotalQuintuplets_buf);

  // Lazily allocate the T5 container, sized to the capacity computed above.
  if (quintupletsInGPU == nullptr) {
    quintupletsInGPU = new SDL::quintuplets();
    quintupletsBuffers = new SDL::quintupletsBuffer(nTotalQuintuplets, nLowerModules_, devAcc, queue);
    quintupletsInGPU->setData(*quintupletsBuffers);

    alpaka::memcpy(queue, quintupletsBuffers->nMemoryLocations_buf, nTotalQuintuplets_buf);
    alpaka::wait(queue);
  }

  // One block per eligible module (at least one block even when none are eligible).
  Vec const threadsPerBlockQuints = createVec(1, 8, 32);
  Vec const blocksPerGridQuints = createVec(std::max((int)nEligibleT5Modules, 1), 1, 1);
  WorkDiv const createQuintupletsInGPUv2_workDiv =
      createWorkDiv(blocksPerGridQuints, threadsPerBlockQuints, elementsPerThread);

  SDL::createQuintupletsInGPUv2 createQuintupletsInGPUv2_kernel;
  auto const createQuintupletsInGPUv2Task(alpaka::createTaskKernel(createQuintupletsInGPUv2_workDiv,
                                                                   createQuintupletsInGPUv2_kernel,
                                                                   *modulesBuffers_->data(),
                                                                   *mdsInGPU,
                                                                   *segmentsInGPU,
                                                                   *tripletsInGPU,
                                                                   *quintupletsInGPU,
                                                                   *rangesInGPU,
                                                                   nEligibleT5Modules));

  alpaka::enqueue(queue, createQuintupletsInGPUv2Task);

  Vec const threadsPerBlockDupQuint = createVec(1, 16, 16);
  Vec const blocksPerGridDupQuint = createVec(MAX_BLOCKS, 1, 1);
  WorkDiv const removeDupQuintupletsInGPUAfterBuild_workDiv =
      createWorkDiv(blocksPerGridDupQuint, threadsPerBlockDupQuint, elementsPerThread);

  SDL::removeDupQuintupletsInGPUAfterBuild removeDupQuintupletsInGPUAfterBuild_kernel;
  auto const removeDupQuintupletsInGPUAfterBuildTask(
      alpaka::createTaskKernel(removeDupQuintupletsInGPUAfterBuild_workDiv,
                               removeDupQuintupletsInGPUAfterBuild_kernel,
                               *modulesBuffers_->data(),
                               *quintupletsInGPU,
                               *rangesInGPU));

  alpaka::enqueue(queue, removeDupQuintupletsInGPUAfterBuildTask);

  Vec const threadsPerBlockAddQuint = createVec(1, 1, 1024);
  Vec const blocksPerGridAddQuint = createVec(1, 1, 1);
  WorkDiv const addQuintupletRangesToEventExplicit_workDiv =
      createWorkDiv(blocksPerGridAddQuint, threadsPerBlockAddQuint, elementsPerThread);

  SDL::addQuintupletRangesToEventExplicit addQuintupletRangesToEventExplicit_kernel;
  auto const addQuintupletRangesToEventExplicitTask(
      alpaka::createTaskKernel(addQuintupletRangesToEventExplicit_workDiv,
                               addQuintupletRangesToEventExplicit_kernel,
                               *modulesBuffers_->data(),
                               *quintupletsInGPU,
                               *rangesInGPU));

  alpaka::enqueue(queue, addQuintupletRangesToEventExplicitTask);
  alpaka::wait(queue);

  // Update the host-side per-layer counters (only when bookkeeping is enabled).
  if (addObjects) {
    addQuintupletsToEventExplicit();
  }
}

// Flag duplicate pixel line segments on the device. Compiles to a no-op when
// NOPLSDUPCLEAN is defined.
void SDL::Event::pixelLineSegmentCleaning() {
#ifndef NOPLSDUPCLEAN
  Vec const threadsPerBlockCheckHitspLS = createVec(1, 16, 16);
  Vec const blocksPerGridCheckHitspLS = createVec(1, MAX_BLOCKS * 4, MAX_BLOCKS / 4);
  WorkDiv const checkHitspLS_workDiv =
      createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread);

  SDL::checkHitspLS checkHitspLS_kernel;
  auto const checkHitspLSTask(alpaka::createTaskKernel(
      checkHitspLS_workDiv, checkHitspLS_kernel, *modulesBuffers_->data(), *segmentsInGPU, false));

  alpaka::enqueue(queue, checkHitspLSTask);
  alpaka::wait(queue);
#endif
}

// Build pixel quintuplets (pT5) from pLS + T5 pairs via the same pixel->module
// connection map used for pT3s, remove duplicates, and promote the surviving pT5s
// to track candidates. Lazily allocates the pT5 and track-candidate containers.
void SDL::Event::createPixelQuintuplets() {
  if (pixelQuintupletsInGPU == nullptr) {
    pixelQuintupletsInGPU = new SDL::pixelQuintuplets();
    pixelQuintupletsBuffers = new SDL::pixelQuintupletsBuffer(N_MAX_PIXEL_QUINTUPLETS, devAcc, queue);
    pixelQuintupletsInGPU->setData(*pixelQuintupletsBuffers);
  }
  if (trackCandidatesInGPU == nullptr) {
    trackCandidatesInGPU = new SDL::trackCandidates();
    trackCandidatesBuffers = new SDL::trackCandidatesBuffer(
        N_MAX_NONPIXEL_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devAcc, queue);
    trackCandidatesInGPU->setData(*trackCandidatesBuffers);
  }

  // pLS count lives at index nLowerModules_ of the device nSegments array.
  unsigned int nInnerSegments;
  auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t)1u);

  // Create a sub-view for the device buffer
  auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx)1u, (Idx)nLowerModules_);

  alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments);
  alpaka::wait(queue);

  auto superbins_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);
  auto pixelTypes_buf = allocBufWrapper(devHost, N_MAX_PIXEL_SEGMENTS_PER_MODULE, queue);

  alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf);
  alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf);
  alpaka::wait(queue);

  auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue);
  auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue);
  auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue);
  auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue);

  int* superbins = alpaka::getPtrNative(superbins_buf);
  int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf);
  unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf);
  unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf);
  alpaka::wait(queue);

  // Section offsets in the flattened connection map (same layout as createPixelTriplets).
  int pixelIndexOffsetPos =
      pixelMapping_->connectedPixelsIndex[size_superbins - 1] + pixelMapping_->connectedPixelsSizes[size_superbins - 1];
  int pixelIndexOffsetNeg = pixelMapping_->connectedPixelsIndexPos[size_superbins - 1] +
                            pixelMapping_->connectedPixelsSizesPos[size_superbins - 1] + pixelIndexOffsetPos;

  // Loop over # pLS
  for (unsigned int i = 0; i < nInnerSegments; i++) {
    int8_t pixelType = pixelTypes[i];  // Get pixel type for this pLS
    int superbin = superbins[i];       // Get superbin for this pixel
    if ((superbin < 0) or (superbin >= (int)size_superbins) or (pixelType > 2) or (pixelType < 0)) {
      connectedPixelIndex_host[i] = 0;
      connectedPixelSize_host[i] = 0;
      continue;
    }
    // Used pixel type to select correct size-index arrays
    if (pixelType == 0) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizes[superbin];  //number of connected modules to this pixel
      unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndex[superbin];
      connectedPixelIndex_host[i] = connectedIdxBase;
    } else if (pixelType == 1) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizesPos[superbin];  //number of pixel connected modules
      unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos;
      connectedPixelIndex_host[i] = connectedIdxBase;
    } else if (pixelType == 2) {
      connectedPixelSize_host[i] =
          pixelMapping_->connectedPixelsSizesNeg[superbin];  //number of pixel connected modules
      unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg;
      connectedPixelIndex_host[i] = connectedIdxBase;
    }
  }

  alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments);
  alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments);
  alpaka::wait(queue);

  Vec const threadsPerBlockCreatePixQuints = createVec(1, 16, 16);
  Vec const blocksPerGridCreatePixQuints = createVec(16, MAX_BLOCKS, 1);
  WorkDiv const createPixelQuintupletsInGPUFromMapv2_workDiv =
      createWorkDiv(blocksPerGridCreatePixQuints, threadsPerBlockCreatePixQuints, elementsPerThread);

  SDL::createPixelQuintupletsInGPUFromMapv2 createPixelQuintupletsInGPUFromMapv2_kernel;
  auto const createPixelQuintupletsInGPUFromMapv2Task(
      alpaka::createTaskKernel(createPixelQuintupletsInGPUFromMapv2_workDiv,
                               createPixelQuintupletsInGPUFromMapv2_kernel,
                               *modulesBuffers_->data(),
                               *mdsInGPU,
                               *segmentsInGPU,
                               *tripletsInGPU,
                               *quintupletsInGPU,
                               *pixelQuintupletsInGPU,
                               alpaka::getPtrNative(connectedPixelSize_dev_buf),
                               alpaka::getPtrNative(connectedPixelIndex_dev_buf),
                               nInnerSegments,
                               *rangesInGPU));

  alpaka::enqueue(queue, createPixelQuintupletsInGPUFromMapv2Task);

  Vec const threadsPerBlockDupPix = createVec(1, 16, 16);
  Vec const blocksPerGridDupPix = createVec(1, MAX_BLOCKS, 1);
  WorkDiv const removeDupPixelQuintupletsInGPUFromMap_workDiv =
      createWorkDiv(blocksPerGridDupPix, threadsPerBlockDupPix, elementsPerThread);

  SDL::removeDupPixelQuintupletsInGPUFromMap removeDupPixelQuintupletsInGPUFromMap_kernel;
  auto const removeDupPixelQuintupletsInGPUFromMapTask(
      alpaka::createTaskKernel(removeDupPixelQuintupletsInGPUFromMap_workDiv,
                               removeDupPixelQuintupletsInGPUFromMap_kernel,
                               *pixelQuintupletsInGPU));

  alpaka::enqueue(queue, removeDupPixelQuintupletsInGPUFromMapTask);

  // Promote the surviving pT5s to track candidates.
  Vec const threadsPerBlockAddpT5asTrackCan = createVec(1, 1, 256);
  Vec const blocksPerGridAddpT5asTrackCan = createVec(1, 1, 1);
  WorkDiv const addpT5asTrackCandidateInGPU_workDiv =
      createWorkDiv(blocksPerGridAddpT5asTrackCan, threadsPerBlockAddpT5asTrackCan, elementsPerThread);

  SDL::addpT5asTrackCandidateInGPU addpT5asTrackCandidateInGPU_kernel;
  auto const addpT5asTrackCandidateInGPUTask(alpaka::createTaskKernel(addpT5asTrackCandidateInGPU_workDiv,
                                                                      addpT5asTrackCandidateInGPU_kernel,
                                                                      nLowerModules_,
                                                                      *pixelQuintupletsInGPU,
                                                                      *trackCandidatesInGPU,
                                                                      *segmentsInGPU,
                                                                      *rangesInGPU));

  alpaka::enqueue(queue, addpT5asTrackCandidateInGPUTask);
  alpaka::wait(queue);

#ifdef Warnings
  auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue);

  alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf);
  alpaka::wait(queue);

  std::cout << "number of pixel quintuplets = " << *alpaka::getPtrNative(nPixelQuintuplets_buf) << std::endl;
#endif
}

// Copy per-module mini-doublet counts to the host and accumulate them into the
// per-layer barrel/endcap counters (module_layers values are 1-based).
void SDL::Event::addMiniDoubletsToEventExplicit() {
  auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules_);

  auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_);

  auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_);

  // Two ints (begin/end) per module.
  auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules_ * 2, queue);
  alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules_ * 2u);

  alpaka::wait(queue);

  unsigned int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf);
  short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
  short* module_layers = alpaka::getPtrNative(module_layers_buf);
  int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf);

  for (unsigned int i = 0; i < nLowerModules_; i++) {
    // Skip modules with no mini-doublets or with an unset hit range (-1).
    if (!(nMDsCPU[i] == 0 or module_hitRanges[i * 2] == -1)) {
      if (module_subdets[i] == Barrel) {
        n_minidoublets_by_layer_barrel_[module_layers[i] - 1] += nMDsCPU[i];
      } else {
        n_minidoublets_by_layer_endcap_[module_layers[i] - 1] += nMDsCPU[i];
      }
    }
  }
}

// Copy per-module segment counts to the host and accumulate them into the
// per-layer barrel/endcap counters.
void SDL::Event::addSegmentsToEventExplicit() {
  auto nSegmentsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules_);

  auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_);

  auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_);

  alpaka::wait(queue);

  unsigned int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf);
  short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
  short* module_layers = alpaka::getPtrNative(module_layers_buf);

  for (unsigned int i = 0; i < nLowerModules_; i++) {
    if (!(nSegmentsCPU[i] == 0)) {
      if (module_subdets[i] == Barrel) {
        n_segments_by_layer_barrel_[module_layers[i] - 1] += nSegmentsCPU[i];
      } else {
        n_segments_by_layer_endcap_[module_layers[i] - 1] += nSegmentsCPU[i];
      }
    }
  }
}

// Copy per-module quintuplet counts to the host and accumulate them into the
// per-layer barrel/endcap counters.
void SDL::Event::addQuintupletsToEventExplicit() {
  auto nQuintupletsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf);

  auto module_subdets_buf = allocBufWrapper(devHost, nModules_, queue);
  alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nModules_);

  auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_);

  auto module_quintupletModuleIndices_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf);

  alpaka::wait(queue);

  unsigned int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf);
  short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
  short* module_layers = alpaka::getPtrNative(module_layers_buf);
  int* module_quintupletModuleIndices = alpaka::getPtrNative(module_quintupletModuleIndices_buf);

  for (uint16_t i = 0; i < nLowerModules_; i++) {
    // Skip modules with no quintuplets or with no assigned quintuplet index (-1).
    if (!(nQuintupletsCPU[i] == 0 or module_quintupletModuleIndices[i] == -1)) {
      if (module_subdets[i] == Barrel) {
        n_quintuplets_by_layer_barrel_[module_layers[i] - 1] += nQuintupletsCPU[i];
      } else {
        n_quintuplets_by_layer_endcap_[module_layers[i] - 1] += nQuintupletsCPU[i];
      }
    }
  }
}

// Copy per-module triplet counts to the host and accumulate them into the
// per-layer barrel/endcap counters.
void SDL::Event::addTripletsToEventExplicit() {
  auto nTripletsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf);

  auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_);

  auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue);
  alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_);

  alpaka::wait(queue);
  unsigned int* nTripletsCPU = alpaka::getPtrNative(nTripletsCPU_buf);
  short* module_subdets = alpaka::getPtrNative(module_subdets_buf);
  short* module_layers = alpaka::getPtrNative(module_layers_buf);

  for (uint16_t i = 0; i < nLowerModules_; i++) {
    if (nTripletsCPU[i] != 0) {
      if (module_subdets[i] == Barrel) {
        n_triplets_by_layer_barrel_[module_layers[i] - 1] += nTripletsCPU[i];
      } else {
        n_triplets_by_layer_endcap_[module_layers[i] - 1] += nTripletsCPU[i];
      }
    }
  }
}

// Total number of hits summed over all barrel and endcap layers.
unsigned int SDL::Event::getNumberOfHits() {
  unsigned int hits = 0;
  for (auto& it : n_hits_by_layer_barrel_) {
    hits += it;
  }
  for (auto& it : n_hits_by_layer_endcap_) {
    hits += it;
  }

  return hits;
}

// Hits in one layer index; index 6 is treated as barrel-only (no endcap term added).
unsigned int SDL::Event::getNumberOfHitsByLayer(unsigned int layer) {
  if (layer == 6)
    return n_hits_by_layer_barrel_[layer];
  else
    return n_hits_by_layer_barrel_[layer] + n_hits_by_layer_endcap_[layer];
}

unsigned int SDL::Event::getNumberOfHitsByLayerBarrel(unsigned int layer) {
  return n_hits_by_layer_barrel_[layer];
}

unsigned int SDL::Event::getNumberOfHitsByLayerEndcap(unsigned int layer) {
  return n_hits_by_layer_endcap_[layer];
}

// Total number of mini-doublets summed over all barrel and endcap layers.
unsigned int SDL::Event::getNumberOfMiniDoublets() {
  unsigned int miniDoublets = 0;
  for (auto& it : n_minidoublets_by_layer_barrel_) {
    miniDoublets += it;
  }
  for (auto& it : n_minidoublets_by_layer_endcap_) {
    miniDoublets += it;
  }

  return miniDoublets;
}

// Mini-doublets in one layer index; index 6 is treated as barrel-only.
unsigned int SDL::Event::getNumberOfMiniDoubletsByLayer(unsigned int layer) {
  if (layer == 6)
    return n_minidoublets_by_layer_barrel_[layer];
  else
    return n_minidoublets_by_layer_barrel_[layer] + n_minidoublets_by_layer_endcap_[layer];
}

unsigned int SDL::Event::getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer) {
  return n_minidoublets_by_layer_barrel_[layer];
}

unsigned int SDL::Event::getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer) {
  return n_minidoublets_by_layer_endcap_[layer];
}

+unsigned int SDL::Event::getNumberOfSegments() { + unsigned int segments = 0; + for (auto& it : n_segments_by_layer_barrel_) { + segments += it; + } + for (auto& it : n_segments_by_layer_endcap_) { + segments += it; + } + + return segments; +} + +unsigned int SDL::Event::getNumberOfSegmentsByLayer(unsigned int layer) { + if (layer == 6) + return n_segments_by_layer_barrel_[layer]; + else + return n_segments_by_layer_barrel_[layer] + n_segments_by_layer_endcap_[layer]; +} + +unsigned int SDL::Event::getNumberOfSegmentsByLayerBarrel(unsigned int layer) { + return n_segments_by_layer_barrel_[layer]; +} + +unsigned int SDL::Event::getNumberOfSegmentsByLayerEndcap(unsigned int layer) { + return n_segments_by_layer_endcap_[layer]; +} + +unsigned int SDL::Event::getNumberOfTriplets() { + unsigned int triplets = 0; + for (auto& it : n_triplets_by_layer_barrel_) { + triplets += it; + } + for (auto& it : n_triplets_by_layer_endcap_) { + triplets += it; + } + + return triplets; +} + +unsigned int SDL::Event::getNumberOfTripletsByLayer(unsigned int layer) { + if (layer == 6) + return n_triplets_by_layer_barrel_[layer]; + else + return n_triplets_by_layer_barrel_[layer] + n_triplets_by_layer_endcap_[layer]; +} + +unsigned int SDL::Event::getNumberOfTripletsByLayerBarrel(unsigned int layer) { + return n_triplets_by_layer_barrel_[layer]; +} + +unsigned int SDL::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer) { + return n_triplets_by_layer_endcap_[layer]; +} + +int SDL::Event::getNumberOfPixelTriplets() { + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf); + alpaka::wait(queue); + + int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); + + return nPixelTriplets; +} + +int SDL::Event::getNumberOfPixelQuintuplets() { + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nPixelQuintuplets_buf, 
pixelQuintupletsBuffers->nPixelQuintuplets_buf); + alpaka::wait(queue); + + int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); + + return nPixelQuintuplets; +} + +unsigned int SDL::Event::getNumberOfQuintuplets() { + unsigned int quintuplets = 0; + for (auto& it : n_quintuplets_by_layer_barrel_) { + quintuplets += it; + } + for (auto& it : n_quintuplets_by_layer_endcap_) { + quintuplets += it; + } + + return quintuplets; +} + +unsigned int SDL::Event::getNumberOfQuintupletsByLayer(unsigned int layer) { + if (layer == 6) + return n_quintuplets_by_layer_barrel_[layer]; + else + return n_quintuplets_by_layer_barrel_[layer] + n_quintuplets_by_layer_endcap_[layer]; +} + +unsigned int SDL::Event::getNumberOfQuintupletsByLayerBarrel(unsigned int layer) { + return n_quintuplets_by_layer_barrel_[layer]; +} + +unsigned int SDL::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer) { + return n_quintuplets_by_layer_endcap_[layer]; +} + +int SDL::Event::getNumberOfTrackCandidates() { + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf); + alpaka::wait(queue); + + int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); + + return nTrackCandidates; +} + +int SDL::Event::getNumberOfPT5TrackCandidates() { + auto nTrackCandidatesPT5_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf); + alpaka::wait(queue); + + int nTrackCandidatesPT5 = *alpaka::getPtrNative(nTrackCandidatesPT5_buf); + + return nTrackCandidatesPT5; +} + +int SDL::Event::getNumberOfPT3TrackCandidates() { + auto nTrackCandidatesPT3_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf); + alpaka::wait(queue); + + int nTrackCandidatesPT3 = *alpaka::getPtrNative(nTrackCandidatesPT3_buf); + + return 
nTrackCandidatesPT3; +} + +int SDL::Event::getNumberOfPLSTrackCandidates() { + auto nTrackCandidatesPLS_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf); + alpaka::wait(queue); + + unsigned int nTrackCandidatesPLS = *alpaka::getPtrNative(nTrackCandidatesPLS_buf); + + return nTrackCandidatesPLS; +} + +int SDL::Event::getNumberOfPixelTrackCandidates() { + auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf); + alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); + alpaka::wait(queue); + + int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); + int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); + + return nTrackCandidates - nTrackCandidatesT5; +} + +int SDL::Event::getNumberOfT5TrackCandidates() { + auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); + + alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); + alpaka::wait(queue); + + int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); + + return nTrackCandidatesT5; +} + +SDL::hitsBuffer* SDL::Event::getHits() //std::shared_ptr should take care of garbage collection +{ + if (hitsInCPU == nullptr) { + auto nHits_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf); + alpaka::wait(queue); + + unsigned int nHits = *alpaka::getPtrNative(nHits_buf); + hitsInCPU = new SDL::hitsBuffer(nModules_, nHits, devHost, queue); + hitsInCPU->setData(*hitsInCPU); + + *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsBuffers->detid_buf, nHits); + 
alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsBuffers->xs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsBuffers->ys_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsBuffers->zs_buf, nHits); + alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsBuffers->moduleIndices_buf, nHits); + alpaka::wait(queue); + } + return hitsInCPU; +} + +SDL::hitsBuffer* SDL::Event::getHitsInCMSSW() { + if (hitsInCPU == nullptr) { + auto nHits_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf); + alpaka::wait(queue); + + unsigned int nHits = *alpaka::getPtrNative(nHits_buf); + hitsInCPU = new SDL::hitsBuffer(nModules_, nHits, devHost, queue); + hitsInCPU->setData(*hitsInCPU); + + *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); + alpaka::wait(queue); + } + return hitsInCPU; +} + +SDL::objectRangesBuffer* SDL::Event::getRanges() { + if (rangesInCPU == nullptr) { + rangesInCPU = new SDL::objectRangesBuffer(nModules_, nLowerModules_, devHost, queue); + rangesInCPU->setData(*rangesInCPU); + + alpaka::memcpy(queue, rangesInCPU->hitRanges_buf, rangesBuffers->hitRanges_buf); + alpaka::memcpy(queue, rangesInCPU->quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf); + alpaka::memcpy(queue, rangesInCPU->miniDoubletModuleIndices_buf, rangesBuffers->miniDoubletModuleIndices_buf); + alpaka::memcpy(queue, rangesInCPU->segmentModuleIndices_buf, rangesBuffers->segmentModuleIndices_buf); + alpaka::memcpy(queue, rangesInCPU->tripletModuleIndices_buf, rangesBuffers->tripletModuleIndices_buf); + alpaka::wait(queue); + } + return rangesInCPU; +} + +SDL::miniDoubletsBuffer* SDL::Event::getMiniDoublets() { + if (mdsInCPU == nullptr) { + // Get nMemoryLocations parameter to initialize host based mdsInCPU + auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nMemHost_buf, 
miniDoubletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); + + unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); + mdsInCPU = new SDL::miniDoubletsBuffer(nMemHost, nLowerModules_, devHost, queue); + mdsInCPU->setData(*mdsInCPU); + + *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemHost; + alpaka::memcpy(queue, mdsInCPU->anchorHitIndices_buf, miniDoubletsBuffers->anchorHitIndices_buf, nMemHost); + alpaka::memcpy(queue, mdsInCPU->outerHitIndices_buf, miniDoubletsBuffers->outerHitIndices_buf, nMemHost); + alpaka::memcpy(queue, mdsInCPU->dphichanges_buf, miniDoubletsBuffers->dphichanges_buf, nMemHost); + alpaka::memcpy(queue, mdsInCPU->nMDs_buf, miniDoubletsBuffers->nMDs_buf); + alpaka::memcpy(queue, mdsInCPU->totOccupancyMDs_buf, miniDoubletsBuffers->totOccupancyMDs_buf); + alpaka::wait(queue); + } + return mdsInCPU; +} + +SDL::segmentsBuffer* SDL::Event::getSegments() { + if (segmentsInCPU == nullptr) { + // Get nMemoryLocations parameter to initialize host based segmentsInCPU + auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nMemHost_buf, segmentsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); + + unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); + segmentsInCPU = new SDL::segmentsBuffer( + nMemHost, nLowerModules_, N_MAX_PIXEL_SEGMENTS_PER_MODULE, devHost, queue); + segmentsInCPU->setData(*segmentsInCPU); + + *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemHost; + alpaka::memcpy(queue, segmentsInCPU->nSegments_buf, segmentsBuffers->nSegments_buf); + alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsBuffers->mdIndices_buf, 2u * nMemHost); + alpaka::memcpy(queue, + segmentsInCPU->innerMiniDoubletAnchorHitIndices_buf, + segmentsBuffers->innerMiniDoubletAnchorHitIndices_buf, + nMemHost); + alpaka::memcpy(queue, + segmentsInCPU->outerMiniDoubletAnchorHitIndices_buf, + segmentsBuffers->outerMiniDoubletAnchorHitIndices_buf, + nMemHost); + alpaka::memcpy(queue, 
segmentsInCPU->totOccupancySegments_buf, segmentsBuffers->totOccupancySegments_buf); + alpaka::memcpy(queue, segmentsInCPU->ptIn_buf, segmentsBuffers->ptIn_buf); + alpaka::memcpy(queue, segmentsInCPU->eta_buf, segmentsBuffers->eta_buf); + alpaka::memcpy(queue, segmentsInCPU->phi_buf, segmentsBuffers->phi_buf); + alpaka::memcpy(queue, segmentsInCPU->seedIdx_buf, segmentsBuffers->seedIdx_buf); + alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsBuffers->isDup_buf); + alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsBuffers->isQuad_buf); + alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsBuffers->score_buf); + alpaka::wait(queue); + } + return segmentsInCPU; +} + +SDL::tripletsBuffer* SDL::Event::getTriplets() { + if (tripletsInCPU == nullptr) { + // Get nMemoryLocations parameter to initialize host based tripletsInCPU + auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nMemHost_buf, tripletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); + + unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); + tripletsInCPU = new SDL::tripletsBuffer(nMemHost, nLowerModules_, devHost, queue); + tripletsInCPU->setData(*tripletsInCPU); + + *alpaka::getPtrNative(tripletsInCPU->nMemoryLocations_buf) = nMemHost; +#ifdef CUT_VALUE_DEBUG + alpaka::memcpy(queue, tripletsInCPU->zOut_buf, tripletsBuffers->zOut_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->zLo_buf, tripletsBuffers->zLo_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->zHi_buf, tripletsBuffers->zHi_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->zLoPointed_buf, tripletsBuffers->zLoPointed_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->zHiPointed_buf, tripletsBuffers->zHiPointed_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->sdlCut_buf, tripletsBuffers->sdlCut_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->betaInCut_buf, tripletsBuffers->betaInCut_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->rtLo_buf, 
tripletsBuffers->rtLo_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->rtHi_buf, tripletsBuffers->rtHi_buf, nMemHost); +#endif + alpaka::memcpy(queue, tripletsInCPU->hitIndices_buf, tripletsBuffers->hitIndices_buf, 6 * nMemHost); + alpaka::memcpy(queue, tripletsInCPU->logicalLayers_buf, tripletsBuffers->logicalLayers_buf, 3 * nMemHost); + alpaka::memcpy(queue, tripletsInCPU->segmentIndices_buf, tripletsBuffers->segmentIndices_buf, 2 * nMemHost); + alpaka::memcpy(queue, tripletsInCPU->betaIn_buf, tripletsBuffers->betaIn_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->circleRadius_buf, tripletsBuffers->circleRadius_buf, nMemHost); + alpaka::memcpy(queue, tripletsInCPU->nTriplets_buf, tripletsBuffers->nTriplets_buf); + alpaka::memcpy(queue, tripletsInCPU->totOccupancyTriplets_buf, tripletsBuffers->totOccupancyTriplets_buf); + alpaka::wait(queue); + } + return tripletsInCPU; +} + +SDL::quintupletsBuffer* SDL::Event::getQuintuplets() { + if (quintupletsInCPU == nullptr) { + // Get nMemoryLocations parameter to initialize host based quintupletsInCPU + auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nMemHost_buf, quintupletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); + + unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); + quintupletsInCPU = new SDL::quintupletsBuffer(nMemHost, nLowerModules_, devHost, queue); + quintupletsInCPU->setData(*quintupletsInCPU); + + *alpaka::getPtrNative(quintupletsInCPU->nMemoryLocations_buf) = nMemHost; + alpaka::memcpy(queue, quintupletsInCPU->nQuintuplets_buf, quintupletsBuffers->nQuintuplets_buf); + alpaka::memcpy( + queue, quintupletsInCPU->totOccupancyQuintuplets_buf, quintupletsBuffers->totOccupancyQuintuplets_buf); + alpaka::memcpy(queue, quintupletsInCPU->tripletIndices_buf, quintupletsBuffers->tripletIndices_buf, 2 * nMemHost); + alpaka::memcpy( + queue, quintupletsInCPU->lowerModuleIndices_buf, quintupletsBuffers->lowerModuleIndices_buf, 5 * nMemHost); + 
 alpaka::memcpy(queue, quintupletsInCPU->innerRadius_buf, quintupletsBuffers->innerRadius_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->bridgeRadius_buf, quintupletsBuffers->bridgeRadius_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->outerRadius_buf, quintupletsBuffers->outerRadius_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->isDup_buf, quintupletsBuffers->isDup_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->score_rphisum_buf, quintupletsBuffers->score_rphisum_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->eta_buf, quintupletsBuffers->eta_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->phi_buf, quintupletsBuffers->phi_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->chiSquared_buf, quintupletsBuffers->chiSquared_buf, nMemHost); + alpaka::memcpy(queue, quintupletsInCPU->rzChiSquared_buf, quintupletsBuffers->rzChiSquared_buf, nMemHost); + alpaka::memcpy( + queue, quintupletsInCPU->nonAnchorChiSquared_buf, quintupletsBuffers->nonAnchorChiSquared_buf, nMemHost); + alpaka::wait(queue); + } + return quintupletsInCPU; +} + +SDL::pixelTripletsBuffer* SDL::Event::getPixelTriplets() { + if (pixelTripletsInCPU == nullptr) { + // Get nPixelTriplets parameter to initialize host based pixelTripletsInCPU + auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf); + alpaka::wait(queue); + + unsigned int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); + pixelTripletsInCPU = new SDL::pixelTripletsBuffer(nPixelTriplets, devHost, queue); + pixelTripletsInCPU->setData(*pixelTripletsInCPU); + + *alpaka::getPtrNative(pixelTripletsInCPU->nPixelTriplets_buf) = nPixelTriplets; + alpaka::memcpy( + queue, pixelTripletsInCPU->totOccupancyPixelTriplets_buf, pixelTripletsBuffers->totOccupancyPixelTriplets_buf); + alpaka::memcpy(queue, pixelTripletsInCPU->rzChiSquared_buf, pixelTripletsBuffers->rzChiSquared_buf, 
 nPixelTriplets); + alpaka::memcpy( + queue, pixelTripletsInCPU->rPhiChiSquared_buf, pixelTripletsBuffers->rPhiChiSquared_buf, nPixelTriplets); + alpaka::memcpy(queue, + pixelTripletsInCPU->rPhiChiSquaredInwards_buf, + pixelTripletsBuffers->rPhiChiSquaredInwards_buf, + nPixelTriplets); + alpaka::memcpy( + queue, pixelTripletsInCPU->tripletIndices_buf, pixelTripletsBuffers->tripletIndices_buf, nPixelTriplets); + alpaka::memcpy(queue, + pixelTripletsInCPU->pixelSegmentIndices_buf, + pixelTripletsBuffers->pixelSegmentIndices_buf, + nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->pixelRadius_buf, pixelTripletsBuffers->pixelRadius_buf, nPixelTriplets); + alpaka::memcpy( + queue, pixelTripletsInCPU->tripletRadius_buf, pixelTripletsBuffers->tripletRadius_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->isDup_buf, pixelTripletsBuffers->isDup_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->eta_buf, pixelTripletsBuffers->eta_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->phi_buf, pixelTripletsBuffers->phi_buf, nPixelTriplets); + alpaka::memcpy(queue, pixelTripletsInCPU->score_buf, pixelTripletsBuffers->score_buf, nPixelTriplets); + alpaka::wait(queue); + } + return pixelTripletsInCPU; +} + +SDL::pixelQuintupletsBuffer* SDL::Event::getPixelQuintuplets() { + if (pixelQuintupletsInCPU == nullptr) { + // Get nPixelQuintuplets parameter to initialize host based pixelQuintupletsInCPU + auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf); + alpaka::wait(queue); + + unsigned int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); + pixelQuintupletsInCPU = new SDL::pixelQuintupletsBuffer(nPixelQuintuplets, devHost, queue); + pixelQuintupletsInCPU->setData(*pixelQuintupletsInCPU); + + *alpaka::getPtrNative(pixelQuintupletsInCPU->nPixelQuintuplets_buf) = nPixelQuintuplets; + alpaka::memcpy(queue, + 
pixelQuintupletsInCPU->totOccupancyPixelQuintuplets_buf, + pixelQuintupletsBuffers->totOccupancyPixelQuintuplets_buf); + alpaka::memcpy( + queue, pixelQuintupletsInCPU->rzChiSquared_buf, pixelQuintupletsBuffers->rzChiSquared_buf, nPixelQuintuplets); + alpaka::memcpy(queue, + pixelQuintupletsInCPU->rPhiChiSquared_buf, + pixelQuintupletsBuffers->rPhiChiSquared_buf, + nPixelQuintuplets); + alpaka::memcpy(queue, + pixelQuintupletsInCPU->rPhiChiSquaredInwards_buf, + pixelQuintupletsBuffers->rPhiChiSquaredInwards_buf, + nPixelQuintuplets); + alpaka::memcpy( + queue, pixelQuintupletsInCPU->pixelIndices_buf, pixelQuintupletsBuffers->pixelIndices_buf, nPixelQuintuplets); + alpaka::memcpy( + queue, pixelQuintupletsInCPU->T5Indices_buf, pixelQuintupletsBuffers->T5Indices_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->isDup_buf, pixelQuintupletsBuffers->isDup_buf, nPixelQuintuplets); + alpaka::memcpy(queue, pixelQuintupletsInCPU->score_buf, pixelQuintupletsBuffers->score_buf, nPixelQuintuplets); + alpaka::wait(queue); + } + return pixelQuintupletsInCPU; +} + +SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidates() { + if (trackCandidatesInCPU == nullptr) { + // Get nTrackCanHost parameter to initialize host based trackCandidatesInCPU + auto nTrackCanHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nTrackCanHost_buf, trackCandidatesBuffers->nTrackCandidates_buf); + alpaka::wait(queue); + + unsigned int nTrackCanHost = *alpaka::getPtrNative(nTrackCanHost_buf); + trackCandidatesInCPU = new SDL::trackCandidatesBuffer( + N_MAX_NONPIXEL_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue); + trackCandidatesInCPU->setData(*trackCandidatesInCPU); + + *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackCanHost; + alpaka::memcpy( + queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackCanHost); + alpaka::memcpy( + queue, 
trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackCanHost); + alpaka::memcpy( + queue, trackCandidatesInCPU->logicalLayers_buf, trackCandidatesBuffers->logicalLayers_buf, 7 * nTrackCanHost); + alpaka::memcpy(queue, + trackCandidatesInCPU->directObjectIndices_buf, + trackCandidatesBuffers->directObjectIndices_buf, + nTrackCanHost); + alpaka::memcpy( + queue, trackCandidatesInCPU->objectIndices_buf, trackCandidatesBuffers->objectIndices_buf, 2 * nTrackCanHost); + alpaka::memcpy(queue, + trackCandidatesInCPU->trackCandidateType_buf, + trackCandidatesBuffers->trackCandidateType_buf, + nTrackCanHost); + alpaka::wait(queue); + } + return trackCandidatesInCPU; +} + +SDL::trackCandidatesBuffer* SDL::Event::getTrackCandidatesInCMSSW() { + if (trackCandidatesInCPU == nullptr) { + // Get nTrackCanHost parameter to initialize host based trackCandidatesInCPU + auto nTrackCanHost_buf = allocBufWrapper(devHost, 1, queue); + alpaka::memcpy(queue, nTrackCanHost_buf, trackCandidatesBuffers->nTrackCandidates_buf); + alpaka::wait(queue); + + unsigned int nTrackCanHost = *alpaka::getPtrNative(nTrackCanHost_buf); + trackCandidatesInCPU = new SDL::trackCandidatesBuffer( + N_MAX_NONPIXEL_TRACK_CANDIDATES + N_MAX_PIXEL_TRACK_CANDIDATES, devHost, queue); + trackCandidatesInCPU->setData(*trackCandidatesInCPU); + + *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackCanHost; + alpaka::memcpy( + queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, 14 * nTrackCanHost); + alpaka::memcpy( + queue, trackCandidatesInCPU->pixelSeedIndex_buf, trackCandidatesBuffers->pixelSeedIndex_buf, nTrackCanHost); + alpaka::memcpy(queue, + trackCandidatesInCPU->trackCandidateType_buf, + trackCandidatesBuffers->trackCandidateType_buf, + nTrackCanHost); + alpaka::wait(queue); + } + return trackCandidatesInCPU; +} + +SDL::modulesBuffer* SDL::Event::getModules(bool isFull) { + if (modulesInCPU == nullptr) { + // The last 
input here is just a small placeholder for the allocation. + modulesInCPU = new SDL::modulesBuffer(devHost, nModules_, nPixels_); + + modulesInCPU->copyFromSrc(queue, *modulesBuffers_, isFull); + } + return modulesInCPU; +} diff --git a/RecoTracker/LSTCore/src/alpaka/Event.h b/RecoTracker/LSTCore/src/alpaka/Event.h new file mode 100644 index 0000000000000..3d301c2c65069 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Event.h @@ -0,0 +1,208 @@ +#ifndef Event_cuh +#define Event_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" +#else +#include "Constants.h" +#include "Module.h" +#include "LST.h" +#endif + +#include "Hit.h" +#include "ModuleMethods.h" +#include "Segment.h" +#include "Triplet.h" +#include "Kernels.h" +#include "Quintuplet.h" +#include "MiniDoublet.h" +#include "PixelTriplet.h" +#include "TrackCandidate.h" + +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" + +namespace SDL { + template + class Event {}; + + template <> + class Event { + private: + QueueAcc queue; + Dev devAcc; + DevHost devHost; + bool addObjects; + + std::array n_hits_by_layer_barrel_; + std::array n_hits_by_layer_endcap_; + std::array n_minidoublets_by_layer_barrel_; + std::array n_minidoublets_by_layer_endcap_; + std::array n_segments_by_layer_barrel_; + std::array n_segments_by_layer_endcap_; + std::array n_triplets_by_layer_barrel_; + std::array n_triplets_by_layer_endcap_; + std::array n_trackCandidates_by_layer_barrel_; + std::array n_trackCandidates_by_layer_endcap_; + std::array n_quintuplets_by_layer_barrel_; + std::array n_quintuplets_by_layer_endcap_; + + //Device stuff + unsigned int nTotalSegments; + struct objectRanges* rangesInGPU; + struct objectRangesBuffer* rangesBuffers; + struct hits* hitsInGPU; + struct hitsBuffer* hitsBuffers; + struct miniDoublets* mdsInGPU; + struct miniDoubletsBuffer* 
miniDoubletsBuffers; + struct segments* segmentsInGPU; + struct segmentsBuffer* segmentsBuffers; + struct triplets* tripletsInGPU; + struct tripletsBuffer* tripletsBuffers; + struct quintuplets* quintupletsInGPU; + struct quintupletsBuffer* quintupletsBuffers; + struct trackCandidates* trackCandidatesInGPU; + struct trackCandidatesBuffer* trackCandidatesBuffers; + struct pixelTriplets* pixelTripletsInGPU; + struct pixelTripletsBuffer* pixelTripletsBuffers; + struct pixelQuintuplets* pixelQuintupletsInGPU; + struct pixelQuintupletsBuffer* pixelQuintupletsBuffers; + + //CPU interface stuff + objectRangesBuffer* rangesInCPU; + hitsBuffer* hitsInCPU; + miniDoubletsBuffer* mdsInCPU; + segmentsBuffer* segmentsInCPU; + tripletsBuffer* tripletsInCPU; + trackCandidatesBuffer* trackCandidatesInCPU; + modulesBuffer* modulesInCPU; + quintupletsBuffer* quintupletsInCPU; + pixelTripletsBuffer* pixelTripletsInCPU; + pixelQuintupletsBuffer* pixelQuintupletsInCPU; + + void init(bool verbose); + + int* superbinCPU; + int8_t* pixelTypeCPU; + + // Stuff that used to be global + const uint16_t nModules_; + const uint16_t nLowerModules_; + const unsigned int nPixels_; + const std::shared_ptr> modulesBuffers_; + const std::shared_ptr pixelMapping_; + const std::shared_ptr> endcapGeometry_; + + public: + // Constructor used for CMSSW integration. Uses an external queue. 
+ template + Event(bool verbose, TQueue const& q, const LSTESDeviceData* deviceESData) + : queue(q), + devAcc(alpaka::getDev(q)), + devHost(cms::alpakatools::host()), + nModules_(deviceESData->nModules), + nLowerModules_(deviceESData->nLowerModules), + nPixels_(deviceESData->nPixels), + modulesBuffers_(deviceESData->modulesBuffers), + pixelMapping_(deviceESData->pixelMapping), + endcapGeometry_(deviceESData->endcapGeometry) { + init(verbose); + } + void resetEvent(); + + void addHitToEvent( + std::vector x, + std::vector y, + std::vector z, + std::vector detId, + std::vector idxInNtuple); //call the appropriate hit function, then increment the counter here + void addPixelSegmentToEvent(std::vector hitIndices0, + std::vector hitIndices1, + std::vector hitIndices2, + std::vector hitIndices3, + std::vector dPhiChange, + std::vector ptIn, + std::vector ptErr, + std::vector px, + std::vector py, + std::vector pz, + std::vector eta, + std::vector etaErr, + std::vector phi, + std::vector charge, + std::vector seedIdx, + std::vector superbin, + std::vector pixelType, + std::vector isQuad); + + // functions that map the objects to the appropriate modules + void addMiniDoubletsToEventExplicit(); + void addSegmentsToEventExplicit(); + void addTripletsToEventExplicit(); + void addQuintupletsToEventExplicit(); + void resetObjectsInModule(); + + void createMiniDoublets(); + void createSegmentsWithModuleMap(); + void createTriplets(); + void createPixelTracklets(); + void createPixelTrackletsWithMap(); + void createTrackCandidates(); + void createExtendedTracks(); + void createQuintuplets(); + void createPixelTriplets(); + void createPixelQuintuplets(); + void pixelLineSegmentCleaning(); + + unsigned int getNumberOfHits(); + unsigned int getNumberOfHitsByLayer(unsigned int layer); + unsigned int getNumberOfHitsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfHitsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfMiniDoublets(); + unsigned int 
getNumberOfMiniDoubletsByLayer(unsigned int layer); + unsigned int getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfSegments(); + unsigned int getNumberOfSegmentsByLayer(unsigned int layer); + unsigned int getNumberOfSegmentsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfSegmentsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfTriplets(); + unsigned int getNumberOfTripletsByLayer(unsigned int layer); + unsigned int getNumberOfTripletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfTripletsByLayerEndcap(unsigned int layer); + + int getNumberOfTrackCandidates(); + int getNumberOfPixelTrackCandidates(); + int getNumberOfPT5TrackCandidates(); + int getNumberOfPT3TrackCandidates(); + int getNumberOfT5TrackCandidates(); + int getNumberOfPLSTrackCandidates(); + + unsigned int getNumberOfQuintuplets(); + unsigned int getNumberOfQuintupletsByLayer(unsigned int layer); + unsigned int getNumberOfQuintupletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfQuintupletsByLayerEndcap(unsigned int layer); + + int getNumberOfPixelTriplets(); + int getNumberOfPixelQuintuplets(); + + objectRangesBuffer* getRanges(); + hitsBuffer* getHits(); + hitsBuffer* getHitsInCMSSW(); + miniDoubletsBuffer* getMiniDoublets(); + segmentsBuffer* getSegments(); + tripletsBuffer* getTriplets(); + quintupletsBuffer* getQuintuplets(); + trackCandidatesBuffer* getTrackCandidates(); + trackCandidatesBuffer* getTrackCandidatesInCMSSW(); + pixelTripletsBuffer* getPixelTriplets(); + pixelQuintupletsBuffer* getPixelQuintuplets(); + modulesBuffer* getModules(bool isFull = false); + }; + +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Hit.h b/RecoTracker/LSTCore/src/alpaka/Hit.h new file mode 100644 index 0000000000000..dfe9c4c56ef95 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Hit.h @@ -0,0 +1,272 @@ +#ifndef Hit_cuh 
+#define Hit_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +namespace SDL { + struct hits { + unsigned int* nHits; + float* xs; + float* ys; + float* zs; + uint16_t* moduleIndices; + unsigned int* idxs; + unsigned int* detid; + float* rts; + float* phis; + float* etas; + float* highEdgeXs; + float* highEdgeYs; + float* lowEdgeXs; + float* lowEdgeYs; + int* hitRanges; + int* hitRangesLower; + int* hitRangesUpper; + int8_t* hitRangesnLower; + int8_t* hitRangesnUpper; + + template + void setData(TBuff& hitsbuf) { + nHits = alpaka::getPtrNative(hitsbuf.nHits_buf); + xs = alpaka::getPtrNative(hitsbuf.xs_buf); + ys = alpaka::getPtrNative(hitsbuf.ys_buf); + zs = alpaka::getPtrNative(hitsbuf.zs_buf); + moduleIndices = alpaka::getPtrNative(hitsbuf.moduleIndices_buf); + idxs = alpaka::getPtrNative(hitsbuf.idxs_buf); + detid = alpaka::getPtrNative(hitsbuf.detid_buf); + rts = alpaka::getPtrNative(hitsbuf.rts_buf); + phis = alpaka::getPtrNative(hitsbuf.phis_buf); + etas = alpaka::getPtrNative(hitsbuf.etas_buf); + highEdgeXs = alpaka::getPtrNative(hitsbuf.highEdgeXs_buf); + highEdgeYs = alpaka::getPtrNative(hitsbuf.highEdgeYs_buf); + lowEdgeXs = alpaka::getPtrNative(hitsbuf.lowEdgeXs_buf); + lowEdgeYs = alpaka::getPtrNative(hitsbuf.lowEdgeYs_buf); + hitRanges = alpaka::getPtrNative(hitsbuf.hitRanges_buf); + hitRangesLower = alpaka::getPtrNative(hitsbuf.hitRangesLower_buf); + hitRangesUpper = alpaka::getPtrNative(hitsbuf.hitRangesUpper_buf); + hitRangesnLower = alpaka::getPtrNative(hitsbuf.hitRangesnLower_buf); + hitRangesnUpper = alpaka::getPtrNative(hitsbuf.hitRangesnUpper_buf); + } + }; + + template + struct hitsBuffer : hits { + Buf nHits_buf; + Buf xs_buf; + Buf ys_buf; + Buf zs_buf; + Buf moduleIndices_buf; + Buf idxs_buf; + Buf detid_buf; + Buf rts_buf; + Buf phis_buf; + Buf etas_buf; + Buf highEdgeXs_buf; 
+ Buf highEdgeYs_buf; + Buf lowEdgeXs_buf; + Buf lowEdgeYs_buf; + Buf hitRanges_buf; + Buf hitRangesLower_buf; + Buf hitRangesUpper_buf; + Buf hitRangesnLower_buf; + Buf hitRangesnUpper_buf; + + template + hitsBuffer(unsigned int nModules, unsigned int nMaxHits, TDevAcc const& devAccIn, TQueue& queue) + : nHits_buf(allocBufWrapper(devAccIn, 1u, queue)), + xs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + ys_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + zs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + idxs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + detid_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + rts_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + phis_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + etas_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + highEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + highEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + lowEdgeXs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + lowEdgeYs_buf(allocBufWrapper(devAccIn, nMaxHits, queue)), + hitRanges_buf(allocBufWrapper(devAccIn, nModules * 2, queue)), + hitRangesLower_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesUpper_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesnLower_buf(allocBufWrapper(devAccIn, nModules, queue)), + hitRangesnUpper_buf(allocBufWrapper(devAccIn, nModules, queue)) { + alpaka::memset(queue, hitRanges_buf, 0xff); + alpaka::memset(queue, hitRangesLower_buf, 0xff); + alpaka::memset(queue, hitRangesUpper_buf, 0xff); + alpaka::memset(queue, hitRangesnLower_buf, 0xff); + alpaka::memset(queue, hitRangesnUpper_buf, 0xff); + alpaka::wait(queue); + } + }; + + // Alpaka does not support log10 natively right now. 
+ template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float temp_log10(TAcc const& acc, float val) { + constexpr float ln10 = 2.302585093f; // precomputed ln(10) + return alpaka::math::log(acc, val) / ln10; + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float eta(TAcc const& acc, float x, float y, float z) { + float r3 = alpaka::math::sqrt(acc, x * x + y * y + z * z); + float rt = alpaka::math::sqrt(acc, x * x + y * y); + float eta = ((z > 0) - (z < 0)) * alpaka::math::acosh(acc, r3 / rt); + return eta; + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float phi_mpi_pi(TAcc const& acc, float x) { + if (alpaka::math::abs(acc, x) <= float(M_PI)) + return x; + + constexpr float o2pi = 1.f / (2.f * float(M_PI)); + float n = alpaka::math::round(acc, x * o2pi); + return x - n * float(2.f * float(M_PI)); + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float phi(TAcc const& acc, float x, float y) { + return phi_mpi_pi(acc, float(M_PI) + alpaka::math::atan2(acc, -y, -x)); + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float deltaPhi(TAcc const& acc, float x1, float y1, float x2, float y2) { + float phi1 = phi(acc, x1, y1); + float phi2 = phi(acc, x2, y2); + return phi_mpi_pi(acc, (phi2 - phi1)); + }; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float deltaPhiChange(TAcc const& acc, float x1, float y1, float x2, float y2) { + return deltaPhi(acc, x1, y1, x2 - x1, y2 - y1); + }; + + // Alpaka: This function is not yet implemented directly in Alpaka. + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float copysignf(float a, float b) { + int sign_a = (a < 0) ? -1 : 1; + int sign_b = (b < 0) ? 
-1 : 1; + return sign_a * sign_b * a; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float calculate_dPhi(float phi1, float phi2) { + // Calculate dPhi + float dPhi = phi1 - phi2; + + // Normalize dPhi to be between -pi and pi + if (dPhi > float(M_PI)) { + dPhi -= 2 * float(M_PI); + } else if (dPhi < -float(M_PI)) { + dPhi += 2 * float(M_PI); + } + + return dPhi; + }; + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search(const unsigned int* data, // Array that we are searching over + unsigned int search_val, // Value we want to find in data array + unsigned int ndata) // Number of elements in data array + { + unsigned int low = 0; + unsigned int high = ndata - 1; + + while (low <= high) { + unsigned int mid = (low + high) / 2; + unsigned int test_val = data[mid]; + if (test_val == search_val) + return mid; + else if (test_val > search_val) + high = mid - 1; + else + low = mid + 1; + } + // Couldn't find search value in array. + return -1; + }; + + struct moduleRangesKernel { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::hits hitsInGPU, + int const& nLowerModules) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (int lowerIndex = globalThreadIdx[2]; lowerIndex < nLowerModules; lowerIndex += gridThreadExtent[2]) { + uint16_t upperIndex = modulesInGPU.partnerModuleIndices[lowerIndex]; + if (hitsInGPU.hitRanges[lowerIndex * 2] != -1 && hitsInGPU.hitRanges[upperIndex * 2] != -1) { + hitsInGPU.hitRangesLower[lowerIndex] = hitsInGPU.hitRanges[lowerIndex * 2]; + hitsInGPU.hitRangesUpper[lowerIndex] = hitsInGPU.hitRanges[upperIndex * 2]; + hitsInGPU.hitRangesnLower[lowerIndex] = + hitsInGPU.hitRanges[lowerIndex * 2 + 1] - hitsInGPU.hitRanges[lowerIndex * 2] + 1; + hitsInGPU.hitRangesnUpper[lowerIndex] = + hitsInGPU.hitRanges[upperIndex * 2 + 1] - hitsInGPU.hitRanges[upperIndex * 2] + 1; + } + } + } + }; + + struct hitLoopKernel { + 
template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + uint16_t Endcap, // Integer corresponding to endcap in module subdets + uint16_t TwoS, // Integer corresponding to TwoS in moduleType + unsigned int nModules, // Number of modules + unsigned int nEndCapMap, // Number of elements in endcap map + const unsigned int* geoMapDetId, // DetId's from endcap map + const float* geoMapPhi, // Phi values from endcap map + struct SDL::modules modulesInGPU, + struct SDL::hits hitsInGPU, + unsigned int const& nHits) const // Total number of hits in event + { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + for (unsigned int ihit = globalThreadIdx[2]; ihit < nHits; ihit += gridThreadExtent[2]) { + float ihit_x = hitsInGPU.xs[ihit]; + float ihit_y = hitsInGPU.ys[ihit]; + float ihit_z = hitsInGPU.zs[ihit]; + int iDetId = hitsInGPU.detid[ihit]; + + hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x * ihit_x + ihit_y * ihit_y); + hitsInGPU.phis[ihit] = SDL::phi(acc, ihit_x, ihit_y); + hitsInGPU.etas[ihit] = + ((ihit_z > 0) - (ihit_z < 0)) * + alpaka::math::acosh( + acc, + alpaka::math::sqrt(acc, ihit_x * ihit_x + ihit_y * ihit_y + ihit_z * ihit_z) / hitsInGPU.rts[ihit]); + int found_index = binary_search(modulesInGPU.mapdetId, iDetId, nModules); + uint16_t lastModuleIndex = modulesInGPU.mapIdx[found_index]; + + hitsInGPU.moduleIndices[ihit] = lastModuleIndex; + + if (modulesInGPU.subdets[lastModuleIndex] == Endcap && modulesInGPU.moduleType[lastModuleIndex] == TwoS) { + found_index = binary_search(geoMapDetId, iDetId, nEndCapMap); + float phi = geoMapPhi[found_index]; + float cos_phi = alpaka::math::cos(acc, phi); + hitsInGPU.highEdgeXs[ihit] = ihit_x + 2.5f * cos_phi; + hitsInGPU.lowEdgeXs[ihit] = ihit_x - 2.5f * cos_phi; + float sin_phi = alpaka::math::sin(acc, phi); + hitsInGPU.highEdgeYs[ihit] = ihit_y + 2.5f * sin_phi; + hitsInGPU.lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi; + } + // Need to set initial value 
if index hasn't been seen before. + int old = alpaka::atomicOp( + acc, &(hitsInGPU.hitRanges[lastModuleIndex * 2]), -1, static_cast(ihit)); + // For subsequent visits, stores the min value. + if (old != -1) + alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2], static_cast(ihit)); + + alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2 + 1], static_cast(ihit)); + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h new file mode 100644 index 0000000000000..334c8e1a9eb8a --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h @@ -0,0 +1,475 @@ +#ifndef Kernels_cuh +#define Kernels_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "Hit.h" +#include "MiniDoublet.h" +#include "Segment.h" +#include "Triplet.h" +#include "Quintuplet.h" +#include "PixelTriplet.h" + +namespace SDL { + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmQuintupletFromMemory(struct SDL::quintuplets& quintupletsInGPU, + unsigned int quintupletIndex, + bool secondpass = false) { + quintupletsInGPU.isDup[quintupletIndex] |= 1 + secondpass; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelTripletFromMemory(struct SDL::pixelTriplets& pixelTripletsInGPU, + unsigned int pixelTripletIndex) { + pixelTripletsInGPU.isDup[pixelTripletIndex] = true; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelQuintupletFromMemory(struct SDL::pixelQuintuplets& pixelQuintupletsInGPU, + unsigned int pixelQuintupletIndex) { + pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = true; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(struct SDL::segments& segmentsInGPU, + unsigned int pixelSegmentArrayIndex, + bool secondpass = false) { + segmentsInGPU.isDup[pixelSegmentArrayIndex] |= 1 + secondpass; + }; + + ALPAKA_FN_ACC 
ALPAKA_FN_INLINE int checkHitsT5(unsigned int ix, + unsigned int jx, + struct SDL::quintuplets& quintupletsInGPU) { + unsigned int hits1[10]; + unsigned int hits2[10]; + + hits1[0] = quintupletsInGPU.hitIndices[10 * ix]; + hits1[1] = quintupletsInGPU.hitIndices[10 * ix + 1]; + hits1[2] = quintupletsInGPU.hitIndices[10 * ix + 2]; + hits1[3] = quintupletsInGPU.hitIndices[10 * ix + 3]; + hits1[4] = quintupletsInGPU.hitIndices[10 * ix + 4]; + hits1[5] = quintupletsInGPU.hitIndices[10 * ix + 5]; + hits1[6] = quintupletsInGPU.hitIndices[10 * ix + 6]; + hits1[7] = quintupletsInGPU.hitIndices[10 * ix + 7]; + hits1[8] = quintupletsInGPU.hitIndices[10 * ix + 8]; + hits1[9] = quintupletsInGPU.hitIndices[10 * ix + 9]; + + hits2[0] = quintupletsInGPU.hitIndices[10 * jx]; + hits2[1] = quintupletsInGPU.hitIndices[10 * jx + 1]; + hits2[2] = quintupletsInGPU.hitIndices[10 * jx + 2]; + hits2[3] = quintupletsInGPU.hitIndices[10 * jx + 3]; + hits2[4] = quintupletsInGPU.hitIndices[10 * jx + 4]; + hits2[5] = quintupletsInGPU.hitIndices[10 * jx + 5]; + hits2[6] = quintupletsInGPU.hitIndices[10 * jx + 6]; + hits2[7] = quintupletsInGPU.hitIndices[10 * jx + 7]; + hits2[8] = quintupletsInGPU.hitIndices[10 * jx + 8]; + hits2[9] = quintupletsInGPU.hitIndices[10 * jx + 9]; + + int nMatched = 0; + for (int i = 0; i < 10; i++) { + bool matched = false; + for (int j = 0; j < 10; j++) { + if (hits1[i] == hits2[j]) { + matched = true; + break; + } + } + if (matched) { + nMatched++; + } + } + return nMatched; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkHitspT5(unsigned int ix, + unsigned int jx, + struct SDL::pixelQuintuplets& pixelQuintupletsInGPU) { + unsigned int hits1[14]; + unsigned int hits2[14]; + + hits1[0] = pixelQuintupletsInGPU.hitIndices[14 * ix]; + hits1[1] = pixelQuintupletsInGPU.hitIndices[14 * ix + 1]; + hits1[2] = pixelQuintupletsInGPU.hitIndices[14 * ix + 2]; + hits1[3] = pixelQuintupletsInGPU.hitIndices[14 * ix + 3]; + hits1[4] = pixelQuintupletsInGPU.hitIndices[14 * ix + 4]; 
+ hits1[5] = pixelQuintupletsInGPU.hitIndices[14 * ix + 5]; + hits1[6] = pixelQuintupletsInGPU.hitIndices[14 * ix + 6]; + hits1[7] = pixelQuintupletsInGPU.hitIndices[14 * ix + 7]; + hits1[8] = pixelQuintupletsInGPU.hitIndices[14 * ix + 8]; + hits1[9] = pixelQuintupletsInGPU.hitIndices[14 * ix + 9]; + hits1[10] = pixelQuintupletsInGPU.hitIndices[14 * ix + 10]; + hits1[11] = pixelQuintupletsInGPU.hitIndices[14 * ix + 11]; + hits1[12] = pixelQuintupletsInGPU.hitIndices[14 * ix + 12]; + hits1[13] = pixelQuintupletsInGPU.hitIndices[14 * ix + 13]; + + hits2[0] = pixelQuintupletsInGPU.hitIndices[14 * jx]; + hits2[1] = pixelQuintupletsInGPU.hitIndices[14 * jx + 1]; + hits2[2] = pixelQuintupletsInGPU.hitIndices[14 * jx + 2]; + hits2[3] = pixelQuintupletsInGPU.hitIndices[14 * jx + 3]; + hits2[4] = pixelQuintupletsInGPU.hitIndices[14 * jx + 4]; + hits2[5] = pixelQuintupletsInGPU.hitIndices[14 * jx + 5]; + hits2[6] = pixelQuintupletsInGPU.hitIndices[14 * jx + 6]; + hits2[7] = pixelQuintupletsInGPU.hitIndices[14 * jx + 7]; + hits2[8] = pixelQuintupletsInGPU.hitIndices[14 * jx + 8]; + hits2[9] = pixelQuintupletsInGPU.hitIndices[14 * jx + 9]; + hits2[10] = pixelQuintupletsInGPU.hitIndices[14 * jx + 10]; + hits2[11] = pixelQuintupletsInGPU.hitIndices[14 * jx + 11]; + hits2[12] = pixelQuintupletsInGPU.hitIndices[14 * jx + 12]; + hits2[13] = pixelQuintupletsInGPU.hitIndices[14 * jx + 13]; + + int nMatched = 0; + for (int i = 0; i < 14; i++) { + bool matched = false; + for (int j = 0; j < 14; j++) { + if (hits1[i] == hits2[j]) { + matched = true; + break; + } + } + if (matched) { + nMatched++; + } + } + return nMatched; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void checkHitspT3(unsigned int ix, + unsigned int jx, + struct SDL::pixelTriplets& pixelTripletsInGPU, + int* matched) { + int phits1[4] = {-1, -1, -1, -1}; + int phits2[4] = {-1, -1, -1, -1}; + phits1[0] = pixelTripletsInGPU.hitIndices[10 * ix]; + phits1[1] = pixelTripletsInGPU.hitIndices[10 * ix + 1]; + phits1[2] = 
pixelTripletsInGPU.hitIndices[10 * ix + 2]; + phits1[3] = pixelTripletsInGPU.hitIndices[10 * ix + 3]; + + phits2[0] = pixelTripletsInGPU.hitIndices[10 * jx]; + phits2[1] = pixelTripletsInGPU.hitIndices[10 * jx + 1]; + phits2[2] = pixelTripletsInGPU.hitIndices[10 * jx + 2]; + phits2[3] = pixelTripletsInGPU.hitIndices[10 * jx + 3]; + + int npMatched = 0; + for (int i = 0; i < 4; i++) { + bool pmatched = false; + for (int j = 0; j < 4; j++) { + if (phits1[i] == phits2[j]) { + pmatched = true; + break; + } + } + if (pmatched) { + npMatched++; + } + } + + int hits1[6] = {-1, -1, -1, -1, -1, -1}; + int hits2[6] = {-1, -1, -1, -1, -1, -1}; + hits1[0] = pixelTripletsInGPU.hitIndices[10 * ix + 4]; + hits1[1] = pixelTripletsInGPU.hitIndices[10 * ix + 5]; + hits1[2] = pixelTripletsInGPU.hitIndices[10 * ix + 6]; + hits1[3] = pixelTripletsInGPU.hitIndices[10 * ix + 7]; + hits1[4] = pixelTripletsInGPU.hitIndices[10 * ix + 8]; + hits1[5] = pixelTripletsInGPU.hitIndices[10 * ix + 9]; + + hits2[0] = pixelTripletsInGPU.hitIndices[10 * jx + 4]; + hits2[1] = pixelTripletsInGPU.hitIndices[10 * jx + 5]; + hits2[2] = pixelTripletsInGPU.hitIndices[10 * jx + 6]; + hits2[3] = pixelTripletsInGPU.hitIndices[10 * jx + 7]; + hits2[4] = pixelTripletsInGPU.hitIndices[10 * jx + 8]; + hits2[5] = pixelTripletsInGPU.hitIndices[10 * jx + 9]; + + int nMatched = 0; + for (int i = 0; i < 6; i++) { + bool tmatched = false; + for (int j = 0; j < 6; j++) { + if (hits1[i] == hits2[j]) { + tmatched = true; + break; + } + } + if (tmatched) { + nMatched++; + } + } + + matched[0] = npMatched; + matched[1] = nMatched; + }; + + struct removeDupQuintupletsInGPUAfterBuild { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (unsigned int lowmod = globalThreadIdx[0]; 
lowmod < *modulesInGPU.nLowerModules; + lowmod += gridThreadExtent[0]) { + unsigned int nQuintuplets_lowmod = quintupletsInGPU.nQuintuplets[lowmod]; + int quintupletModuleIndices_lowmod = rangesInGPU.quintupletModuleIndices[lowmod]; + + for (unsigned int ix1 = globalThreadIdx[1]; ix1 < nQuintuplets_lowmod; ix1 += gridThreadExtent[1]) { + unsigned int ix = quintupletModuleIndices_lowmod + ix1; + float eta1 = __H2F(quintupletsInGPU.eta[ix]); + float phi1 = __H2F(quintupletsInGPU.phi[ix]); + float score_rphisum1 = __H2F(quintupletsInGPU.score_rphisum[ix]); + + for (unsigned int jx1 = globalThreadIdx[2] + ix1 + 1; jx1 < nQuintuplets_lowmod; jx1 += gridThreadExtent[2]) { + unsigned int jx = quintupletModuleIndices_lowmod + jx1; + + float eta2 = __H2F(quintupletsInGPU.eta[jx]); + float phi2 = __H2F(quintupletsInGPU.phi[jx]); + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + float score_rphisum2 = __H2F(quintupletsInGPU.score_rphisum[jx]); + + if (dEta > 0.1f) + continue; + + if (alpaka::math::abs(acc, dPhi) > 0.1f) + continue; + + int nMatched = checkHitsT5(ix, jx, quintupletsInGPU); + if (nMatched >= 7) { + if (score_rphisum1 >= score_rphisum2) { + rmQuintupletFromMemory(quintupletsInGPU, ix); + } else { + rmQuintupletFromMemory(quintupletsInGPU, jx); + } + } + } + } + } + } + }; + + struct removeDupQuintupletsInGPUBeforeTC { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (unsigned int lowmodIdx1 = globalThreadIdx[1]; lowmodIdx1 < *(rangesInGPU.nEligibleT5Modules); + lowmodIdx1 += gridThreadExtent[1]) { + uint16_t lowmod1 = rangesInGPU.indicesOfEligibleT5Modules[lowmodIdx1]; + unsigned int nQuintuplets_lowmod1 = quintupletsInGPU.nQuintuplets[lowmod1]; + if (nQuintuplets_lowmod1 == 0) + continue; + + 
unsigned int quintupletModuleIndices_lowmod1 = rangesInGPU.quintupletModuleIndices[lowmod1]; + + for (unsigned int lowmodIdx2 = globalThreadIdx[2] + lowmodIdx1; lowmodIdx2 < *(rangesInGPU.nEligibleT5Modules); + lowmodIdx2 += gridThreadExtent[2]) { + uint16_t lowmod2 = rangesInGPU.indicesOfEligibleT5Modules[lowmodIdx2]; + unsigned int nQuintuplets_lowmod2 = quintupletsInGPU.nQuintuplets[lowmod2]; + if (nQuintuplets_lowmod2 == 0) + continue; + + unsigned int quintupletModuleIndices_lowmod2 = rangesInGPU.quintupletModuleIndices[lowmod2]; + + for (unsigned int ix1 = 0; ix1 < nQuintuplets_lowmod1; ix1 += 1) { + unsigned int ix = quintupletModuleIndices_lowmod1 + ix1; + if (quintupletsInGPU.partOfPT5[ix] || (quintupletsInGPU.isDup[ix] & 1)) + continue; + + for (unsigned int jx1 = 0; jx1 < nQuintuplets_lowmod2; jx1++) { + unsigned int jx = quintupletModuleIndices_lowmod2 + jx1; + if (ix == jx) + continue; + + if (quintupletsInGPU.partOfPT5[jx] || (quintupletsInGPU.isDup[jx] & 1)) + continue; + + float eta1 = __H2F(quintupletsInGPU.eta[ix]); + float phi1 = __H2F(quintupletsInGPU.phi[ix]); + float score_rphisum1 = __H2F(quintupletsInGPU.score_rphisum[ix]); + + float eta2 = __H2F(quintupletsInGPU.eta[jx]); + float phi2 = __H2F(quintupletsInGPU.phi[jx]); + float score_rphisum2 = __H2F(quintupletsInGPU.score_rphisum[jx]); + + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + if (dEta > 0.1f) + continue; + + if (alpaka::math::abs(acc, dPhi) > 0.1f) + continue; + + float dR2 = dEta * dEta + dPhi * dPhi; + int nMatched = checkHitsT5(ix, jx, quintupletsInGPU); + if (dR2 < 0.001f || nMatched >= 5) { + if (score_rphisum1 > score_rphisum2) { + rmQuintupletFromMemory(quintupletsInGPU, ix, true); + } else if (score_rphisum1 < score_rphisum2) { + rmQuintupletFromMemory(quintupletsInGPU, jx, true); + } else { + rmQuintupletFromMemory(quintupletsInGPU, (ix < jx ? 
ix : jx), true); + } + } + } + } + } + } + } + }; + + struct removeDupPixelTripletsInGPUFromMap { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, struct SDL::pixelTriplets pixelTripletsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (unsigned int ix = globalThreadIdx[1]; ix < *pixelTripletsInGPU.nPixelTriplets; ix += gridThreadExtent[1]) { + for (unsigned int jx = globalThreadIdx[2]; jx < *pixelTripletsInGPU.nPixelTriplets; jx += gridThreadExtent[2]) { + if (ix == jx) + continue; + + int nMatched[2]; + checkHitspT3(ix, jx, pixelTripletsInGPU, nMatched); + if ((nMatched[0] + nMatched[1]) >= 5) { + // Check the layers + if (pixelTripletsInGPU.logicalLayers[5 * jx + 2] < pixelTripletsInGPU.logicalLayers[5 * ix + 2]) { + rmPixelTripletFromMemory(pixelTripletsInGPU, ix); + break; + } else if (pixelTripletsInGPU.logicalLayers[5 * ix + 2] == pixelTripletsInGPU.logicalLayers[5 * jx + 2] && + __H2F(pixelTripletsInGPU.score[ix]) > __H2F(pixelTripletsInGPU.score[jx])) { + rmPixelTripletFromMemory(pixelTripletsInGPU, ix); + break; + } else if (pixelTripletsInGPU.logicalLayers[5 * ix + 2] == pixelTripletsInGPU.logicalLayers[5 * jx + 2] && + (__H2F(pixelTripletsInGPU.score[ix]) == __H2F(pixelTripletsInGPU.score[jx])) && (ix < jx)) { + rmPixelTripletFromMemory(pixelTripletsInGPU, ix); + break; + } + } + } + } + } + }; + + struct removeDupPixelQuintupletsInGPUFromMap { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, struct SDL::pixelQuintuplets pixelQuintupletsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + unsigned int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets; + for (unsigned int ix = globalThreadIdx[1]; ix < nPixelQuintuplets; ix += gridThreadExtent[1]) { + float score1 = __H2F(pixelQuintupletsInGPU.score[ix]); + for (unsigned int jx = globalThreadIdx[2]; jx < 
nPixelQuintuplets; jx += gridThreadExtent[2]) { + if (ix == jx) + continue; + + int nMatched = checkHitspT5(ix, jx, pixelQuintupletsInGPU); + float score2 = __H2F(pixelQuintupletsInGPU.score[jx]); + if (nMatched >= 7) { + if (score1 > score2 or ((score1 == score2) and (ix > jx))) { + rmPixelQuintupletFromMemory(pixelQuintupletsInGPU, ix); + break; + } + } + } + } + } + }; + + struct checkHitspLS { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::segments segmentsInGPU, + bool secondpass) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + int pixelModuleIndex = *modulesInGPU.nLowerModules; + unsigned int nPixelSegments = segmentsInGPU.nSegments[pixelModuleIndex]; + + if (nPixelSegments > N_MAX_PIXEL_SEGMENTS_PER_MODULE) + nPixelSegments = N_MAX_PIXEL_SEGMENTS_PER_MODULE; + + for (unsigned int ix = globalThreadIdx[1]; ix < nPixelSegments; ix += gridThreadExtent[1]) { + if (secondpass && (!segmentsInGPU.isQuad[ix] || (segmentsInGPU.isDup[ix] & 1))) + continue; + + unsigned int phits1[4]; + phits1[0] = segmentsInGPU.pLSHitsIdxs[ix].x; + phits1[1] = segmentsInGPU.pLSHitsIdxs[ix].y; + phits1[2] = segmentsInGPU.pLSHitsIdxs[ix].z; + phits1[3] = segmentsInGPU.pLSHitsIdxs[ix].w; + float eta_pix1 = segmentsInGPU.eta[ix]; + float phi_pix1 = segmentsInGPU.phi[ix]; + + for (unsigned int jx = ix + 1 + globalThreadIdx[2]; jx < nPixelSegments; jx += gridThreadExtent[2]) { + float eta_pix2 = segmentsInGPU.eta[jx]; + float phi_pix2 = segmentsInGPU.phi[jx]; + + if (alpaka::math::abs(acc, eta_pix2 - eta_pix1) > 0.1f) + continue; + + if (secondpass && (!segmentsInGPU.isQuad[jx] || (segmentsInGPU.isDup[jx] & 1))) + continue; + + int8_t quad_diff = segmentsInGPU.isQuad[ix] - segmentsInGPU.isQuad[jx]; + float score_diff = segmentsInGPU.score[ix] - segmentsInGPU.score[jx]; + // Always keep quads over trips. 
If they are the same, we want the object with better score + int idxToRemove; + if (quad_diff > 0) + idxToRemove = jx; + else if (quad_diff < 0) + idxToRemove = ix; + else if (score_diff < 0) + idxToRemove = jx; + else if (score_diff > 0) + idxToRemove = ix; + else + idxToRemove = ix; + + unsigned int phits2[4]; + phits2[0] = segmentsInGPU.pLSHitsIdxs[jx].x; + phits2[1] = segmentsInGPU.pLSHitsIdxs[jx].y; + phits2[2] = segmentsInGPU.pLSHitsIdxs[jx].z; + phits2[3] = segmentsInGPU.pLSHitsIdxs[jx].w; + + int npMatched = 0; + for (int i = 0; i < 4; i++) { + bool pmatched = false; + for (int j = 0; j < 4; j++) { + if (phits1[i] == phits2[j]) { + pmatched = true; + break; + } + } + if (pmatched) { + npMatched++; + // Only one hit is enough + if (secondpass) + break; + } + } + if (npMatched >= 3) { + rmPixelSegmentFromMemory(segmentsInGPU, idxToRemove, secondpass); + } + if (secondpass) { + float dEta = alpaka::math::abs(acc, eta_pix1 - eta_pix2); + float dPhi = SDL::calculate_dPhi(phi_pix1, phi_pix2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if ((npMatched >= 1) || (dR2 < 1e-5f)) { + rmPixelSegmentFromMemory(segmentsInGPU, idxToRemove, secondpass); + } + } + } + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/LST.dev.cc b/RecoTracker/LSTCore/src/alpaka/LST.dev.cc new file mode 100644 index 0000000000000..9eb11503123df --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/LST.dev.cc @@ -0,0 +1,427 @@ +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" +#else +#include "LST.h" +#endif + +#include "Event.h" + +#include "Math/Vector3D.h" +using XYZVector = ROOT::Math::XYZVector; + +void SDL::LST::run(SDL::QueueAcc& queue, + bool verbose, + const LSTESDeviceData* deviceESData, + const std::vector see_px, + const std::vector see_py, + const std::vector see_pz, + const std::vector see_dxy, + const std::vector see_dz, + const std::vector see_ptErr, + const std::vector see_etaErr, + const std::vector 
see_stateTrajGlbX, + const std::vector see_stateTrajGlbY, + const std::vector see_stateTrajGlbZ, + const std::vector see_stateTrajGlbPx, + const std::vector see_stateTrajGlbPy, + const std::vector see_stateTrajGlbPz, + const std::vector see_q, + const std::vector> see_hitIdx, + const std::vector ph2_detId, + const std::vector ph2_x, + const std::vector ph2_y, + const std::vector ph2_z) { + auto event = SDL::Event(verbose, queue, deviceESData); + prepareInput(see_px, + see_py, + see_pz, + see_dxy, + see_dz, + see_ptErr, + see_etaErr, + see_stateTrajGlbX, + see_stateTrajGlbY, + see_stateTrajGlbZ, + see_stateTrajGlbPx, + see_stateTrajGlbPy, + see_stateTrajGlbPz, + see_q, + see_hitIdx, + ph2_detId, + ph2_x, + ph2_y, + ph2_z); + + event.addHitToEvent(in_trkX_, in_trkY_, in_trkZ_, in_hitId_, in_hitIdxs_); + event.addPixelSegmentToEvent(in_hitIndices_vec0_, + in_hitIndices_vec1_, + in_hitIndices_vec2_, + in_hitIndices_vec3_, + in_deltaPhi_vec_, + in_ptIn_vec_, + in_ptErr_vec_, + in_px_vec_, + in_py_vec_, + in_pz_vec_, + in_eta_vec_, + in_etaErr_vec_, + in_phi_vec_, + in_charge_vec_, + in_seedIdx_vec_, + in_superbin_vec_, + in_pixelType_vec_, + in_isQuad_vec_); + event.createMiniDoublets(); + if (verbose) { + printf("# of Mini-doublets produced: %d\n", event.getNumberOfMiniDoublets()); + printf("# of Mini-doublets produced barrel layer 1: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(0)); + printf("# of Mini-doublets produced barrel layer 2: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(1)); + printf("# of Mini-doublets produced barrel layer 3: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(2)); + printf("# of Mini-doublets produced barrel layer 4: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(3)); + printf("# of Mini-doublets produced barrel layer 5: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(4)); + printf("# of Mini-doublets produced barrel layer 6: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(5)); + printf("# of Mini-doublets produced endcap 
layer 1: %d\n", event.getNumberOfMiniDoubletsByLayerEndcap(0)); + printf("# of Mini-doublets produced endcap layer 2: %d\n", event.getNumberOfMiniDoubletsByLayerEndcap(1)); + printf("# of Mini-doublets produced endcap layer 3: %d\n", event.getNumberOfMiniDoubletsByLayerEndcap(2)); + printf("# of Mini-doublets produced endcap layer 4: %d\n", event.getNumberOfMiniDoubletsByLayerEndcap(3)); + printf("# of Mini-doublets produced endcap layer 5: %d\n", event.getNumberOfMiniDoubletsByLayerEndcap(4)); + } + + event.createSegmentsWithModuleMap(); + if (verbose) { + printf("# of Segments produced: %d\n", event.getNumberOfSegments()); + printf("# of Segments produced layer 1-2: %d\n", event.getNumberOfSegmentsByLayerBarrel(0)); + printf("# of Segments produced layer 2-3: %d\n", event.getNumberOfSegmentsByLayerBarrel(1)); + printf("# of Segments produced layer 3-4: %d\n", event.getNumberOfSegmentsByLayerBarrel(2)); + printf("# of Segments produced layer 4-5: %d\n", event.getNumberOfSegmentsByLayerBarrel(3)); + printf("# of Segments produced layer 5-6: %d\n", event.getNumberOfSegmentsByLayerBarrel(4)); + printf("# of Segments produced endcap layer 1: %d\n", event.getNumberOfSegmentsByLayerEndcap(0)); + printf("# of Segments produced endcap layer 2: %d\n", event.getNumberOfSegmentsByLayerEndcap(1)); + printf("# of Segments produced endcap layer 3: %d\n", event.getNumberOfSegmentsByLayerEndcap(2)); + printf("# of Segments produced endcap layer 4: %d\n", event.getNumberOfSegmentsByLayerEndcap(3)); + printf("# of Segments produced endcap layer 5: %d\n", event.getNumberOfSegmentsByLayerEndcap(4)); + } + + event.createTriplets(); + if (verbose) { + printf("# of T3s produced: %d\n", event.getNumberOfTriplets()); + printf("# of T3s produced layer 1-2-3: %d\n", event.getNumberOfTripletsByLayerBarrel(0)); + printf("# of T3s produced layer 2-3-4: %d\n", event.getNumberOfTripletsByLayerBarrel(1)); + printf("# of T3s produced layer 3-4-5: %d\n", event.getNumberOfTripletsByLayerBarrel(2)); 
+ printf("# of T3s produced layer 4-5-6: %d\n", event.getNumberOfTripletsByLayerBarrel(3)); + printf("# of T3s produced endcap layer 1-2-3: %d\n", event.getNumberOfTripletsByLayerEndcap(0)); + printf("# of T3s produced endcap layer 2-3-4: %d\n", event.getNumberOfTripletsByLayerEndcap(1)); + printf("# of T3s produced endcap layer 3-4-5: %d\n", event.getNumberOfTripletsByLayerEndcap(2)); + printf("# of T3s produced endcap layer 1: %d\n", event.getNumberOfTripletsByLayerEndcap(0)); + printf("# of T3s produced endcap layer 2: %d\n", event.getNumberOfTripletsByLayerEndcap(1)); + printf("# of T3s produced endcap layer 3: %d\n", event.getNumberOfTripletsByLayerEndcap(2)); + printf("# of T3s produced endcap layer 4: %d\n", event.getNumberOfTripletsByLayerEndcap(3)); + printf("# of T3s produced endcap layer 5: %d\n", event.getNumberOfTripletsByLayerEndcap(4)); + } + + event.createQuintuplets(); + if (verbose) { + printf("# of Quintuplets produced: %d\n", event.getNumberOfQuintuplets()); + printf("# of Quintuplets produced layer 1-2-3-4-5-6: %d\n", event.getNumberOfQuintupletsByLayerBarrel(0)); + printf("# of Quintuplets produced layer 2: %d\n", event.getNumberOfQuintupletsByLayerBarrel(1)); + printf("# of Quintuplets produced layer 3: %d\n", event.getNumberOfQuintupletsByLayerBarrel(2)); + printf("# of Quintuplets produced layer 4: %d\n", event.getNumberOfQuintupletsByLayerBarrel(3)); + printf("# of Quintuplets produced layer 5: %d\n", event.getNumberOfQuintupletsByLayerBarrel(4)); + printf("# of Quintuplets produced layer 6: %d\n", event.getNumberOfQuintupletsByLayerBarrel(5)); + printf("# of Quintuplets produced endcap layer 1: %d\n", event.getNumberOfQuintupletsByLayerEndcap(0)); + printf("# of Quintuplets produced endcap layer 2: %d\n", event.getNumberOfQuintupletsByLayerEndcap(1)); + printf("# of Quintuplets produced endcap layer 3: %d\n", event.getNumberOfQuintupletsByLayerEndcap(2)); + printf("# of Quintuplets produced endcap layer 4: %d\n", 
event.getNumberOfQuintupletsByLayerEndcap(3)); + printf("# of Quintuplets produced endcap layer 5: %d\n", event.getNumberOfQuintupletsByLayerEndcap(4)); + } + + event.pixelLineSegmentCleaning(); + + event.createPixelQuintuplets(); + if (verbose) + printf("# of Pixel Quintuplets produced: %d\n", event.getNumberOfPixelQuintuplets()); + + event.createPixelTriplets(); + if (verbose) + printf("# of Pixel T3s produced: %d\n", event.getNumberOfPixelTriplets()); + + event.createTrackCandidates(); + if (verbose) { + printf("# of TrackCandidates produced: %d\n", event.getNumberOfTrackCandidates()); + printf(" # of Pixel TrackCandidates produced: %d\n", event.getNumberOfPixelTrackCandidates()); + printf(" # of pT5 TrackCandidates produced: %d\n", event.getNumberOfPT5TrackCandidates()); + printf(" # of pT3 TrackCandidates produced: %d\n", event.getNumberOfPT3TrackCandidates()); + printf(" # of pLS TrackCandidates produced: %d\n", event.getNumberOfPLSTrackCandidates()); + printf(" # of T5 TrackCandidates produced: %d\n", event.getNumberOfT5TrackCandidates()); + } + + getOutput(event); + + event.resetEvent(); +} + +namespace { + XYZVector calculateR3FromPCA(const XYZVector& p3, const float dxy, const float dz) { + const float pt = p3.rho(); + const float p = p3.r(); + const float vz = dz * pt * pt / p / p; + + const float vx = -dxy * p3.y() / pt - p3.x() / p * p3.z() / p * dz; + const float vy = dxy * p3.x() / pt - p3.y() / p * p3.z() / p * dz; + return {vx, vy, vz}; + } +} // namespace + +void SDL::LST::prepareInput(const std::vector see_px, + const std::vector see_py, + const std::vector see_pz, + const std::vector see_dxy, + const std::vector see_dz, + const std::vector see_ptErr, + const std::vector see_etaErr, + const std::vector see_stateTrajGlbX, + const std::vector see_stateTrajGlbY, + const std::vector see_stateTrajGlbZ, + const std::vector see_stateTrajGlbPx, + const std::vector see_stateTrajGlbPy, + const std::vector see_stateTrajGlbPz, + const std::vector see_q, + 
const std::vector> see_hitIdx, + const std::vector ph2_detId, + const std::vector ph2_x, + const std::vector ph2_y, + const std::vector ph2_z) { + unsigned int count = 0; + auto n_see = see_stateTrajGlbPx.size(); + std::vector px_vec; + px_vec.reserve(n_see); + std::vector py_vec; + py_vec.reserve(n_see); + std::vector pz_vec; + pz_vec.reserve(n_see); + std::vector hitIndices_vec0; + hitIndices_vec0.reserve(n_see); + std::vector hitIndices_vec1; + hitIndices_vec1.reserve(n_see); + std::vector hitIndices_vec2; + hitIndices_vec2.reserve(n_see); + std::vector hitIndices_vec3; + hitIndices_vec3.reserve(n_see); + std::vector ptIn_vec; + ptIn_vec.reserve(n_see); + std::vector ptErr_vec; + ptErr_vec.reserve(n_see); + std::vector etaErr_vec; + etaErr_vec.reserve(n_see); + std::vector eta_vec; + eta_vec.reserve(n_see); + std::vector phi_vec; + phi_vec.reserve(n_see); + std::vector charge_vec; + charge_vec.reserve(n_see); + std::vector seedIdx_vec; + seedIdx_vec.reserve(n_see); + std::vector deltaPhi_vec; + deltaPhi_vec.reserve(n_see); + std::vector trkX = ph2_x; + std::vector trkY = ph2_y; + std::vector trkZ = ph2_z; + std::vector hitId = ph2_detId; + std::vector hitIdxs(ph2_detId.size()); + + std::vector superbin_vec; + std::vector pixelType_vec; + std::vector isQuad_vec; + std::iota(hitIdxs.begin(), hitIdxs.end(), 0); + const int hit_size = trkX.size(); + + for (size_t iSeed = 0; iSeed < n_see; iSeed++) { + XYZVector p3LH(see_stateTrajGlbPx[iSeed], see_stateTrajGlbPy[iSeed], see_stateTrajGlbPz[iSeed]); + XYZVector p3LH_helper(see_stateTrajGlbPx[iSeed], see_stateTrajGlbPy[iSeed], see_stateTrajGlbPz[iSeed]); + float ptIn = p3LH.rho(); + float eta = p3LH.eta(); + float ptErr = see_ptErr[iSeed]; + + if ((ptIn > 0.8 - 2 * ptErr)) { + XYZVector r3LH(see_stateTrajGlbX[iSeed], see_stateTrajGlbY[iSeed], see_stateTrajGlbZ[iSeed]); + XYZVector p3PCA(see_px[iSeed], see_py[iSeed], see_pz[iSeed]); + XYZVector r3PCA(calculateR3FromPCA(p3PCA, see_dxy[iSeed], see_dz[iSeed])); + + float 
pixelSegmentDeltaPhiChange = (r3LH - p3LH_helper).phi(); //FIXME: this looks like a bug + float etaErr = see_etaErr[iSeed]; + float px = p3LH.x(); + float py = p3LH.y(); + float pz = p3LH.z(); + + int charge = see_q[iSeed]; + int pixtype = -1; + + if (ptIn >= 2.0) + pixtype = 0; + else if (ptIn >= (0.8 - 2 * ptErr) and ptIn < 2.0) { + if (pixelSegmentDeltaPhiChange >= 0) + pixtype = 1; + else + pixtype = 2; + } else + continue; + + unsigned int hitIdx0 = hit_size + count; + count++; + unsigned int hitIdx1 = hit_size + count; + count++; + unsigned int hitIdx2 = hit_size + count; + count++; + unsigned int hitIdx3; + if (see_hitIdx[iSeed].size() <= 3) + hitIdx3 = hitIdx2; + else { + hitIdx3 = hit_size + count; + count++; + } + + trkX.push_back(r3PCA.x()); + trkY.push_back(r3PCA.y()); + trkZ.push_back(r3PCA.z()); + trkX.push_back(p3PCA.rho()); + float p3PCA_Eta = p3PCA.eta(); + trkY.push_back(p3PCA_Eta); + float p3PCA_Phi = p3PCA.phi(); + trkZ.push_back(p3PCA_Phi); + trkX.push_back(r3LH.x()); + trkY.push_back(r3LH.y()); + trkZ.push_back(r3LH.z()); + hitId.push_back(1); + hitId.push_back(1); + hitId.push_back(1); + if (see_hitIdx[iSeed].size() > 3) { + trkX.push_back(r3LH.x()); + trkY.push_back(see_dxy[iSeed]); + trkZ.push_back(see_dz[iSeed]); + hitId.push_back(1); + } + px_vec.push_back(px); + py_vec.push_back(py); + pz_vec.push_back(pz); + + hitIndices_vec0.push_back(hitIdx0); + hitIndices_vec1.push_back(hitIdx1); + hitIndices_vec2.push_back(hitIdx2); + hitIndices_vec3.push_back(hitIdx3); + ptIn_vec.push_back(ptIn); + ptErr_vec.push_back(ptErr); + etaErr_vec.push_back(etaErr); + eta_vec.push_back(eta); + float phi = p3LH.phi(); + phi_vec.push_back(phi); + charge_vec.push_back(charge); + seedIdx_vec.push_back(iSeed); + deltaPhi_vec.push_back(pixelSegmentDeltaPhiChange); + + hitIdxs.push_back(see_hitIdx[iSeed][0]); + hitIdxs.push_back(see_hitIdx[iSeed][1]); + hitIdxs.push_back(see_hitIdx[iSeed][2]); + char isQuad = false; + if (see_hitIdx[iSeed].size() > 3) { + isQuad = 
true; + hitIdxs.push_back(see_hitIdx[iSeed][3]); + } + float neta = 25.; + float nphi = 72.; + float nz = 25.; + int etabin = (p3PCA_Eta + 2.6) / ((2 * 2.6) / neta); + int phibin = (p3PCA_Phi + 3.14159265358979323846) / ((2. * 3.14159265358979323846) / nphi); + int dzbin = (see_dz[iSeed] + 30) / (2 * 30 / nz); + int isuperbin = (nz * nphi) * etabin + (nz)*phibin + dzbin; + superbin_vec.push_back(isuperbin); + pixelType_vec.push_back(pixtype); + isQuad_vec.push_back(isQuad); + } + } + + in_trkX_ = trkX; + in_trkY_ = trkY; + in_trkZ_ = trkZ; + in_hitId_ = hitId; + in_hitIdxs_ = hitIdxs; + in_hitIndices_vec0_ = hitIndices_vec0; + in_hitIndices_vec1_ = hitIndices_vec1; + in_hitIndices_vec2_ = hitIndices_vec2; + in_hitIndices_vec3_ = hitIndices_vec3; + in_deltaPhi_vec_ = deltaPhi_vec; + in_ptIn_vec_ = ptIn_vec; + in_ptErr_vec_ = ptErr_vec; + in_px_vec_ = px_vec; + in_py_vec_ = py_vec; + in_pz_vec_ = pz_vec; + in_eta_vec_ = eta_vec; + in_etaErr_vec_ = etaErr_vec; + in_phi_vec_ = phi_vec; + in_charge_vec_ = charge_vec; + in_seedIdx_vec_ = seedIdx_vec; + in_superbin_vec_ = superbin_vec; + in_pixelType_vec_ = pixelType_vec; + in_isQuad_vec_ = isQuad_vec; +} + +void SDL::LST::getOutput(SDL::Event& event) { + std::vector> tc_hitIdxs; + std::vector tc_len; + std::vector tc_seedIdx; + std::vector tc_trackCandidateType; + + SDL::hitsBuffer& hitsInGPU = (*event.getHitsInCMSSW()); + SDL::trackCandidatesBuffer& trackCandidatesInGPU = (*event.getTrackCandidatesInCMSSW()); + + unsigned int nTrackCandidates = *trackCandidatesInGPU.nTrackCandidates; + for (unsigned int idx = 0; idx < nTrackCandidates; idx++) { + short trackCandidateType = trackCandidatesInGPU.trackCandidateType[idx]; + std::vector hit_idx = + getHitIdxs(trackCandidateType, idx, trackCandidatesInGPU.hitIndices, hitsInGPU.idxs); + + tc_hitIdxs.push_back(hit_idx); + tc_len.push_back(hit_idx.size()); + tc_seedIdx.push_back(trackCandidatesInGPU.pixelSeedIndex[idx]); + tc_trackCandidateType.push_back(trackCandidateType); + } 
+ + out_tc_hitIdxs_ = tc_hitIdxs; + out_tc_len_ = tc_len; + out_tc_seedIdx_ = tc_seedIdx; + out_tc_trackCandidateType_ = tc_trackCandidateType; +} + +std::vector SDL::LST::getHitIdxs(const short trackCandidateType, + const unsigned int TCIdx, + const unsigned int* TCHitIndices, + const unsigned int* hitIndices) { + std::vector hits; + + unsigned int maxNHits = 0; + if (trackCandidateType == 7) + maxNHits = 14; // pT5 + else if (trackCandidateType == 5) + maxNHits = 10; // pT3 + else if (trackCandidateType == 4) + maxNHits = 10; // T5 + else if (trackCandidateType == 8) + maxNHits = 4; // pLS + + for (unsigned int i = 0; i < maxNHits; i++) { + unsigned int hitIdxInGPU = TCHitIndices[14 * TCIdx + i]; + unsigned int hitIdx = + (trackCandidateType == 8) + ? hitIdxInGPU + : hitIndices[hitIdxInGPU]; // Hit indices are stored differently in the standalone for pLS. + + // For p objects, the 3rd and 4th hit maybe the same, + // due to the way pLS hits are stored in the standalone. + // This is because pixel seeds can be either triplets or quadruplets. + if (trackCandidateType != 4 && hits.size() == 3 && hits.back() == hitIdx) // Remove duplicate 4th hits. 
+ continue; + + hits.push_back(hitIdx); + } + + return hits; +} diff --git a/RecoTracker/LSTCore/src/alpaka/LSTESData.dev.cc b/RecoTracker/LSTCore/src/alpaka/LSTESData.dev.cc new file mode 100644 index 0000000000000..3893355837e7c --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/LSTESData.dev.cc @@ -0,0 +1,114 @@ +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/LSTESData.h" +#else +#include "LSTESData.h" +#endif + +#include "EndcapGeometry.h" +#include "ModuleConnectionMap.h" +#include "TiltedGeometry.h" +#include "PixelMap.h" +#include "ModuleMethods.h" + +namespace { + std::string trackLooperDir() { + const char* path_lst_base = std::getenv("LST_BASE"); + const char* path_tracklooperdir = std::getenv("TRACKLOOPERDIR"); + std::string path_str; + if (path_lst_base != nullptr) { + path_str = path_lst_base; + } else if (path_tracklooperdir != nullptr) { + path_str = path_tracklooperdir; + path_str += "/../"; + } else { + // FIXME: temporary solution, will need to pass a value from FileInPath or CMSSW search path + // in the `LSTProducer` or a related ES producer + path_str = std::getenv("CMSSW_BASE"); + path_str += "/src/RecoTracker/LSTCore"; + } + return path_str; + } + + std::string get_absolute_path_after_check_file_exists(const std::string name) { + std::filesystem::path fullpath = std::filesystem::absolute(name.c_str()); + if (not std::filesystem::exists(fullpath)) { + std::cout << "ERROR: Could not find the file = " << fullpath << std::endl; + exit(2); + } + return fullpath.string(); + } + + void loadMapsHost(SDL::MapPLStoLayer& pLStoLayer, + std::shared_ptr> endcapGeometry, + std::shared_ptr> tiltedGeometry, + std::shared_ptr> moduleConnectionMap) { + // Module orientation information (DrDz or phi angles) + auto endcap_geom = + get_absolute_path_after_check_file_exists(trackLooperDir() + "/data/OT800_IT615_pt0.8/endcap_orientation.bin"); + auto tilted_geom = get_absolute_path_after_check_file_exists( + trackLooperDir() + 
"/data/OT800_IT615_pt0.8/tilted_barrel_orientation.bin"); + // Module connection map (for line segment building) + auto mappath = get_absolute_path_after_check_file_exists( + trackLooperDir() + "/data/OT800_IT615_pt0.8/module_connection_tracing_merged.bin"); + + endcapGeometry->load(endcap_geom); + tiltedGeometry->load(tilted_geom); + moduleConnectionMap->load(mappath); + + auto pLSMapDir = trackLooperDir() + "/data/OT800_IT615_pt0.8/pixelmap/pLS_map"; + const std::array connects{ + {"_layer1_subdet5", "_layer2_subdet5", "_layer1_subdet4", "_layer2_subdet4"}}; + std::string path; + + static_assert(connects.size() == std::tuple_size>{}); + for (unsigned int i = 0; i < connects.size(); i++) { + auto connectData = connects[i].data(); + + path = pLSMapDir + connectData + ".bin"; + pLStoLayer[0][i] = SDL::ModuleConnectionMap(get_absolute_path_after_check_file_exists(path)); + + path = pLSMapDir + "_pos" + connectData + ".bin"; + pLStoLayer[1][i] = SDL::ModuleConnectionMap(get_absolute_path_after_check_file_exists(path)); + + path = pLSMapDir + "_neg" + connectData + ".bin"; + pLStoLayer[2][i] = SDL::ModuleConnectionMap(get_absolute_path_after_check_file_exists(path)); + } + } +} // namespace + +std::unique_ptr> SDL::loadAndFillESHost() { + auto pLStoLayer = std::make_shared(); + auto endcapGeometry = std::make_shared>(); + auto tiltedGeometry = std::make_shared>(); + auto moduleConnectionMap = std::make_shared>(); + ::loadMapsHost(*pLStoLayer, endcapGeometry, tiltedGeometry, moduleConnectionMap); + return std::make_unique>(pLStoLayer, endcapGeometry, tiltedGeometry, moduleConnectionMap); +} + +std::unique_ptr> SDL::loadAndFillESDevice(SDL::QueueAcc& queue, + const LSTESHostData* hostData) { + SDL::Dev const& devAccIn = alpaka::getDev(queue); + uint16_t nModules; + uint16_t nLowerModules; + unsigned int nPixels; + std::shared_ptr> modulesBuffers = nullptr; + auto endcapGeometry = std::make_shared>(devAccIn, queue, *hostData->endcapGeometry); + auto pixelMapping = 
std::make_shared(); + auto moduleConnectionMap = hostData->moduleConnectionMap; + + auto path = + get_absolute_path_after_check_file_exists(trackLooperDir() + "/data/OT800_IT615_pt0.8/sensor_centroids.bin"); + SDL::loadModulesFromFile(queue, + hostData->mapPLStoLayer.get(), + path.c_str(), + nModules, + nLowerModules, + nPixels, + modulesBuffers, + pixelMapping.get(), + endcapGeometry.get(), + hostData->tiltedGeometry.get(), + moduleConnectionMap.get()); + return std::make_unique>( + nModules, nLowerModules, nPixels, modulesBuffers, endcapGeometry, pixelMapping); +} diff --git a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h new file mode 100644 index 0000000000000..aa63d51345a7f --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h @@ -0,0 +1,1106 @@ +#ifndef MiniDoublet_cuh +#define MiniDoublet_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "EndcapGeometry.h" +#include "Hit.h" + +namespace SDL { + struct miniDoublets { + unsigned int* nMemoryLocations; + + unsigned int* anchorHitIndices; + unsigned int* outerHitIndices; + uint16_t* moduleIndices; + unsigned int* nMDs; //counter per module + unsigned int* totOccupancyMDs; //counter per module + float* dphichanges; + + float* dzs; //will store drt if the module is endcap + float* dphis; + + float* shiftedXs; + float* shiftedYs; + float* shiftedZs; + float* noShiftedDzs; //if shifted module + float* noShiftedDphis; //if shifted module + float* noShiftedDphiChanges; //if shifted module + + float* anchorX; + float* anchorY; + float* anchorZ; + float* anchorRt; + float* anchorPhi; + float* anchorEta; + float* anchorHighEdgeX; + float* anchorHighEdgeY; + float* anchorLowEdgeX; + float* anchorLowEdgeY; + float* anchorLowEdgePhi; + float* anchorHighEdgePhi; + + float* outerX; + float* 
outerY; + float* outerZ; + float* outerRt; + float* outerPhi; + float* outerEta; + float* outerHighEdgeX; + float* outerHighEdgeY; + float* outerLowEdgeX; + float* outerLowEdgeY; + + template + void setData(TBuf& mdsbuf) { + nMemoryLocations = alpaka::getPtrNative(mdsbuf.nMemoryLocations_buf); + anchorHitIndices = alpaka::getPtrNative(mdsbuf.anchorHitIndices_buf); + outerHitIndices = alpaka::getPtrNative(mdsbuf.outerHitIndices_buf); + moduleIndices = alpaka::getPtrNative(mdsbuf.moduleIndices_buf); + nMDs = alpaka::getPtrNative(mdsbuf.nMDs_buf); + totOccupancyMDs = alpaka::getPtrNative(mdsbuf.totOccupancyMDs_buf); + dphichanges = alpaka::getPtrNative(mdsbuf.dphichanges_buf); + dzs = alpaka::getPtrNative(mdsbuf.dzs_buf); + dphis = alpaka::getPtrNative(mdsbuf.dphis_buf); + shiftedXs = alpaka::getPtrNative(mdsbuf.shiftedXs_buf); + shiftedYs = alpaka::getPtrNative(mdsbuf.shiftedYs_buf); + shiftedZs = alpaka::getPtrNative(mdsbuf.shiftedZs_buf); + noShiftedDzs = alpaka::getPtrNative(mdsbuf.noShiftedDzs_buf); + noShiftedDphis = alpaka::getPtrNative(mdsbuf.noShiftedDphis_buf); + noShiftedDphiChanges = alpaka::getPtrNative(mdsbuf.noShiftedDphiChanges_buf); + anchorX = alpaka::getPtrNative(mdsbuf.anchorX_buf); + anchorY = alpaka::getPtrNative(mdsbuf.anchorY_buf); + anchorZ = alpaka::getPtrNative(mdsbuf.anchorZ_buf); + anchorRt = alpaka::getPtrNative(mdsbuf.anchorRt_buf); + anchorPhi = alpaka::getPtrNative(mdsbuf.anchorPhi_buf); + anchorEta = alpaka::getPtrNative(mdsbuf.anchorEta_buf); + anchorHighEdgeX = alpaka::getPtrNative(mdsbuf.anchorHighEdgeX_buf); + anchorHighEdgeY = alpaka::getPtrNative(mdsbuf.anchorHighEdgeY_buf); + anchorLowEdgeX = alpaka::getPtrNative(mdsbuf.anchorLowEdgeX_buf); + anchorLowEdgeY = alpaka::getPtrNative(mdsbuf.anchorLowEdgeY_buf); + outerX = alpaka::getPtrNative(mdsbuf.outerX_buf); + outerY = alpaka::getPtrNative(mdsbuf.outerY_buf); + outerZ = alpaka::getPtrNative(mdsbuf.outerZ_buf); + outerRt = alpaka::getPtrNative(mdsbuf.outerRt_buf); + outerPhi = 
alpaka::getPtrNative(mdsbuf.outerPhi_buf); + outerEta = alpaka::getPtrNative(mdsbuf.outerEta_buf); + outerHighEdgeX = alpaka::getPtrNative(mdsbuf.outerHighEdgeX_buf); + outerHighEdgeY = alpaka::getPtrNative(mdsbuf.outerHighEdgeY_buf); + outerLowEdgeX = alpaka::getPtrNative(mdsbuf.outerLowEdgeX_buf); + outerLowEdgeY = alpaka::getPtrNative(mdsbuf.outerLowEdgeY_buf); + anchorLowEdgePhi = alpaka::getPtrNative(mdsbuf.anchorLowEdgePhi_buf); + anchorHighEdgePhi = alpaka::getPtrNative(mdsbuf.anchorHighEdgePhi_buf); + } + }; + + template + struct miniDoubletsBuffer : miniDoublets { + Buf nMemoryLocations_buf; + + Buf anchorHitIndices_buf; + Buf outerHitIndices_buf; + Buf moduleIndices_buf; + Buf nMDs_buf; + Buf totOccupancyMDs_buf; + Buf dphichanges_buf; + + Buf dzs_buf; + Buf dphis_buf; + + Buf shiftedXs_buf; + Buf shiftedYs_buf; + Buf shiftedZs_buf; + Buf noShiftedDzs_buf; + Buf noShiftedDphis_buf; + Buf noShiftedDphiChanges_buf; + + Buf anchorX_buf; + Buf anchorY_buf; + Buf anchorZ_buf; + Buf anchorRt_buf; + Buf anchorPhi_buf; + Buf anchorEta_buf; + Buf anchorHighEdgeX_buf; + Buf anchorHighEdgeY_buf; + Buf anchorLowEdgeX_buf; + Buf anchorLowEdgeY_buf; + Buf anchorLowEdgePhi_buf; + Buf anchorHighEdgePhi_buf; + + Buf outerX_buf; + Buf outerY_buf; + Buf outerZ_buf; + Buf outerRt_buf; + Buf outerPhi_buf; + Buf outerEta_buf; + Buf outerHighEdgeX_buf; + Buf outerHighEdgeY_buf; + Buf outerLowEdgeX_buf; + Buf outerLowEdgeY_buf; + + template + miniDoubletsBuffer(unsigned int nMemoryLoc, uint16_t nLowerModules, TDevAcc const& devAccIn, TQueue& queue) + : nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + anchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + moduleIndices_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + nMDs_buf(allocBufWrapper(devAccIn, nLowerModules + 1, queue)), + totOccupancyMDs_buf(allocBufWrapper(devAccIn, nLowerModules + 1, queue)), + 
dphichanges_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + dzs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + dphis_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedXs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedYs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + shiftedZs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDzs_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDphis_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + noShiftedDphiChanges_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorZ_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorRt_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorEta_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorLowEdgePhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + anchorHighEdgePhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerZ_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerRt_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerPhi_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerEta_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHighEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerHighEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerLowEdgeX_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)), + outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)) { + 
alpaka::memset(queue, nMDs_buf, 0u); + alpaka::memset(queue, totOccupancyMDs_buf, 0u); + alpaka::wait(queue); + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(TAcc const& acc, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::hits& hitsInGPU, + struct SDL::modules& modulesInGPU, + unsigned int lowerHitIdx, + unsigned int upperHitIdx, + uint16_t& lowerModuleIdx, + float dz, + float dPhi, + float dPhiChange, + float shiftedX, + float shiftedY, + float shiftedZ, + float noShiftedDz, + float noShiftedDphi, + float noShiftedDPhiChange, + unsigned int idx) { + //the index into which this MD needs to be written will be computed in the kernel + //nMDs variable will be incremented in the kernel, no need to worry about that here + + mdsInGPU.moduleIndices[idx] = lowerModuleIdx; + unsigned int anchorHitIndex, outerHitIndex; + if (modulesInGPU.moduleType[lowerModuleIdx] == PS and modulesInGPU.moduleLayerType[lowerModuleIdx] == Strip) { + mdsInGPU.anchorHitIndices[idx] = upperHitIdx; + mdsInGPU.outerHitIndices[idx] = lowerHitIdx; + + anchorHitIndex = upperHitIdx; + outerHitIndex = lowerHitIdx; + } else { + mdsInGPU.anchorHitIndices[idx] = lowerHitIdx; + mdsInGPU.outerHitIndices[idx] = upperHitIdx; + + anchorHitIndex = lowerHitIdx; + outerHitIndex = upperHitIdx; + } + + mdsInGPU.dphichanges[idx] = dPhiChange; + + mdsInGPU.dphis[idx] = dPhi; + mdsInGPU.dzs[idx] = dz; + mdsInGPU.shiftedXs[idx] = shiftedX; + mdsInGPU.shiftedYs[idx] = shiftedY; + mdsInGPU.shiftedZs[idx] = shiftedZ; + + mdsInGPU.noShiftedDzs[idx] = noShiftedDz; + mdsInGPU.noShiftedDphis[idx] = noShiftedDphi; + mdsInGPU.noShiftedDphiChanges[idx] = noShiftedDPhiChange; + + mdsInGPU.anchorX[idx] = hitsInGPU.xs[anchorHitIndex]; + mdsInGPU.anchorY[idx] = hitsInGPU.ys[anchorHitIndex]; + mdsInGPU.anchorZ[idx] = hitsInGPU.zs[anchorHitIndex]; + mdsInGPU.anchorRt[idx] = hitsInGPU.rts[anchorHitIndex]; + mdsInGPU.anchorPhi[idx] = hitsInGPU.phis[anchorHitIndex]; + mdsInGPU.anchorEta[idx] = 
hitsInGPU.etas[anchorHitIndex]; + mdsInGPU.anchorHighEdgeX[idx] = hitsInGPU.highEdgeXs[anchorHitIndex]; + mdsInGPU.anchorHighEdgeY[idx] = hitsInGPU.highEdgeYs[anchorHitIndex]; + mdsInGPU.anchorLowEdgeX[idx] = hitsInGPU.lowEdgeXs[anchorHitIndex]; + mdsInGPU.anchorLowEdgeY[idx] = hitsInGPU.lowEdgeYs[anchorHitIndex]; + mdsInGPU.anchorHighEdgePhi[idx] = + alpaka::math::atan2(acc, mdsInGPU.anchorHighEdgeY[idx], mdsInGPU.anchorHighEdgeX[idx]); + mdsInGPU.anchorLowEdgePhi[idx] = + alpaka::math::atan2(acc, mdsInGPU.anchorLowEdgeY[idx], mdsInGPU.anchorLowEdgeX[idx]); + + mdsInGPU.outerX[idx] = hitsInGPU.xs[outerHitIndex]; + mdsInGPU.outerY[idx] = hitsInGPU.ys[outerHitIndex]; + mdsInGPU.outerZ[idx] = hitsInGPU.zs[outerHitIndex]; + mdsInGPU.outerRt[idx] = hitsInGPU.rts[outerHitIndex]; + mdsInGPU.outerPhi[idx] = hitsInGPU.phis[outerHitIndex]; + mdsInGPU.outerEta[idx] = hitsInGPU.etas[outerHitIndex]; + mdsInGPU.outerHighEdgeX[idx] = hitsInGPU.highEdgeXs[outerHitIndex]; + mdsInGPU.outerHighEdgeY[idx] = hitsInGPU.highEdgeYs[outerHitIndex]; + mdsInGPU.outerLowEdgeX[idx] = hitsInGPU.lowEdgeXs[outerHitIndex]; + mdsInGPU.outerLowEdgeY[idx] = hitsInGPU.lowEdgeYs[outerHitIndex]; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules(struct SDL::modules& modulesInGPU, + uint16_t& moduleIndex) { + // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing + // This is the same as what was previously considered as"isNormalTiltedModules" + // See Figure 9.1 of https://cds.cern.ch/record/2272264/files/CMS-TDR-014.pdf + short subdet = modulesInGPU.subdets[moduleIndex]; + short layer = modulesInGPU.layers[moduleIndex]; + short side = modulesInGPU.sides[moduleIndex]; + short rod = modulesInGPU.rods[moduleIndex]; + + if ((subdet == Barrel and side != Center and layer == 3) or + (subdet == Barrel and side == NegZ and layer == 2 and rod > 5) or + (subdet == Barrel and side == PosZ and layer == 2 and rod < 8) or + (subdet == Barrel and side == NegZ and 
layer == 1 and rod > 9) or + (subdet == Barrel and side == PosZ and layer == 1 and rod < 4)) + return true; + else + return false; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize(struct SDL::modules& modulesInGPU, uint16_t& moduleIndex) { + float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; + float miniDeltaFlat[6] = {0.26f, 0.16f, 0.16f, 0.18f, 0.18f, 0.18f}; + float miniDeltaLooseTilted[3] = {0.4f, 0.4f, 0.4f}; + float miniDeltaEndcap[5][15]; + + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 15; j++) { + if (i == 0 || i == 1) { + if (j < 10) { + miniDeltaEndcap[i][j] = 0.4f; + } else { + miniDeltaEndcap[i][j] = 0.18f; + } + } else if (i == 2 || i == 3) { + if (j < 8) { + miniDeltaEndcap[i][j] = 0.4f; + } else { + miniDeltaEndcap[i][j] = 0.18f; + } + } else { + if (j < 9) { + miniDeltaEndcap[i][j] = 0.4f; + } else { + miniDeltaEndcap[i][j] = 0.18f; + } + } + } + } + + unsigned int iL = modulesInGPU.layers[moduleIndex] - 1; + unsigned int iR = modulesInGPU.rings[moduleIndex] - 1; + short subdet = modulesInGPU.subdets[moduleIndex]; + short side = modulesInGPU.sides[moduleIndex]; + + float moduleSeparation = 0; + + if (subdet == Barrel and side == Center) { + moduleSeparation = miniDeltaFlat[iL]; + } else if (isTighterTiltedModules(modulesInGPU, moduleIndex)) { + moduleSeparation = miniDeltaTilted[iL]; + } else if (subdet == Endcap) { + moduleSeparation = miniDeltaEndcap[iL][iR]; + } else //Loose tilted modules + { + moduleSeparation = miniDeltaLooseTilted[iL]; + } + + return moduleSeparation; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float dPhiThreshold(TAcc const& acc, + float rt, + struct SDL::modules& modulesInGPU, + uint16_t& moduleIndex, + float dPhi = 0, + float dz = 0) { + // ================================================================= + // Various constants + // ================================================================= + //mean of the horizontal layer position in y; treat this as R below + + // 
================================================================= + // Computing some components that make up the cut threshold + // ================================================================= + + unsigned int iL = modulesInGPU.layers[moduleIndex] - 1; + const float miniSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rt * k2Rinv1GeVf / ptCut, sinAlphaMax)); + const float rLayNominal = + ((modulesInGPU.subdets[moduleIndex] == Barrel) ? miniRminMeanBarrel[iL] : miniRminMeanEndcap[iL]); + const float miniPVoff = 0.1f / rLayNominal; + const float miniMuls = ((modulesInGPU.subdets[moduleIndex] == Barrel) ? miniMulsPtScaleBarrel[iL] * 3.f / ptCut + : miniMulsPtScaleEndcap[iL] * 3.f / ptCut); + const bool isTilted = modulesInGPU.subdets[moduleIndex] == Barrel and modulesInGPU.sides[moduleIndex] != Center; + //the lower module is sent in irrespective of its layer type. We need to fetch the drdz properly + + float drdz; + if (isTilted) { + if (modulesInGPU.moduleType[moduleIndex] == PS and modulesInGPU.moduleLayerType[moduleIndex] == Strip) { + drdz = modulesInGPU.drdzs[moduleIndex]; + } else { + drdz = modulesInGPU.drdzs[modulesInGPU.partnerModuleIndices[moduleIndex]]; + } + } else { + drdz = 0; + } + const float miniTilt = ((isTilted) ? 
0.5f * pixelPSZpitch * drdz / alpaka::math::sqrt(acc, 1.f + drdz * drdz) / + moduleGapSize(modulesInGPU, moduleIndex) + : 0); + + // Compute luminous region requirement for endcap + const float miniLum = alpaka::math::abs(acc, dPhi * deltaZLum / dz); // Balaji's new error + + // ================================================================= + // Return the threshold value + // ================================================================= + // Following condition is met if the module is central and flatly lying + if (modulesInGPU.subdets[moduleIndex] == Barrel and modulesInGPU.sides[moduleIndex] == Center) { + return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff); + } + // Following condition is met if the module is central and tilted + else if (modulesInGPU.subdets[moduleIndex] == Barrel and + modulesInGPU.sides[moduleIndex] != Center) //all types of tilted modules + { + return miniSlope + + alpaka::math::sqrt( + acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniTilt * miniTilt * miniSlope * miniSlope); + } + // If not barrel, it is Endcap + else { + return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniLum * miniLum); + } + }; + + template + ALPAKA_FN_INLINE ALPAKA_FN_ACC void shiftStripHits(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex, + uint16_t& upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float* shiftedCoords, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper) { + // This is the strip shift scheme that is explained in http://uaf-10.t2.ucsd.edu/~phchang/talks/PhilipChang20190607_SDL_Update.pdf (see backup slides) + // The main feature of this shifting is that the strip hits are shifted to be "aligned" in the line of sight from interaction point to the the pixel hit. 
+ // (since pixel hit is well defined in 3-d) + // The strip hit is shifted along the strip detector to be placed in a guessed position where we think they would have actually crossed + // The size of the radial direction shift due to module separation gap is computed in "radial" size, while the shift is done along the actual strip orientation + // This means that there may be very very subtle edge effects coming from whether the strip hit is center of the module or the at the edge of the module + // But this should be relatively minor effect + + // dependent variables for this if statement + // lowerModule + // lowerHit + // upperHit + // SDL::endcapGeometry + // SDL::tiltedGeometry + + // Some variables relevant to the function + float xp; // pixel x (pixel hit x) + float yp; // pixel y (pixel hit y) + float zp; // pixel y (pixel hit y) + float rtp; // pixel y (pixel hit y) + float xa; // "anchor" x (the anchor position on the strip module plane from pixel hit) + float ya; // "anchor" y (the anchor position on the strip module plane from pixel hit) + float xo; // old x (before the strip hit is moved up or down) + float yo; // old y (before the strip hit is moved up or down) + float xn; // new x (after the strip hit is moved up or down) + float yn; // new y (after the strip hit is moved up or down) + float abszn; // new z in absolute value + float zn; // new z with the sign (+/-) accounted + float angleA; // in r-z plane the theta of the pixel hit in polar coordinate is the angleA + float angleB; // this is the angle of tilted module in r-z plane ("drdz"), for endcap this is 90 degrees + bool isEndcap; // If endcap, drdz = infinity + float moduleSeparation; + float drprime; // The radial shift size in x-y plane projection + float drprime_x; // x-component of drprime + float drprime_y; // y-component of drprime + const float& slope = + modulesInGPU.dxdys[lowerModuleIndex]; // The slope of the possible strip hits for a given module in x-y plane + float 
absArctanSlope; + float angleM; // the angle M is the angle of rotation of the module in x-y plane if the possible strip hits are along the x-axis, then angleM = 0, and if the possible strip hits are along y-axis angleM = 90 degrees + float absdzprime; // The distance between the two points after shifting + const float& drdz_ = modulesInGPU.drdzs[lowerModuleIndex]; + // Assign hit pointers based on their hit type + if (modulesInGPU.moduleType[lowerModuleIndex] == PS) { + // TODO: This is somewhat of an mystery.... somewhat confused why this is the case + if (modulesInGPU.subdets[lowerModuleIndex] == Barrel ? modulesInGPU.moduleLayerType[lowerModuleIndex] != Pixel + : modulesInGPU.moduleLayerType[lowerModuleIndex] == Pixel) { + xo = xUpper; + yo = yUpper; + xp = xLower; + yp = yLower; + zp = zLower; + rtp = rtLower; + } else { + xo = xLower; + yo = yLower; + xp = xUpper; + yp = yUpper; + zp = zUpper; + rtp = rtUpper; + } + } else { + xo = xUpper; + yo = yUpper; + xp = xLower; + yp = yLower; + zp = zLower; + rtp = rtLower; + } + + // If it is endcap some of the math gets simplified (and also computers don't like infinities) + isEndcap = modulesInGPU.subdets[lowerModuleIndex] == Endcap; + + // NOTE: TODO: Keep in mind that the sin(atan) function can be simplified to something like x / sqrt(1 + x^2) and similar for cos + // I am not sure how slow sin, atan, cos, functions are in c++. If x / sqrt(1 + x^2) are faster change this later to reduce arithmetic computation time + angleA = alpaka::math::abs(acc, alpaka::math::atan(acc, rtp / zp)); + angleB = + ((isEndcap) + ? 
float(M_PI) / 2.f + : alpaka::math::atan( + acc, + drdz_)); // The tilt module on the positive z-axis has negative drdz slope in r-z plane and vice versa + + moduleSeparation = moduleGapSize(modulesInGPU, lowerModuleIndex); + + // Sign flips if the pixel is later layer + if (modulesInGPU.moduleType[lowerModuleIndex] == PS and modulesInGPU.moduleLayerType[lowerModuleIndex] != Pixel) { + moduleSeparation *= -1; + } + + drprime = (moduleSeparation / alpaka::math::sin(acc, angleA + angleB)) * alpaka::math::sin(acc, angleA); + + // Compute arctan of the slope and take care of the slope = infinity case + absArctanSlope = ((slope != SDL::SDL_INF) ? fabs(alpaka::math::atan(acc, slope)) : float(M_PI) / 2.f); + + // Depending on which quadrant the pixel hit lies, we define the angleM by shifting them slightly differently + if (xp > 0 and yp > 0) { + angleM = absArctanSlope; + } else if (xp > 0 and yp < 0) { + angleM = float(M_PI) - absArctanSlope; + } else if (xp < 0 and yp < 0) { + angleM = float(M_PI) + absArctanSlope; + } else // if (xp < 0 and yp > 0) + { + angleM = 2.f * float(M_PI) - absArctanSlope; + } + + // Then since the angleM sign is taken care of properly + drprime_x = drprime * alpaka::math::sin(acc, angleM); + drprime_y = drprime * alpaka::math::cos(acc, angleM); + + // The new anchor position is + xa = xp + drprime_x; + ya = yp + drprime_y; + + // Compute the new strip hit position (if the slope value is in special condition take care of the exceptions) + if (slope == + SDL::SDL_INF) // Designated for tilted module when the slope is exactly infinity (module lying along y-axis) + { + xn = xa; // New x point is simply where the anchor is + yn = yo; // No shift in y + } else if (slope == 0) { + xn = xo; // New x point is simply where the anchor is + yn = ya; // No shift in y + } else { + xn = (slope * xa + (1.f / slope) * xo - ya + yo) / (slope + (1.f / slope)); // new xn + yn = (xn - xa) * slope + ya; // new yn + } + + // Computing new Z position + absdzprime = 
alpaka::math::abs( + acc, + moduleSeparation / alpaka::math::sin(acc, angleA + angleB) * + alpaka::math::cos( + acc, + angleA)); // module separation sign is for shifting in radial direction for z-axis direction take care of the sign later + + // Depending on which one as closer to the interactin point compute the new z wrt to the pixel properly + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == Pixel) { + abszn = alpaka::math::abs(acc, zp) + absdzprime; + } else { + abszn = alpaka::math::abs(acc, zp) - absdzprime; + } + + zn = abszn * ((zp > 0) ? 1 : -1); // Apply the sign of the zn + + shiftedCoords[0] = xn; + shiftedCoords[1] = yn; + shiftedCoords[2] = zn; + }; + + template + ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgo(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex, + uint16_t& upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& dz, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDz, + float& noShiftedDphi, + float& noShiftedDphiChange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper) { + if (modulesInGPU.subdets[lowerModuleIndex] == SDL::Barrel) { + return runMiniDoubletDefaultAlgoBarrel(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + dz, + dPhi, + dPhiChange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDz, + noShiftedDphi, + noShiftedDphiChange, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + } else { + return runMiniDoubletDefaultAlgoEndcap(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + dz, + dPhi, + dPhiChange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDz, + noShiftedDphi, + noShiftedDphiChange, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + } + }; + + template + 
ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex, + uint16_t& upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& dz, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDz, + float& noShiftedDphi, + float& noShiftedDphiChange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper) { + bool pass = true; + dz = zLower - zUpper; + const float dzCut = modulesInGPU.moduleType[lowerModuleIndex] == SDL::PS ? 2.f : 10.f; + //const float sign = ((dz > 0) - (dz < 0)) * ((hitsInGPU.zs[lowerHitIndex] > 0) - (hitsInGPU.zs[lowerHitIndex] < 0)); + const float sign = ((dz > 0) - (dz < 0)) * ((zLower > 0) - (zLower < 0)); + const float invertedcrossercut = (alpaka::math::abs(acc, dz) > 2) * sign; + + pass = pass and ((alpaka::math::abs(acc, dz) < dzCut) && (invertedcrossercut <= 0)); + if (not pass) + return pass; + + float miniCut = 0; + + miniCut = modulesInGPU.moduleLayerType[lowerModuleIndex] == SDL::Pixel + ? 
dPhiThreshold(acc, rtLower, modulesInGPU, lowerModuleIndex) + : dPhiThreshold(acc, rtUpper, modulesInGPU, lowerModuleIndex); + + // Cut #2: dphi difference + // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3085 + float xn = 0.f, yn = 0.f; // , zn = 0; + float shiftedRt; + if (modulesInGPU.sides[lowerModuleIndex] != Center) // If barrel and not center it is tilted + { + // Shift the hits and calculate new xn, yn position + float shiftedCoords[3]; + shiftStripHits(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + shiftedCoords, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + xn = shiftedCoords[0]; + yn = shiftedCoords[1]; + + // Lower or the upper hit needs to be modified depending on which one was actually shifted + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == SDL::Pixel) { + shiftedX = xn; + shiftedY = yn; + shiftedZ = zUpper; + shiftedRt = alpaka::math::sqrt(acc, xn * xn + yn * yn); + + dPhi = SDL::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); //function from Hit.cc + noShiftedDphi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + } else { + shiftedX = xn; + shiftedY = yn; + shiftedZ = zLower; + shiftedRt = alpaka::math::sqrt(acc, xn * xn + yn * yn); + dPhi = SDL::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); + noShiftedDphi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + } + } else { + shiftedX = 0; + shiftedY = 0; + shiftedZ = 0; + dPhi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + noShiftedDphi = dPhi; + } + + pass = pass && (alpaka::math::abs(acc, dPhi) < miniCut); + if (not pass) + return pass; + + // Cut #3: The dphi change going from lower Hit to upper Hit + // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3076 + if (modulesInGPU.sides[lowerModuleIndex] != Center) 
{ + // When it is tilted, use the new shifted positions + // TODO: This is somewhat of an mystery.... somewhat confused why this is the case + if (modulesInGPU.moduleLayerType[lowerModuleIndex] != SDL::Pixel) { + // dPhi Change should be calculated so that the upper hit has higher rt. + // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this. + // (i.e. the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering) + // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?) + // setdeltaPhiChange(lowerHit.rt() < upperHitMod.rt() ? lowerHit.deltaPhiChange(upperHitMod) : upperHitMod.deltaPhiChange(lowerHit)); + + dPhiChange = (rtLower < shiftedRt) ? SDL::deltaPhiChange(acc, xLower, yLower, shiftedX, shiftedY) + : SDL::deltaPhiChange(acc, shiftedX, shiftedY, xLower, yLower); + noShiftedDphiChange = rtLower < rtUpper ? SDL::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) + : SDL::deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); + } else { + // dPhi Change should be calculated so that the upper hit has higher rt. + // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this. + // (i.e. the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering) + // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?) + + dPhiChange = (shiftedRt < rtUpper) ? SDL::deltaPhiChange(acc, shiftedX, shiftedY, xUpper, yUpper) + : SDL::deltaPhiChange(acc, xUpper, yUpper, shiftedX, shiftedY); + noShiftedDphiChange = rtLower < rtUpper ? 
SDL::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) + : SDL::deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); + } + } else { + // When it is flat lying module, whichever is the lowerSide will always have rt lower + dPhiChange = SDL::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper); + noShiftedDphiChange = dPhiChange; + } + + pass = pass && (alpaka::math::abs(acc, dPhiChange) < miniCut); + noShiftedDz = 0; // not used anywhere + return pass; + }; + + template + ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex, + uint16_t& upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& drt, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDz, + float& noShiftedDphi, + float& noShiftedDphichange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper) { + bool pass = true; + + // There are series of cuts that applies to mini-doublet in a "endcap" region + // Cut #1 : dz cut. The dz difference can't be larger than 1cm. (max separation is 4mm for modules in the endcap) + // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3093 + // For PS module in case when it is tilted a different dz (after the strip hit shift) is calculated later. + + float dz = zLower - zUpper; // Not const since later it might change depending on the type of module + + const float dzCut = 1.f; + + pass = pass && (alpaka::math::abs(acc, dz) < dzCut); + if (not pass) + return pass; + // Cut #2 : drt cut. The dz difference can't be larger than 1cm. 
(max separation is 4mm for modules in the endcap) + // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3100 + const float drtCut = modulesInGPU.moduleType[lowerModuleIndex] == SDL::PS ? 2.f : 10.f; + drt = rtLower - rtUpper; + pass = pass && (alpaka::math::abs(acc, drt) < drtCut); + if (not pass) + return pass; + // The new scheme shifts strip hits to be "aligned" along the line of sight from interaction point to the pixel hit (if it is PS modules) + float xn = 0, yn = 0, zn = 0; + + float shiftedCoords[3]; + shiftStripHits(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + shiftedCoords, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + + xn = shiftedCoords[0]; + yn = shiftedCoords[1]; + zn = shiftedCoords[2]; + + if (modulesInGPU.moduleType[lowerModuleIndex] == SDL::PS) { + // Appropriate lower or upper hit is modified after checking which one was actually shifted + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == SDL::Pixel) { + shiftedX = xn; + shiftedY = yn; + shiftedZ = zUpper; + dPhi = SDL::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); + noShiftedDphi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + } else { + shiftedX = xn; + shiftedY = yn; + shiftedZ = zLower; + dPhi = SDL::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); + noShiftedDphi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + } + } else { + shiftedX = xn; + shiftedY = yn; + shiftedZ = zUpper; + dPhi = SDL::deltaPhi(acc, xLower, yLower, xn, yn); + noShiftedDphi = SDL::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + } + + // dz needs to change if it is a PS module where the strip hits are shifted in order to properly account for the case when a tilted module falls under "endcap logic" + // if it was an endcap it will have zero effect + if (modulesInGPU.moduleType[lowerModuleIndex] == SDL::PS) { + dz = 
  // Kernel: builds mini-doublets (MDs). Each (lower, upper) partner-module pair
  // is scanned over all lower-hit x upper-hit combinations; pairs passing the
  // mini-doublet selection are appended to mdsInGPU with atomic bookkeeping.
  // Thread mapping: grid dimension 1 strides over lower modules, dimension 2
  // strides over the flattened hit-pair index space.
  struct createMiniDoubletsInGPUv2 {
    template <typename TAcc>
    ALPAKA_FN_ACC void operator()(TAcc const& acc,
                                  struct SDL::modules modulesInGPU,
                                  struct SDL::hits hitsInGPU,
                                  struct SDL::miniDoublets mdsInGPU,
                                  struct SDL::objectRanges rangesInGPU) const {
      auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
      auto const gridThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

      for (uint16_t lowerModuleIndex = globalThreadIdx[1]; lowerModuleIndex < (*modulesInGPU.nLowerModules);
           lowerModuleIndex += gridThreadExtent[1]) {
        uint16_t upperModuleIndex = modulesInGPU.partnerModuleIndices[lowerModuleIndex];
        int nLowerHits = hitsInGPU.hitRangesnLower[lowerModuleIndex];
        int nUpperHits = hitsInGPU.hitRangesnUpper[lowerModuleIndex];
        if (hitsInGPU.hitRangesLower[lowerModuleIndex] == -1)
          continue;  // module has no lower hits; nothing to pair
        unsigned int upHitArrayIndex = hitsInGPU.hitRangesUpper[lowerModuleIndex];
        unsigned int loHitArrayIndex = hitsInGPU.hitRangesLower[lowerModuleIndex];
        int limit = nUpperHits * nLowerHits;  // flattened 2D (lower, upper) pair space

        for (int hitIndex = globalThreadIdx[2]; hitIndex < limit; hitIndex += gridThreadExtent[2]) {
          // De-flatten the pair index.
          int lowerHitIndex = hitIndex / nUpperHits;
          int upperHitIndex = hitIndex % nUpperHits;
          if (upperHitIndex >= nUpperHits)
            continue;
          if (lowerHitIndex >= nLowerHits)
            continue;
          unsigned int lowerHitArrayIndex = loHitArrayIndex + lowerHitIndex;
          float xLower = hitsInGPU.xs[lowerHitArrayIndex];
          float yLower = hitsInGPU.ys[lowerHitArrayIndex];
          float zLower = hitsInGPU.zs[lowerHitArrayIndex];
          float rtLower = hitsInGPU.rts[lowerHitArrayIndex];
          unsigned int upperHitArrayIndex = upHitArrayIndex + upperHitIndex;
          float xUpper = hitsInGPU.xs[upperHitArrayIndex];
          float yUpper = hitsInGPU.ys[upperHitArrayIndex];
          float zUpper = hitsInGPU.zs[upperHitArrayIndex];
          float rtUpper = hitsInGPU.rts[upperHitArrayIndex];

          // Outputs of the selection algorithm (filled by reference).
          float dz, dphi, dphichange, shiftedX, shiftedY, shiftedZ, noShiftedDz, noShiftedDphi, noShiftedDphiChange;
          bool success = runMiniDoubletDefaultAlgo(acc,
                                                   modulesInGPU,
                                                   lowerModuleIndex,
                                                   upperModuleIndex,
                                                   lowerHitArrayIndex,
                                                   upperHitArrayIndex,
                                                   dz,
                                                   dphi,
                                                   dphichange,
                                                   shiftedX,
                                                   shiftedY,
                                                   shiftedZ,
                                                   noShiftedDz,
                                                   noShiftedDphi,
                                                   noShiftedDphiChange,
                                                   xLower,
                                                   yLower,
                                                   zLower,
                                                   rtLower,
                                                   xUpper,
                                                   yUpper,
                                                   zUpper,
                                                   rtUpper);
          if (success) {
            // Count every candidate (even overflow ones) for diagnostics; only
            // store the MD if the module's reserved slot budget is not exceeded.
            int totOccupancyMDs =
                alpaka::atomicOp<alpaka::AtomicAdd>(acc, &mdsInGPU.totOccupancyMDs[lowerModuleIndex], 1u);
            if (totOccupancyMDs >= (rangesInGPU.miniDoubletModuleOccupancy[lowerModuleIndex])) {
#ifdef Warnings
              printf("Mini-doublet excess alert! Module index = %d\n", lowerModuleIndex);
#endif
            } else {
              // Claim a slot within the module's pre-allocated index range.
              int mdModuleIndex = alpaka::atomicOp<alpaka::AtomicAdd>(acc, &mdsInGPU.nMDs[lowerModuleIndex], 1u);
              unsigned int mdIndex = rangesInGPU.miniDoubletModuleIndices[lowerModuleIndex] + mdModuleIndex;

              addMDToMemory(acc,
                            mdsInGPU,
                            hitsInGPU,
                            modulesInGPU,
                            lowerHitArrayIndex,
                            upperHitArrayIndex,
                            lowerModuleIndex,
                            dz,
                            dphi,
                            dphichange,
                            shiftedX,
                            shiftedY,
                            shiftedZ,
                            noShiftedDz,
                            noShiftedDphi,
                            noShiftedDphiChange,
                            mdIndex);
            }
          }
        }
      }
    }
  };
Module index = %d\n", lowerModuleIndex); +#endif + } else { + int mdModuleIndex = alpaka::atomicOp(acc, &mdsInGPU.nMDs[lowerModuleIndex], 1u); + unsigned int mdIndex = rangesInGPU.miniDoubletModuleIndices[lowerModuleIndex] + mdModuleIndex; + + addMDToMemory(acc, + mdsInGPU, + hitsInGPU, + modulesInGPU, + lowerHitArrayIndex, + upperHitArrayIndex, + lowerModuleIndex, + dz, + dphi, + dphichange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDz, + noShiftedDphi, + noShiftedDphiChange, + mdIndex); + } + } + } + } + } + }; + + struct createMDArrayRangesGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + // Initialize variables in shared memory and set to 0 + int& nTotalMDs = alpaka::declareSharedVar(acc); + nTotalMDs = 0; + alpaka::syncBlockThreads(acc); + + // Initialize variables outside of the for loop. 
+ int occupancy, category_number, eta_number; + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + short module_rings = modulesInGPU.rings[i]; + short module_layers = modulesInGPU.layers[i]; + short module_subdets = modulesInGPU.subdets[i]; + float module_eta = alpaka::math::abs(acc, modulesInGPU.eta[i]); + + if (module_layers <= 3 && module_subdets == 5) + category_number = 0; + else if (module_layers >= 4 && module_subdets == 5) + category_number = 1; + else if (module_layers <= 2 && module_subdets == 4 && module_rings >= 11) + category_number = 2; + else if (module_layers >= 3 && module_subdets == 4 && module_rings >= 8) + category_number = 2; + else if (module_layers <= 2 && module_subdets == 4 && module_rings <= 10) + category_number = 3; + else if (module_layers >= 3 && module_subdets == 4 && module_rings <= 7) + category_number = 3; + else + category_number = -1; + + if (module_eta < 0.75) + eta_number = 0; + else if (module_eta > 0.75 && module_eta < 1.5) + eta_number = 1; + else if (module_eta > 1.5 && module_eta < 2.25) + eta_number = 2; + else if (module_eta > 2.25 && module_eta < 3) + eta_number = 3; + else + eta_number = -1; + + if (category_number == 0 && eta_number == 0) + occupancy = 49; + else if (category_number == 0 && eta_number == 1) + occupancy = 42; + else if (category_number == 0 && eta_number == 2) + occupancy = 37; + else if (category_number == 0 && eta_number == 3) + occupancy = 41; + else if (category_number == 1) + occupancy = 100; + else if (category_number == 2 && eta_number == 1) + occupancy = 16; + else if (category_number == 2 && eta_number == 2) + occupancy = 19; + else if (category_number == 3 && eta_number == 1) + occupancy = 14; + else if (category_number == 3 && eta_number == 2) + occupancy = 20; + else if (category_number == 3 && eta_number == 3) + occupancy = 25; + else { + occupancy = 0; +#ifdef Warnings + printf("Unhandled case in createMDArrayRangesGPU! 
Module index = %i\n", i); +#endif + } + + unsigned int nTotMDs = alpaka::atomicOp(acc, &nTotalMDs, occupancy); + + rangesInGPU.miniDoubletModuleIndices[i] = nTotMDs; + rangesInGPU.miniDoubletModuleOccupancy[i] = occupancy; + } + + // Wait for all threads to finish before reporting final values + alpaka::syncBlockThreads(acc); + if (globalThreadIdx[2] == 0) { + rangesInGPU.miniDoubletModuleIndices[*modulesInGPU.nLowerModules] = nTotalMDs; + *rangesInGPU.device_nTotalMDs = nTotalMDs; + } + } + }; + + struct addMiniDoubletRangesToEventExplicit { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::hits hitsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (mdsInGPU.nMDs[i] == 0 or hitsInGPU.hitRanges[i * 2] == -1) { + rangesInGPU.mdRanges[i * 2] = -1; + rangesInGPU.mdRanges[i * 2 + 1] = -1; + } else { + rangesInGPU.mdRanges[i * 2] = rangesInGPU.miniDoubletModuleIndices[i]; + rangesInGPU.mdRanges[i * 2 + 1] = rangesInGPU.miniDoubletModuleIndices[i] + mdsInGPU.nMDs[i] - 1; + } + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.dev.cc b/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.dev.cc new file mode 100644 index 0000000000000..9a1ee2dedf52c --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.dev.cc @@ -0,0 +1,101 @@ +#include "ModuleConnectionMap.h" + +SDL::ModuleConnectionMap::ModuleConnectionMap() {} + +SDL::ModuleConnectionMap::ModuleConnectionMap(std::string filename) { load(filename); } + +SDL::ModuleConnectionMap::~ModuleConnectionMap() {} + +void SDL::ModuleConnectionMap::load(std::string filename) { + moduleConnections_.clear(); + + std::ifstream ifile(filename, 
std::ios::binary); + if (!ifile.is_open()) { + throw std::runtime_error("Unable to open file: " + filename); + } + + while (!ifile.eof()) { + unsigned int detid, number_of_connections; + + // Read the detid and the number of connections from the binary file + ifile.read(reinterpret_cast(&detid), sizeof(detid)); + ifile.read(reinterpret_cast(&number_of_connections), sizeof(number_of_connections)); + + if (ifile) { + std::vector connected_detids; + + // Read the connections for the given detid + for (unsigned int i = 0; i < number_of_connections; ++i) { + unsigned int connected_detid; + ifile.read(reinterpret_cast(&connected_detid), sizeof(connected_detid)); + if (ifile) { + connected_detids.push_back(connected_detid); + } else { + if (!ifile.eof()) { + throw std::runtime_error("Failed to read connection data."); + } + break; // Exit loop on read failure that's not EOF + } + } + + if (ifile) { + moduleConnections_[detid] = connected_detids; + } + } else { + if (!ifile.eof()) { + throw std::runtime_error("Failed to read module connection binary data."); + } + } + } +} + +void SDL::ModuleConnectionMap::add(std::string filename) { + std::ifstream ifile; + ifile.open(filename.c_str()); + std::string line; + + while (std::getline(ifile, line)) { + unsigned int detid; + int number_of_connections; + std::vector connected_detids; + unsigned int connected_detid; + + std::stringstream ss(line); + + ss >> detid >> number_of_connections; + + for (int ii = 0; ii < number_of_connections; ++ii) { + ss >> connected_detid; + connected_detids.push_back(connected_detid); + } + + // Concatenate + moduleConnections_[detid].insert(moduleConnections_[detid].end(), connected_detids.begin(), connected_detids.end()); + + // Sort + std::sort(moduleConnections_[detid].begin(), moduleConnections_[detid].end()); + + // Unique + moduleConnections_[detid].erase(std::unique(moduleConnections_[detid].begin(), moduleConnections_[detid].end()), + moduleConnections_[detid].end()); + } +} + +void 
SDL::ModuleConnectionMap::print() { + std::cout << "Printing ModuleConnectionMap" << std::endl; + for (auto& pair : moduleConnections_) { + unsigned int detid = pair.first; + std::vector connected_detids = pair.second; + std::cout << " detid: " << detid << std::endl; + for (auto& connected_detid : connected_detids) { + std::cout << " connected_detid: " << connected_detid << std::endl; + } + } +} + +const std::vector& SDL::ModuleConnectionMap::getConnectedModuleDetIds(unsigned int detid) const { + static const std::vector dummy; + auto const mList = moduleConnections_.find(detid); + return mList != moduleConnections_.end() ? mList->second : dummy; +} +int SDL::ModuleConnectionMap::size() const { return moduleConnections_.size(); } diff --git a/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.h b/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.h new file mode 100644 index 0000000000000..45d629a74e00e --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/ModuleConnectionMap.h @@ -0,0 +1,42 @@ +#ifndef ModuleConnectionMap_h +#define ModuleConnectionMap_h + +#include +#include +#include +#include +#include +#include + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +namespace SDL { + //FIXME: move to non-alpaka single arch build + template + class ModuleConnectionMap; + template <> + class ModuleConnectionMap { + private: + std::map> moduleConnections_; + + public: + ModuleConnectionMap(); + ModuleConnectionMap(std::string filename); + ~ModuleConnectionMap(); + + void load(std::string); + void add(std::string); + void print(); + + const std::vector& getConnectedModuleDetIds(unsigned int detid) const; + int size() const; + }; + + using MapPLStoLayer = std::array, 4>, 3>; +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/ModuleMethods.h b/RecoTracker/LSTCore/src/alpaka/ModuleMethods.h new file mode 100644 index 0000000000000..dc0eb0fc1f3b8 --- /dev/null +++ 
  // Builds the pixel-seed (pLS) to outer-tracker-module connection lists for
  // every superbin and uploads them to the device. Fills the index/size tables
  // in `pixelMapping` for the three map flavors (pLStoLayer[0] = default,
  // [1] = positive-side, [2] = negative-side), sets `nPixels` to the total
  // connection count, and allocates `modulesBuf` on first use (once both
  // nModules and nPixels are known).
  template <typename TQueue>
  inline void fillPixelMap(std::shared_ptr<modulesBuffer<SDL::Dev>>& modulesBuf,
                           uint16_t nModules,
                           unsigned int& nPixels,
                           pixelMap& pixelMapping,
                           TQueue queue,
                           const MapPLStoLayer& pLStoLayer,
                           struct ModuleMetaData& mmd) {
    // detId 1 is the single virtual pixel module (appended last in the index map).
    pixelMapping.pixelModuleIndex = mmd.detIdToIndex[1];

    std::vector<unsigned int> connectedModuleDetIds;
    std::vector<unsigned int> connectedModuleDetIds_pos;
    std::vector<unsigned int> connectedModuleDetIds_neg;

    unsigned int totalSizes = 0;
    unsigned int totalSizes_pos = 0;
    unsigned int totalSizes_neg = 0;
    for (unsigned int isuperbin = 0; isuperbin < size_superbins; isuperbin++) {
      int sizes = 0;
      // Default maps are queried with an offset of size_superbins.
      for (auto const& mCM_pLS : pLStoLayer[0]) {
        std::vector<unsigned int> connectedModuleDetIds_pLS =
            mCM_pLS.getConnectedModuleDetIds(isuperbin + size_superbins);
        connectedModuleDetIds.insert(
            connectedModuleDetIds.end(), connectedModuleDetIds_pLS.begin(), connectedModuleDetIds_pLS.end());
        sizes += connectedModuleDetIds_pLS.size();
      }
      pixelMapping.connectedPixelsIndex[isuperbin] = totalSizes;  // start offset for this superbin
      pixelMapping.connectedPixelsSizes[isuperbin] = sizes;
      totalSizes += sizes;

      int sizes_pos = 0;
      for (auto const& mCM_pLS : pLStoLayer[1]) {
        std::vector<unsigned int> connectedModuleDetIds_pLS_pos = mCM_pLS.getConnectedModuleDetIds(isuperbin);
        connectedModuleDetIds_pos.insert(connectedModuleDetIds_pos.end(),
                                         connectedModuleDetIds_pLS_pos.begin(),
                                         connectedModuleDetIds_pLS_pos.end());
        sizes_pos += connectedModuleDetIds_pLS_pos.size();
      }
      pixelMapping.connectedPixelsIndexPos[isuperbin] = totalSizes_pos;
      pixelMapping.connectedPixelsSizesPos[isuperbin] = sizes_pos;
      totalSizes_pos += sizes_pos;

      int sizes_neg = 0;
      for (auto const& mCM_pLS : pLStoLayer[2]) {
        std::vector<unsigned int> connectedModuleDetIds_pLS_neg = mCM_pLS.getConnectedModuleDetIds(isuperbin);
        connectedModuleDetIds_neg.insert(connectedModuleDetIds_neg.end(),
                                         connectedModuleDetIds_pLS_neg.begin(),
                                         connectedModuleDetIds_pLS_neg.end());
        sizes_neg += connectedModuleDetIds_pLS_neg.size();
      }
      pixelMapping.connectedPixelsIndexNeg[isuperbin] = totalSizes_neg;
      pixelMapping.connectedPixelsSizesNeg[isuperbin] = sizes_neg;
      totalSizes_neg += sizes_neg;
    }

    unsigned int connectedPix_size = totalSizes + totalSizes_pos + totalSizes_neg;
    nPixels = connectedPix_size;

    // Now we can initialize modulesBuf — both nModules and nPixels are known.
    if (modulesBuf == nullptr) {
      SDL::Dev const& devAcc = alpaka::getDev(queue);
      modulesBuf = std::make_shared<modulesBuffer<SDL::Dev>>(devAcc, nModules, nPixels);
    }

    DevHost const& devHost = cms::alpakatools::host();
    auto connectedPixels_buf = allocBufWrapper<unsigned int>(devHost, connectedPix_size);
    unsigned int* connectedPixels = alpaka::getPtrNative(connectedPixels_buf);

    // Translate detIds to module indices. The device buffer is the
    // concatenation [default | positive | negative].
    for (unsigned int icondet = 0; icondet < totalSizes; icondet++) {
      connectedPixels[icondet] = mmd.detIdToIndex[connectedModuleDetIds[icondet]];
    }
    for (unsigned int icondet = 0; icondet < totalSizes_pos; icondet++) {
      connectedPixels[icondet + totalSizes] = mmd.detIdToIndex[connectedModuleDetIds_pos[icondet]];
    }
    for (unsigned int icondet = 0; icondet < totalSizes_neg; icondet++) {
      connectedPixels[icondet + totalSizes + totalSizes_pos] = mmd.detIdToIndex[connectedModuleDetIds_neg[icondet]];
    }

    alpaka::memcpy(queue, modulesBuf->connectedPixels_buf, connectedPixels_buf);
    alpaka::wait(queue);
  };
  // Uploads the module-to-module connection map: for every module index, the
  // number of connected modules and their indices, flattened into a single
  // array with fixed stride MAX_CONNECTED_MODULES.
  template <typename TQueue>
  inline void fillConnectedModuleArrayExplicit(struct modulesBuffer<SDL::Dev>* modulesBuf,
                                               unsigned int nMod,
                                               TQueue queue,
                                               struct ModuleMetaData& mmd,
                                               const ModuleConnectionMap<SDL::Dev>* moduleConnectionMap) {
    DevHost const& devHost = cms::alpakatools::host();
    auto moduleMap_buf = allocBufWrapper<uint16_t>(devHost, nMod * MAX_CONNECTED_MODULES);
    uint16_t* moduleMap = alpaka::getPtrNative(moduleMap_buf);

    auto nConnectedModules_buf = allocBufWrapper<uint16_t>(devHost, nMod);
    uint16_t* nConnectedModules = alpaka::getPtrNative(nConnectedModules_buf);

    for (auto it = mmd.detIdToIndex.begin(); it != mmd.detIdToIndex.end(); ++it) {
      unsigned int detId = it->first;
      uint16_t index = it->second;
      // Connections are stored by detId; translate each to its module index.
      auto& connectedModules = moduleConnectionMap->getConnectedModuleDetIds(detId);
      nConnectedModules[index] = connectedModules.size();
      for (uint16_t i = 0; i < nConnectedModules[index]; i++) {
        moduleMap[index * MAX_CONNECTED_MODULES + i] = mmd.detIdToIndex[connectedModules[i]];
      }
    }

    alpaka::memcpy(queue, modulesBuf->moduleMap_buf, moduleMap_buf);
    alpaka::memcpy(queue, modulesBuf->nConnectedModules_buf, nConnectedModules_buf);
    alpaka::wait(queue);
  };
detId; + counter++; + } + + alpaka::memcpy(queue, modulesBuf->mapIdx_buf, mapIdx_buf); + alpaka::memcpy(queue, modulesBuf->mapdetId_buf, mapdetId_buf); + alpaka::wait(queue); + }; + + inline void setDerivedQuantities(unsigned int detId, + unsigned short& layer, + unsigned short& ring, + unsigned short& rod, + unsigned short& module, + unsigned short& subdet, + unsigned short& side, + float m_x, + float m_y, + float m_z, + float& eta, + float& r) { + subdet = (detId & (7 << 25)) >> 25; + side = (subdet == Endcap) ? (detId & (3 << 23)) >> 23 : (detId & (3 << 18)) >> 18; + layer = (subdet == Endcap) ? (detId & (7 << 18)) >> 18 : (detId & (7 << 20)) >> 20; + ring = (subdet == Endcap) ? (detId & (15 << 12)) >> 12 : 0; + module = (detId & (127 << 2)) >> 2; + rod = (subdet == Endcap) ? 0 : (detId & (127 << 10)) >> 10; + + r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z); + eta = ((m_z > 0) - (m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y)); + }; + + inline void loadCentroidsFromFile(const char* filePath, ModuleMetaData& mmd, uint16_t& nModules) { + std::ifstream ifile(filePath, std::ios::binary); + if (!ifile.is_open()) { + throw std::runtime_error("Unable to open file: " + std::string(filePath)); + } + + uint16_t counter = 0; + while (!ifile.eof()) { + unsigned int temp_detId; + float module_x, module_y, module_z; + int module_type; + + ifile.read(reinterpret_cast(&temp_detId), sizeof(temp_detId)); + ifile.read(reinterpret_cast(&module_x), sizeof(module_x)); + ifile.read(reinterpret_cast(&module_y), sizeof(module_y)); + ifile.read(reinterpret_cast(&module_z), sizeof(module_z)); + ifile.read(reinterpret_cast(&module_type), sizeof(module_type)); + + if (ifile) { + mmd.detIdToIndex[temp_detId] = counter; + mmd.module_x[temp_detId] = module_x; + mmd.module_y[temp_detId] = module_y; + mmd.module_z[temp_detId] = module_z; + mmd.module_type[temp_detId] = module_type; + counter++; + } else { + if (!ifile.eof()) { + throw std::runtime_error("Failed to read data for 
  // Reads the binary module-centroid file: fixed-size records of
  // (detId u32, x f32, y f32, z f32, type i32), filling `mmd` keyed by detId
  // and assigning sequential module indices. Appends the virtual pixel module
  // (detId 1) as the final entry and reports the total count in `nModules`.
  // Throws std::runtime_error on open failure or a truncated mid-record read.
  inline void loadCentroidsFromFile(const char* filePath, ModuleMetaData& mmd, uint16_t& nModules) {
    std::ifstream ifile(filePath, std::ios::binary);
    if (!ifile.is_open()) {
      throw std::runtime_error("Unable to open file: " + std::string(filePath));
    }

    uint16_t counter = 0;
    while (!ifile.eof()) {
      unsigned int temp_detId;
      float module_x, module_y, module_z;
      int module_type;

      ifile.read(reinterpret_cast<char*>(&temp_detId), sizeof(temp_detId));
      ifile.read(reinterpret_cast<char*>(&module_x), sizeof(module_x));
      ifile.read(reinterpret_cast<char*>(&module_y), sizeof(module_y));
      ifile.read(reinterpret_cast<char*>(&module_z), sizeof(module_z));
      ifile.read(reinterpret_cast<char*>(&module_type), sizeof(module_type));

      if (ifile) {
        // A complete record was read; register it.
        mmd.detIdToIndex[temp_detId] = counter;
        mmd.module_x[temp_detId] = module_x;
        mmd.module_y[temp_detId] = module_y;
        mmd.module_z[temp_detId] = module_z;
        mmd.module_type[temp_detId] = module_type;
        counter++;
      } else {
        if (!ifile.eof()) {
          // Read failed for a reason other than clean end-of-file.
          throw std::runtime_error("Failed to read data for detId: " + std::to_string(temp_detId));
        }
        // Clean EOF after the last complete record: the loop will terminate.
      }
    }

    mmd.detIdToIndex[1] = counter;  //pixel module is the last module in the module list
    counter++;
    nModules = counter;
  };
  // Top-level host-side module setup: reads the centroid file, derives the
  // per-module geometry/type quantities, assigns the canonical index ordering
  // (lower modules first, then the pixel module, then upper modules), builds
  // the pixel map and connection maps, and uploads everything to the
  // device-side `modulesBuf` (allocated inside fillPixelMap).
  template <typename TQueue>
  void loadModulesFromFile(TQueue& queue,
                           const MapPLStoLayer* pLStoLayer,
                           const char* moduleMetaDataFilePath,
                           uint16_t& nModules,
                           uint16_t& nLowerModules,
                           unsigned int& nPixels,
                           std::shared_ptr<modulesBuffer<SDL::Dev>>& modulesBuf,
                           pixelMap* pixelMapping,
                           const EndcapGeometry<SDL::Dev>* endcapGeometry,
                           const TiltedGeometry<SDL::Dev>* tiltedGeometry,
                           const ModuleConnectionMap<SDL::Dev>* moduleConnectionMap) {
    ModuleMetaData mmd;

    loadCentroidsFromFile(moduleMetaDataFilePath, mmd, nModules);

    // Host-side staging buffers, one entry per module.
    DevHost const& devHost = cms::alpakatools::host();
    auto detIds_buf = allocBufWrapper<unsigned int>(devHost, nModules);
    auto layers_buf = allocBufWrapper<short>(devHost, nModules);
    auto rings_buf = allocBufWrapper<short>(devHost, nModules);
    auto rods_buf = allocBufWrapper<short>(devHost, nModules);
    auto modules_buf = allocBufWrapper<short>(devHost, nModules);
    auto subdets_buf = allocBufWrapper<short>(devHost, nModules);
    auto sides_buf = allocBufWrapper<short>(devHost, nModules);
    auto eta_buf = allocBufWrapper<float>(devHost, nModules);
    auto r_buf = allocBufWrapper<float>(devHost, nModules);
    auto isInverted_buf = allocBufWrapper<bool>(devHost, nModules);
    auto isLower_buf = allocBufWrapper<bool>(devHost, nModules);
    auto isAnchor_buf = allocBufWrapper<bool>(devHost, nModules);
    auto moduleType_buf = allocBufWrapper<ModuleType>(devHost, nModules);
    auto moduleLayerType_buf = allocBufWrapper<ModuleLayerType>(devHost, nModules);
    auto dxdys_buf = allocBufWrapper<float>(devHost, nModules);
    auto drdzs_buf = allocBufWrapper<float>(devHost, nModules);
    auto partnerModuleIndices_buf = allocBufWrapper<uint16_t>(devHost, nModules);
    auto sdlLayers_buf = allocBufWrapper<int>(devHost, nModules);

    // Getting the underlying data pointers
    unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf);
    short* host_layers = alpaka::getPtrNative(layers_buf);
    short* host_rings = alpaka::getPtrNative(rings_buf);
    short* host_rods = alpaka::getPtrNative(rods_buf);
    short* host_modules = alpaka::getPtrNative(modules_buf);
    short* host_subdets = alpaka::getPtrNative(subdets_buf);
    short* host_sides = alpaka::getPtrNative(sides_buf);
    float* host_eta = alpaka::getPtrNative(eta_buf);
    float* host_r = alpaka::getPtrNative(r_buf);
    bool* host_isInverted = alpaka::getPtrNative(isInverted_buf);
    bool* host_isLower = alpaka::getPtrNative(isLower_buf);
    bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf);
    ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf);
    ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf);
    float* host_dxdys = alpaka::getPtrNative(dxdys_buf);
    float* host_drdzs = alpaka::getPtrNative(drdzs_buf);
    uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf);
    int* host_sdlLayers = alpaka::getPtrNative(sdlLayers_buf);

    //reassign detIdToIndex indices here
    nLowerModules = (nModules - 1) / 2;
    uint16_t lowerModuleCounter = 0;
    uint16_t upperModuleCounter = nLowerModules + 1;
    //0 to nLowerModules - 1 => only lower modules, nLowerModules - pixel module, nLowerModules + 1 to nModules => upper modules
    for (auto it = mmd.detIdToIndex.begin(); it != mmd.detIdToIndex.end(); it++) {
      unsigned int detId = it->first;
      float m_x = mmd.module_x[detId];
      float m_y = mmd.module_y[detId];
      float m_z = mmd.module_z[detId];
      unsigned int m_t = mmd.module_type[detId];

      float eta, r;

      uint16_t index;
      unsigned short layer, ring, rod, module, subdet, side;
      bool isInverted, isLower;
      if (detId == 1) {
        // The virtual pixel module carries no geometry information.
        layer = 0;
        ring = 0;
        rod = 0;
        module = 0;
        subdet = 0;
        side = 0;
        isInverted = false;
        isLower = false;
        eta = 0;
        r = 0;
      } else {
        setDerivedQuantities(detId, layer, ring, rod, module, subdet, side, m_x, m_y, m_z, eta, r);
        isInverted = SDL::modules::parseIsInverted(subdet, side, module, layer);
        isLower = SDL::modules::parseIsLower(isInverted, detId);
      }
      // Assign the canonical index based on lower/pixel/upper classification.
      if (isLower) {
        index = lowerModuleCounter;
        lowerModuleCounter++;
      } else if (detId != 1) {
        index = upperModuleCounter;
        upperModuleCounter++;
      } else {
        index = nLowerModules;  //pixel
      }
      //reassigning indices!
      mmd.detIdToIndex[detId] = index;
      host_detIds[index] = detId;
      host_layers[index] = layer;
      host_rings[index] = ring;
      host_rods[index] = rod;
      host_modules[index] = module;
      host_subdets[index] = subdet;
      host_sides[index] = side;
      host_eta[index] = eta;
      host_r[index] = r;
      host_isInverted[index] = isInverted;
      host_isLower[index] = isLower;

      //assigning other variables!
      if (detId == 1) {
        host_moduleType[index] = PixelModule;
        host_moduleLayerType[index] = SDL::InnerPixelLayer;
        host_dxdys[index] = 0;
        host_drdzs[index] = 0;
        host_isAnchor[index] = false;
      } else {
        // Module type codes from the metadata file: 23 = Ph2PSP, 24 = Ph2PSS, 25 = Ph2SS.
        host_moduleType[index] = (m_t == 25 ? SDL::TwoS : SDL::PS);
        host_moduleLayerType[index] = (m_t == 23 ? SDL::Pixel : SDL::Strip);

        // Anchor: the pixel side of a PS pair, or the lower module of a 2S pair.
        if (host_moduleType[index] == SDL::PS and host_moduleLayerType[index] == SDL::Pixel) {
          host_isAnchor[index] = true;
        } else if (host_moduleType[index] == SDL::TwoS and host_isLower[index]) {
          host_isAnchor[index] = true;
        } else {
          host_isAnchor[index] = false;
        }

        host_dxdys[index] = (subdet == Endcap) ? endcapGeometry->getdxdy_slope(detId) : tiltedGeometry->getDxDy(detId);
        host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry->getDrDz(detId) : 0;
      }

      // Logical layer numbering: barrel 1-6, endcap PS 7-11, endcap 2S 12-16.
      host_sdlLayers[index] =
          layer + 6 * (subdet == SDL::Endcap) + 5 * (subdet == SDL::Endcap and host_moduleType[index] == SDL::TwoS);
    }

    //partner module stuff, and slopes and drdz move around
    for (auto it = mmd.detIdToIndex.begin(); it != mmd.detIdToIndex.end(); it++) {
      auto& detId = it->first;
      auto& index = it->second;
      if (detId != 1) {
        host_partnerModuleIndices[index] =
            mmd.detIdToIndex[SDL::modules::parsePartnerModuleId(detId, host_isLower[index], host_isInverted[index])];
        //add drdz and slope importing stuff here!
        // Modules missing their own slope/drdz inherit from their partner.
        if (host_drdzs[index] == 0) {
          host_drdzs[index] = host_drdzs[host_partnerModuleIndices[index]];
        }
        if (host_dxdys[index] == 0) {
          host_dxdys[index] = host_dxdys[host_partnerModuleIndices[index]];
        }
      }
    }

    // modulesBuf is initialized in fillPixelMap since both nModules and nPix will be known
    fillPixelMap(modulesBuf, nModules, nPixels, *pixelMapping, queue, *pLStoLayer, mmd);

    auto src_view_nModules = alpaka::createView(devHost, &nModules, (Idx)1u);
    alpaka::memcpy(queue, modulesBuf->nModules_buf, src_view_nModules);

    auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (Idx)1u);
    alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules);

    alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf);
    alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf);

    alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf);
    alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf);
    alpaka::memcpy(queue, modulesBuf->rings_buf, rings_buf);
    alpaka::memcpy(queue, modulesBuf->rods_buf, rods_buf);
    alpaka::memcpy(queue, modulesBuf->modules_buf, modules_buf);
    alpaka::memcpy(queue, modulesBuf->subdets_buf, subdets_buf);
    alpaka::memcpy(queue, modulesBuf->sides_buf, sides_buf);
    alpaka::memcpy(queue, modulesBuf->eta_buf, eta_buf);
    alpaka::memcpy(queue, modulesBuf->r_buf, r_buf);
    alpaka::memcpy(queue, modulesBuf->isInverted_buf, isInverted_buf);
    alpaka::memcpy(queue, modulesBuf->isLower_buf, isLower_buf);
    alpaka::memcpy(queue, modulesBuf->isAnchor_buf, isAnchor_buf);
    alpaka::memcpy(queue, modulesBuf->dxdys_buf, dxdys_buf);
    alpaka::memcpy(queue, modulesBuf->drdzs_buf, drdzs_buf);
    alpaka::memcpy(queue, modulesBuf->partnerModuleIndices_buf, partnerModuleIndices_buf);
    alpaka::memcpy(queue, modulesBuf->sdlLayers_buf, sdlLayers_buf);
    alpaka::wait(queue);

    fillConnectedModuleArrayExplicit(modulesBuf.get(), nModules, queue, mmd, moduleConnectionMap);
    fillMapArraysExplicit(modulesBuf.get(), nModules, queue, mmd);
  };
lowerModuleIndex4 = lowerModuleIndices[3]; + uint16_t lowerModuleIndex5 = lowerModuleIndices[4]; + + // Compute some convenience variables + short layer2_adjustment = 0; + if (modulesInGPU.layers[lowerModuleIndex1] == 1) { + layer2_adjustment = 1; // get upper segment to be in second layer + } + unsigned int md_idx_for_t5_eta_phi = + segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + layer2_adjustment]]; + bool is_endcap1 = (modulesInGPU.subdets[lowerModuleIndex1] == 4); // true if anchor hit 1 is in the endcap + bool is_endcap2 = (modulesInGPU.subdets[lowerModuleIndex2] == 4); // true if anchor hit 2 is in the endcap + bool is_endcap3 = (modulesInGPU.subdets[lowerModuleIndex3] == 4); // true if anchor hit 3 is in the endcap + bool is_endcap4 = (modulesInGPU.subdets[lowerModuleIndex4] == 4); // true if anchor hit 4 is in the endcap + bool is_endcap5 = (modulesInGPU.subdets[lowerModuleIndex5] == 4); // true if anchor hit 5 is in the endcap + + // Build DNN input vector (corresponding output N-tuple branch noted in parenthetical in comment) + float x[38] = { + SDL::temp_log10(acc, 2 * SDL::k2Rinv1GeVf * innerRadius), // inner T3 pT (t3_pt) + mdsInGPU.anchorEta[mdIndex1], // inner T3 anchor hit 1 eta (t3_0_eta) + mdsInGPU.anchorPhi[mdIndex1], // inner T3 anchor hit 1 phi (t3_0_phi) + mdsInGPU.anchorZ[mdIndex1], // inner T3 anchor hit 1 z (t3_0_z) + alpaka::math::sqrt(acc, x1 * x1 + y1 * y1), // inner T3 anchor hit 1 r (t3_0_r) + float(modulesInGPU.layers[lowerModuleIndex1] + 6 * is_endcap1), // inner T3 anchor hit 1 layer (t3_0_layer) + mdsInGPU.anchorEta[mdIndex2], // inner T3 anchor hit 2 eta (t3_2_eta) + mdsInGPU.anchorPhi[mdIndex2], // inner T3 anchor hit 2 phi (t3_2_phi) + mdsInGPU.anchorZ[mdIndex2], // inner T3 anchor hit 2 z (t3_2_z) + alpaka::math::sqrt(acc, x2 * x2 + y2 * y2), // inner T3 anchor hit 2 r (t3_2_r) + float(modulesInGPU.layers[lowerModuleIndex2] + 6 * is_endcap2), // inner T3 anchor hit 2 layer (t3_2_layer) + 
mdsInGPU.anchorEta[mdIndex3], // inner T3 anchor hit 3 eta (t3_4_eta) + mdsInGPU.anchorPhi[mdIndex3], // inner T3 anchor hit 3 phi (t3_4_phi) + mdsInGPU.anchorZ[mdIndex3], // inner T3 anchor hit 3 z (t3_4_z) + alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // inner T3 anchor hit 3 r (t3_4_r) + float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // inner T3 anchor hit 3 layer (t3_4_layer) + SDL::temp_log10(acc, 2 * SDL::k2Rinv1GeVf * outerRadius), // outer T3 pT (t3_pt) + mdsInGPU.anchorEta[mdIndex3], // outer T3 anchor hit 4 eta (t3_0_eta) + mdsInGPU.anchorPhi[mdIndex3], // outer T3 anchor hit 4 phi (t3_0_phi) + mdsInGPU.anchorZ[mdIndex3], // outer T3 anchor hit 3 eta (t3_0_z) + alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // outer T3 anchor hit 3 r (t3_0_r) + float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // outer T3 anchor hit 3 layer (t3_0_layer) + mdsInGPU.anchorEta[mdIndex4], // outer T3 anchor hit 4 eta (t3_2_eta) + mdsInGPU.anchorPhi[mdIndex4], // outer T3 anchor hit 4 phi (t3_2_phi) + mdsInGPU.anchorZ[mdIndex4], // outer T3 anchor hit 4 z (t3_2_z) + alpaka::math::sqrt(acc, x4 * x4 + y4 * y4), // outer T3 anchor hit 4 r (t3_2_r) + float(modulesInGPU.layers[lowerModuleIndex4] + 6 * is_endcap4), // outer T3 anchor hit 4 layer (t3_2_layer) + mdsInGPU.anchorEta[mdIndex5], // outer T3 anchor hit 5 eta (t3_4_eta) + mdsInGPU.anchorPhi[mdIndex5], // outer T3 anchor hit 5 phi (t3_4_phi) + mdsInGPU.anchorZ[mdIndex5], // outer T3 anchor hit 5 z (t3_4_z) + alpaka::math::sqrt(acc, x5 * x5 + y5 * y5), // outer T3 anchor hit 5 r (t3_4_r) + float(modulesInGPU.layers[lowerModuleIndex5] + 6 * is_endcap5), // outer T3 anchor hit 5 layer (t3_4_layer) + SDL::temp_log10( + acc, (innerRadius + outerRadius) * SDL::magnetic_field * 1.602f / (2 * 100 * 5.39f)), // T5 pT (t5_pt) + mdsInGPU.anchorEta[md_idx_for_t5_eta_phi], // T5 eta (t5_eta) + mdsInGPU.anchorPhi[md_idx_for_t5_eta_phi], // T5 phi (t5_phi) + SDL::temp_log10(acc, innerRadius), // T5 inner radius 
(t5_innerRadius) + SDL::temp_log10(acc, bridgeRadius), // T5 bridge radius (t5_bridgeRadius) + SDL::temp_log10(acc, outerRadius) // T5 outer radius (t5_outerRadius) + }; + + // (0): Linear(in_features=38, out_features=32, bias=True) => x = x*W_T + b + float x_0[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_0[col] = 0; + for (unsigned int inner = 0; inner < 38; ++inner) { + x_0[col] += x[inner] * wgtT_0[inner][col]; + } + x_0[col] += bias_0[col]; + } + + // (1): ReLU() + float x_1[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_1[col] = (x_0[col] > 0.f) ? x_0[col] : 0.f; + } + + // (2): Linear(in_features=32, out_features=32, bias=True) => x = x*W_T + b + float x_2[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_2[col] = 0; + for (unsigned int inner = 0; inner < 32; ++inner) { + x_2[col] += x_1[inner] * wgtT_2[inner][col]; + } + x_2[col] += bias_2[col]; + } + + // (3): ReLU() + float x_3[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_3[col] = (x_2[col] > 0.f) ? 
x_2[col] : 0.f; + } + + // (4): Linear(in_features=32, out_features=1, bias=True) => x = x*W_T + b + float x_4[1]; + for (unsigned int col = 0; col < 1; ++col) { + x_4[col] = 0; + for (unsigned int inner = 0; inner < 32; ++inner) { + x_4[col] += x_3[inner] * wgtT_4[inner][col]; + } + x_4[col] += bias_4[col]; + } + + // (5): Sigmoid() + float x_5[1]; + for (unsigned int col = 0; col < 1; ++col) { + x_5[col] = alpaka::math::exp(acc, x_4[col]) / (alpaka::math::exp(acc, x_4[col]) + 1); + } + + return x_5[0]; + } +} // namespace T5DNN + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h b/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h new file mode 100644 index 0000000000000..90fb6b21ac7d4 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h @@ -0,0 +1,311 @@ +#ifndef NeuralNetworkWeights_cuh +#define NeuralNetworkWeights_cuh + +#include + +namespace T5DNN { + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_0[32] = { + -4.5069356f, -5.8842053f, 1.0793180f, -0.1540973f, -0.4705772f, 6.4027028f, -0.6620818f, -7.0734525f, + 0.6211641f, 4.9630723f, 3.4310920f, -0.8856288f, 4.5843782f, -6.0180559f, 0.0126438f, -1.5725276f, + -0.8549317f, -6.8545237f, -1.2129461f, 3.0617838f, -0.3911322f, 0.0799793f, -2.5398655f, -0.5780622f, + 2.8533990f, -0.1777968f, -2.6457164f, -0.7976936f, 4.5644889f, -2.1747942f, 3.4286616f, -10.1073380f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_0[38][32] = { + {6.1269712f, -10.6625051f, 17.4907818f, -0.0019928f, -3.4468415f, 1.6674044f, -7.8957767f, 2.2077549f, + 9.5517254f, -5.1345053f, -30.1643391f, 4.0148559f, -19.8330841f, -18.3806915f, 0.1334764f, 1.6213616f, + -4.1423774f, -15.3062429f, -1.0209556f, 1.5580219f, 0.7426265f, 0.0033929f, 1.3924170f, 0.9196110f, + -0.8995734f, 1.0594707f, 39.4390869f, 8.7642002f, 28.4583893f, -5.9235659f, 3.7221889f, 14.4167147f}, + {1.7863803f, -0.6068707f, 0.3166098f, -0.0608759f, 0.5939785f, 0.4870262f, -3.1375074f, -17.7147388f, + -0.7231818f, -9.3808413f, 
2.2070611f, 15.7461920f, 0.9355862f, 2.3942475f, -0.0671409f, 3.5954301f, + -3.0463996f, -2.0748904f, -0.5450584f, -4.4800100f, 0.6074556f, -0.0161482f, 3.0624702f, -4.5688419f, + 2.9881518f, -0.3714012f, -0.0387531f, -0.7699140f, 4.4028845f, 5.0333014f, -4.7350726f, -8.6568584f}, + {5.6548429f, -0.0207700f, 0.1785973f, 0.0881671f, 0.2530097f, -0.1893259f, -0.1105739f, -0.5183877f, + 1.0728362f, 0.1833011f, 1.7765219f, 0.3127359f, 0.0455277f, -0.1442616f, -0.1048361f, -0.1235604f, + -0.1217661f, -0.5487315f, 0.7575656f, -0.1177454f, -17.0993137f, 0.1628031f, 0.2789381f, 0.5304270f, + 0.0837841f, -3.1120780f, 0.0074821f, -0.1648044f, -0.3395336f, 0.3958135f, 0.8718957f, -1.1980486f}, + {0.2401041f, -0.0585765f, -0.0144584f, 0.0411095f, 0.0752229f, 0.0292672f, -0.2437613f, -1.4396472f, + -0.0971315f, -1.7181139f, 0.2417643f, 2.2030578f, 0.0566049f, 0.1081589f, -0.1060181f, 0.3473758f, + -0.7095683f, -0.0345675f, 0.2794849f, -1.1702278f, 0.2622930f, -0.0072611f, 0.5026371f, -1.2882922f, + -0.4712771f, 0.0597130f, -0.0039970f, -0.6050836f, 0.1554724f, 1.0991164f, -0.4975886f, 0.2597970f}, + {0.0766028f, 0.0218421f, -0.1739017f, -0.0076569f, 0.0384461f, -0.1841756f, 0.9677940f, -3.1114254f, + 2.3830564f, 2.0706992f, -0.9643140f, 0.7361387f, -0.0060253f, -0.1554846f, -0.0831100f, 2.8754771f, + -1.4403527f, -0.5281797f, 0.5157787f, 4.2405987f, 0.4807618f, 0.0217647f, -1.2626950f, 0.9145837f, + -0.3931780f, 0.3426280f, -0.0065206f, -0.7510439f, -0.4555758f, 2.7724340f, -1.2173026f, 0.1039017f}, + {0.5685715f, 0.3927337f, 0.4942532f, -0.0671033f, -0.2808350f, -0.0336000f, -1.3983957f, 0.9876546f, + -2.3840380f, 0.7315395f, -2.2009561f, -1.4631602f, -0.4672308f, -0.4994236f, 0.1169335f, -1.1894208f, + -1.2692982f, 0.3303853f, -2.0147655f, -0.9912014f, 1.0042895f, 0.1121151f, -1.0789106f, -2.2821584f, + -6.6459913f, -0.0959398f, -0.0068429f, -2.8177626f, 0.3213172f, -2.6832986f, -4.7613306f, -0.9985733f}, + {1.4419515f, -0.3864825f, -0.6756768f, -0.1273375f, 0.4321181f, 
0.3354745f, -0.8236564f, -2.8190827f, + 0.7090831f, 1.9072700f, -3.1834064f, -2.6938572f, 0.5051147f, 1.4382831f, 0.1241910f, -0.7352629f, + 0.7703634f, -1.7556250f, -2.1104112f, 3.0603442f, 1.9873468f, -0.0358815f, -1.0087154f, 3.8253262f, + -0.5466214f, 0.0875162f, 0.2691758f, 0.7121435f, 1.9314718f, -0.1580560f, 3.6484149f, -5.3173709f}, + {6.9104381f, -0.0033664f, -1.4405546f, -0.1768288f, 0.2028089f, -0.1012344f, -4.4735684f, 0.6354278f, + 4.3039737f, 0.2056303f, 1.8338999f, -1.1351355f, 0.1015760f, -0.0733253f, -0.0561627f, 2.5292397f, + 1.6314448f, -0.9333628f, -0.7773662f, 0.8313186f, -0.7829623f, 0.1265118f, 0.5922315f, -0.3463379f, + -1.3269740f, -3.3302619f, -0.0061799f, 2.3374722f, 0.0880938f, 0.7470241f, -0.4205743f, -4.7557602f}, + {0.0380794f, 0.0947470f, 0.0419397f, 0.0582226f, -0.0603404f, 0.0234028f, -0.2575402f, 0.4125248f, + 0.3035339f, 0.2663808f, -0.6092452f, -1.4727812f, 0.0247187f, -0.0539688f, -0.0150413f, 0.2094955f, + 0.5379737f, -0.3255228f, -0.5639279f, 0.0786276f, 0.6703192f, 0.1557026f, -0.2753083f, 1.1463971f, + -0.9372965f, 0.5657740f, 0.0041413f, 0.0870248f, 0.0101520f, -0.8214461f, 0.1212932f, 1.5648646f}, + {-0.0969819f, 0.0137566f, 1.3515147f, -0.0155047f, -0.1416170f, -0.1636726f, 0.5184190f, 0.4732984f, + 0.6815788f, -1.0522166f, -0.4486531f, -0.0516016f, 0.0201894f, -0.0849667f, -0.0861271f, -1.2027841f, + 1.2458711f, -0.7061657f, 1.0381308f, -0.3450044f, -0.1300479f, -0.0828402f, 0.6859242f, -1.0575374f, + 0.6947553f, -0.0922188f, 0.0199132f, 0.8038982f, -0.1734094f, -0.1057449f, 1.6305015f, -0.0688597f}, + {-1.8151448f, 0.1024327f, 1.7063105f, 0.1130912f, -0.1081472f, -0.2904744f, -1.3465070f, -1.0455177f, + -0.4581082f, -3.2220871f, 0.5221398f, -5.1637673f, 0.0811146f, -0.1326323f, -0.0379338f, -3.0439703f, + -2.4246936f, -0.3670847f, -3.1256330f, -1.6595014f, -3.4715190f, -0.1526113f, -1.0420206f, 0.9536474f, + -3.2932863f, 1.6048199f, 0.0025162f, -3.6049840f, 0.0604250f, -2.2404826f, 1.8406851f, -3.1381185f}, + 
{1.2985691f, -1.1044264f, 0.9062797f, -0.0788333f, 0.2694912f, 0.0032800f, -0.0574267f, 0.9734111f, + 1.1532565f, 2.6786125f, -3.8574269f, -2.2871449f, -0.1261243f, 1.0545347f, -0.1454154f, -0.5609738f, + 1.8385800f, -0.8035598f, -1.7668265f, 5.1665063f, 0.7966110f, 0.0940206f, -2.3943975f, 2.3344002f, + 1.0342182f, 0.4806454f, -0.3880928f, 0.6998246f, 1.4011886f, -1.7313483f, 4.9702630f, -6.0058608f}, + {1.0300356f, 0.0616315f, -0.1113776f, -0.1694220f, 0.7159944f, 0.0626456f, 2.0994680f, 0.3452290f, + -3.0487001f, 0.0654031f, -1.1510723f, 0.5370992f, -0.0290704f, -0.0300795f, 0.0751569f, -0.2345951f, + -0.3472281f, 0.4424143f, 1.2444530f, -0.2114656f, 0.7865694f, -0.0709381f, -0.1839961f, -0.0529834f, + 0.5867608f, -3.8793530f, -0.0814745f, -0.6368676f, 0.0361213f, -0.5549288f, 0.5661780f, 1.8374584f}, + {0.3345098f, 0.0068199f, -0.4205509f, -0.1088801f, -0.1043202f, -0.0040804f, 0.3400922f, 0.2673528f, + -0.6050695f, 0.4443954f, -0.4319905f, -0.6044132f, -0.0260679f, 0.0137036f, 0.0765494f, -0.0095099f, + 0.5880439f, -0.0083854f, -0.2407522f, 0.1942379f, 0.6554548f, -0.1322891f, -0.8298992f, 0.7909554f, + 1.0528831f, 0.1970959f, 0.0754069f, -0.0947960f, -0.0279494f, -0.5888316f, 0.8919419f, 0.4828835f}, + {0.3995822f, -0.2139665f, 0.3982936f, -0.1285759f, -0.3445527f, -0.1167238f, -0.1263519f, 0.8393803f, + -0.7758383f, 0.0719291f, -0.0134762f, 0.1715237f, 0.0796666f, 0.1023507f, -0.1172728f, -1.2364722f, + 1.2592632f, -0.3168479f, 0.7487004f, -1.5170647f, -0.2235429f, -0.1620898f, 1.4064828f, -1.0821995f, + 0.0740103f, -1.0412805f, -0.0621277f, 0.2439800f, 0.2684972f, -1.1661061f, 0.7859434f, -0.6170313f}, + {2.1615884f, 0.1431713f, 0.0642652f, -0.0522325f, -0.2658786f, -0.0245810f, -1.6857448f, -0.6685011f, + -0.6978170f, -0.8716729f, 0.3129902f, -2.5870812f, -0.2855283f, -0.3205920f, -0.0084069f, 1.3182145f, + -0.6923816f, -0.3730274f, -2.3638811f, -1.1128502f, -2.4709859f, 0.1349022f, -0.3574466f, -0.6597407f, + -4.1122031f, 0.2240651f, 0.1806145f, 
-1.6836300f, -0.0766231f, -3.2611966f, 0.0091456f, -0.0997367f}, + {5.2476101f, -0.1966512f, 4.8935304f, -0.1551689f, 1.6919724f, -0.8324367f, 14.3318472f, -0.3503132f, + 10.3614969f, -9.1522884f, -0.2543063f, -1.8476851f, 16.7961140f, 9.9541416f, -0.0434563f, -9.6973553f, + -5.0469398f, 6.1688442f, 7.6429725f, -7.3149266f, 1.2345183f, 0.1412155f, 0.7114770f, -1.6378664f, + 5.1548996f, 0.3686100f, -45.3027611f, 3.0492647f, -37.3445892f, 2.7421410f, -2.7958770f, -25.2034016f}, + {1.4597454f, -1.0561740f, 0.9751291f, 0.0446527f, 0.3691662f, 0.1006782f, 0.1418435f, 0.8871480f, + 1.1603093f, 2.8034730f, -4.0856910f, -1.9786842f, -0.2206208f, 0.9539357f, 0.0868183f, -0.6811873f, + 1.9642411f, -0.8065316f, -2.0244894f, 5.2936082f, 0.6120632f, -0.1194160f, -2.3925939f, 2.5555069f, + 1.0149733f, 0.4607603f, -0.2197217f, 0.5703423f, 1.4049014f, -1.5900208f, 5.1645074f, -6.0569463f}, + {0.9000676f, -0.0028781f, -0.1967366f, 0.1039593f, 0.7993248f, 0.0655172f, 2.2296758f, 0.4391927f, + -3.0292840f, 0.0334536f, -1.1728534f, 0.3479103f, -0.1190938f, 0.0410203f, 0.1146637f, -0.2958017f, + -0.3240463f, 0.4361866f, 1.0564958f, -0.1989332f, 0.5194008f, -0.0628912f, -0.1733121f, -0.1255383f, + 0.5990249f, -3.7692382f, 0.0995128f, -0.7101220f, -0.0785123f, -0.3514554f, 0.6662078f, 2.0991604f}, + {0.1781942f, -0.1873588f, -0.4653996f, -0.0153059f, -0.1399561f, -0.0498718f, 0.4552556f, 0.2300792f, + -0.7682312f, 0.4342302f, -0.3787803f, -0.6089386f, -0.1049337f, 0.0395331f, 0.0220332f, 0.0114750f, + 0.4672548f, 0.1284784f, -0.2472819f, 0.2892784f, 0.4788667f, 0.0472555f, -0.6593549f, 0.6508777f, + 0.9286987f, 0.3043948f, -0.0635985f, 0.0814399f, -0.1168853f, -0.6688027f, 0.8876534f, 0.4865684f}, + {0.4024099f, 0.0480259f, 0.4588822f, -0.1793082f, -0.2151573f, -0.1871128f, -0.1502780f, 1.1011307f, + -0.9467706f, 0.2632496f, -0.1257263f, -0.0241331f, 0.2280627f, 0.0878608f, -0.1334262f, -1.1642927f, + 1.0943586f, -0.4799654f, 0.5981907f, -1.5051398f, -0.4235946f, 0.0012827f, 1.2342577f, 
-0.8281875f, + 0.2776567f, -1.0362227f, 0.0408372f, 0.1540821f, 0.1777556f, -1.2684357f, 0.8836584f, -0.4001710f}, + {2.1558056f, 0.2082023f, 0.0863442f, 0.0364868f, -0.3985825f, 0.0307202f, -1.8889453f, -0.5614714f, + -0.7311882f, -0.8075573f, 0.4895108f, -2.7770483f, -0.3121874f, -0.1671291f, -0.1281284f, 1.3212786f, + -0.5310181f, -0.1974759f, -2.6240873f, -0.8320529f, -2.3875966f, -0.0286360f, -0.6263188f, -0.6553424f, + -4.1658955f, -0.0601300f, 0.0946256f, -1.6795633f, -0.1251303f, -3.0974686f, 0.2412274f, -0.0687501f}, + {2.0523887f, -0.6387668f, 2.0633900f, -0.0550964f, 0.5181718f, -0.4202190f, 1.8569367f, 0.8295385f, + 0.8555872f, 2.4727983f, -0.2072828f, -1.9006120f, 0.5379534f, 0.4463673f, 0.1468820f, 0.4918649f, + -3.4016700f, 0.2884440f, -1.9418719f, 4.5157170f, -0.5160927f, -0.0199372f, 3.1353824f, -0.9863126f, + -1.5135859f, 0.7576568f, 0.6715558f, 2.7409093f, 0.9291748f, -0.3247162f, 1.8204515f, -8.9181070f}, + {-0.1428107f, -0.0829889f, 0.4213613f, 0.0225415f, 1.2238166f, 0.0477106f, 0.3031853f, -0.7466553f, + 2.0663500f, 0.7588379f, 0.3689216f, -0.2003786f, 0.1242338f, 0.1693589f, -0.0351716f, -0.0186597f, + -0.0189417f, 0.5468715f, -0.2862698f, -0.1311738f, 3.0747476f, -0.0310747f, 0.0943165f, 0.3139819f, + 0.6274695f, -1.8314874f, 0.0147495f, 0.3554756f, 0.3829916f, 0.4891713f, 0.1328600f, 1.0535098f}, + {0.0534900f, 0.1787969f, -0.0571320f, -0.0685673f, 0.1968977f, 0.0374476f, 0.7876674f, 0.0828491f, + 0.6444036f, -0.2203166f, -0.2383427f, 0.5397566f, 0.0106769f, -0.1230072f, -0.0135021f, -0.5691944f, + -1.5040319f, 0.0406933f, -0.0025478f, 0.9251419f, -1.7180276f, -0.1112956f, 1.4840862f, 0.0407115f, + -0.0100329f, 0.0583593f, -0.0110524f, 0.7431355f, -0.0971857f, -0.5501527f, -0.6371027f, -0.1935233f}, + {-0.6455778f, 0.2317368f, 0.9285696f, -0.1415854f, 0.0822560f, 0.2488030f, -2.6992166f, 0.0884904f, + 0.6735302f, -0.1467820f, 0.5641044f, 0.6436581f, 0.0818401f, -0.0336634f, -0.0729000f, -0.1206900f, + -2.5739892f, 0.5776953f, 0.9531668f, 
-1.2362405f, -0.0615577f, -0.0143544f, -2.7525210f, 1.3738545f, + 0.2751348f, -1.7463943f, -0.0020144f, 2.4814103f, 0.1716725f, -0.7055540f, -0.3474010f, 0.4482578f}, + {-0.2526205f, -0.7463821f, -3.6076138f, -0.1511098f, 0.1216256f, 0.0888247f, -1.0190924f, -1.3260181f, + -0.0443211f, -4.8911066f, -3.4385188f, -6.0057454f, 0.3340450f, 0.2997236f, -0.0907855f, 0.7500492f, + -0.4007562f, 1.9382039f, 0.5687234f, 2.6511824f, 4.7703862f, 0.0006749f, -0.0201394f, -3.5885489f, + -4.1518898f, 0.0807014f, -0.0584071f, -0.8100027f, 0.7697087f, -0.8038046f, -1.2945876f, -4.0110312f}, + {0.4337017f, -1.1532011f, 2.0740633f, 0.0271806f, 0.6654227f, 0.1012998f, -4.0791736f, 1.2631345f, + 1.9511020f, 2.3272331f, 1.2707534f, 1.6306664f, 0.4936035f, 0.8285242f, 0.0807625f, 3.8652387f, + 0.0281145f, 1.6877037f, 1.2557380f, -0.3036775f, 0.5604967f, 0.1551418f, -0.9599600f, -6.3067718f, + -0.6352320f, 0.8058553f, 0.3657880f, -2.0491202f, -0.3926269f, 2.5650854f, 1.3697821f, -8.3070078f}, + {5.1334143f, -0.0351738f, -0.4774780f, -0.0679726f, 1.4569254f, 0.0580191f, -0.3649136f, -0.2298838f, + -3.3826666f, -0.7392708f, -0.6036060f, -0.2612940f, -0.1877640f, -0.1145124f, -0.0042578f, -0.0311193f, + -0.0320479f, 0.5270581f, -0.4324475f, 0.2681437f, 4.7813129f, -0.0222701f, -0.0525629f, -0.2861001f, + -0.1251072f, 3.9112861f, 0.0045046f, -0.0426071f, -0.3299106f, -0.0686970f, -0.1602017f, -0.0070103f}, + {-0.6633690f, 0.0103367f, 0.5998458f, 0.1256577f, -0.0359184f, -0.0176820f, -0.6458368f, -0.0370536f, + 0.3542259f, 0.1394724f, 0.8255956f, 0.2501569f, 0.0320156f, -0.0256806f, 0.0277949f, 0.0036392f, + 0.2825173f, 0.1400358f, 1.0011463f, -0.6792242f, 0.0672508f, 0.0728705f, -0.1089695f, -1.0414587f, + -0.4135485f, 0.4293025f, -0.0041241f, -0.9564193f, 0.0314900f, 0.8658463f, -0.7734696f, -0.7610567f}, + {-0.0200122f, -0.0749178f, -1.5026549f, -0.0387432f, -0.0713735f, 0.1214790f, 1.8730290f, -0.0552839f, + -1.6867150f, 0.2282097f, 0.7161849f, -0.1018546f, -0.1092003f, 0.0365504f, 
-0.1326883f, 1.2310545f, + 0.1800210f, 0.7024739f, -2.9606545f, 1.2275347f, -0.2050014f, 0.0940569f, 0.4761694f, 0.8812068f, + -0.0083424f, -1.5406264f, 0.0061815f, -2.7606382f, 0.0248556f, 1.1086880f, -1.3608936f, 1.0795454f}, + {0.9734020f, 0.3905411f, -3.7008634f, 0.0013557f, 0.1649124f, 0.9935362f, 1.3489184f, 0.9505764f, + 0.7966231f, -0.1627246f, -2.5754328f, 1.4892205f, 0.8586300f, 0.6974363f, 0.1320204f, -0.7840260f, + 0.3121157f, 0.0966901f, 2.7447381f, 1.8256680f, 0.7229405f, -0.1723188f, 0.9145948f, -2.1376033f, + 0.5259342f, 0.0731194f, -0.2908303f, -0.2603913f, -0.2326528f, 3.6684167f, -0.2883157f, -2.8546307f}, + {-4.8917460f, 6.7944999f, -0.2255474f, 0.1051999f, 3.9000113f, 2.0624907f, 5.3019547f, 10.0209141f, + 1.1268179f, 2.2669628f, -6.5002980f, 1.8408583f, 5.3039579f, 2.2055962f, 0.1055369f, 1.7230233f, + 6.9605255f, 7.7025104f, 2.9880707f, -0.9274251f, -0.2287160f, -0.0206735f, 0.6885675f, 2.8179996f, + -7.1129837f, -1.3772345f, 3.8655453f, -5.9388318f, -0.0469947f, 7.2763596f, -6.3536129f, -17.0069847f}, + {1.8787041f, -0.9953383f, -1.4839923f, 0.1308209f, 0.3657510f, 0.3106483f, -1.4158971f, -6.7449651f, + 0.6553892f, -4.5046172f, -3.5489719f, 3.5363002f, 0.5454772f, 2.3521471f, 0.1612140f, -0.9744226f, + 0.6546553f, -2.7179255f, -1.7758157f, 0.3089439f, 1.7462813f, 0.1654593f, -0.2440207f, 3.9501827f, + 1.3750844f, 0.0596805f, -0.1977254f, 0.0264880f, 2.6396444f, 1.0816911f, 3.6413448f, -6.0299959f}, + {-4.1295738f, 0.1044480f, 0.2131937f, 0.0420826f, 0.5292229f, 0.0090477f, -0.0973486f, 0.9596778f, + 2.9579651f, -0.6364226f, -1.7556342f, 0.1539868f, -0.1273174f, -0.1348504f, 0.1257833f, -1.4168571f, + -1.0960362f, 0.0482449f, -1.4395387f, -0.2524115f, -2.9162085f, -0.0451428f, -0.4021681f, -0.5756381f, + 0.0515293f, -3.1996479f, -0.0007676f, -1.3878343f, -0.2864279f, -0.9579773f, -1.0999249f, 1.6500067f}, + {-2.4806111f, -6.8115449f, 3.2805641f, 0.1187415f, -0.9950783f, 6.2553434f, -1.6450261f, -6.1463733f, + 2.7507148f, 4.2995782f, 
0.0461297f, -0.5417359f, 2.4306326f, -7.3530145f, 0.0698273f, -0.9394333f, + -1.3595498f, -7.5141478f, -1.4911395f, 3.2300410f, 0.1203540f, 0.0314884f, -2.0116949f, -0.8167119f, + 2.4133310f, 0.1920709f, 1.0619365f, 0.2459123f, 6.9166069f, -2.6384118f, 3.6829739f, -7.2385545f}, + {0.9408096f, 14.9067144f, 1.7709646f, 0.1105646f, -0.5600107f, -15.3188124f, -12.3718462f, -1.8893757f, + 13.6364670f, -5.7327847f, -14.1805468f, 1.0581509f, -14.2186184f, 14.8948650f, 0.0190344f, 5.4395180f, + 6.7243400f, 9.8468456f, 4.5144215f, -1.4551491f, 1.1032411f, -0.0317988f, 2.3398454f, -3.1671596f, + -7.7541409f, 1.1255593f, 6.7340465f, -4.4448423f, -9.1472626f, -3.1959128f, 4.4181323f, -2.7904994f}, + {-2.1621978f, -4.7202382f, 1.7378219f, 0.1417439f, -0.5000908f, 5.4468708f, 1.4260571f, -6.6136570f, + 1.5713804f, 3.4479704f, 2.7354901f, -0.7388076f, 5.4666147f, -3.8697338f, -0.1368596f, -2.7903373f, + -1.2043713f, -4.9554005f, 0.3324645f, 1.6767365f, 0.1156244f, -0.0326964f, -2.0945346f, -0.4590589f, + 3.0942657f, 0.0015020f, -6.2626700f, -0.3969755f, 0.7717427f, -1.9667094f, 2.9664171f, -11.9477053f}, + }; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_2[32] = { + 9.8383608f, 3.6922295f, 3.5774977f, -4.4619012f, 6.5087032f, -0.9540017f, -0.5059246f, 0.0706402f, + 14.3396597f, -0.2771132f, -4.8409863f, -8.3581600f, -3.5078344f, 4.3287506f, -5.7808843f, 3.9264839f, + -2.1697845f, -0.0040514f, -0.2095029f, -6.8678174f, 1.7911285f, -0.4510343f, 1.2410443f, -4.5678806f, + -0.5693849f, 2.3320096f, 4.4606552f, -6.3771009f, -4.3149071f, -0.1905672f, -3.5726390f, -1.0744030f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_2[32][32] = { + {-0.0155548f, 0.0243339f, 0.0037967f, -0.2771824f, 0.0111955f, -0.0115980f, 0.0079653f, -2.9803498f, + -0.0061037f, -0.0956634f, 0.0332446f, 0.0179244f, -0.0080377f, -9.0180779f, 0.1720033f, 0.0350694f, + -0.0146588f, -0.2135506f, -0.3158041f, 1.3697664f, 0.0119146f, 0.0119120f, -0.0986927f, 0.0297492f, + 0.0355827f, -0.1196868f, -0.0745119f, 
0.0281862f, -0.0422190f, -0.3069138f, -0.0477367f, -0.0550450f}, + {-1.7374619f, 1.4822800f, -2.1885235f, 1.8354234f, -0.5380136f, 1.6621803f, 0.6251035f, 0.1008954f, + -0.8387129f, -0.2063313f, 1.0661691f, -0.9799694f, -5.1710258f, -3.2260630f, -1.5073707f, -1.0792168f, + 1.8569958f, -0.2289213f, 0.0563821f, -1.6398847f, -4.1649504f, -2.7527378f, -0.0134577f, 3.0424533f, + 0.0364320f, 0.6762254f, -3.1551330f, 2.4888904f, 1.4757305f, -0.3141717f, -2.0126467f, -0.1675602f}, + {-0.9571826f, 0.0914152f, 0.0404339f, 0.2927902f, 0.2933607f, 0.0619171f, 0.0772318f, -1.3796169f, + -0.8194544f, -0.2179988f, -1.1241078f, -0.1443964f, 0.0559355f, -1.2914546f, -0.3445117f, 0.2031156f, + 0.0273864f, -0.0193422f, -0.2136522f, 0.0429592f, 0.0212854f, 0.0414394f, -1.1734651f, 0.0582848f, + 0.0136039f, -0.1892604f, 0.0764908f, -0.0130132f, -0.1272559f, -0.0818855f, -0.0408583f, -0.1563294f}, + {-0.0213695f, 0.0596942f, -0.0641309f, -0.0146449f, 0.0416586f, -0.0378931f, 0.1234860f, 0.1622967f, + 0.0794091f, -0.0639933f, -0.1030663f, 0.0579078f, 0.1050275f, -0.0136866f, 0.0149978f, 0.0876813f, + 0.0693554f, 0.1612417f, -0.0595916f, -0.1008234f, -0.0579058f, 0.0915138f, 0.1321436f, -0.1484535f, + -0.0920316f, -0.0024532f, -0.1045300f, 0.0924260f, 0.0277524f, -0.0287276f, -0.1271127f, 0.1164243f}, + {0.0713067f, 0.0198056f, -0.3023696f, -0.0025908f, -0.0085885f, -1.1157553f, 0.0236462f, -0.0704844f, + -0.0189257f, -0.0997382f, 0.3379845f, -0.1229390f, -0.0616165f, -0.8968034f, 0.0401445f, -0.1144476f, + -0.0532077f, 0.0604580f, 0.0609454f, -0.1613472f, 0.0103525f, -0.1653874f, 0.0205189f, 0.0758978f, + -0.1514593f, 0.0151441f, 0.2043469f, 0.0349607f, -0.1361278f, -0.1255922f, 0.0631648f, 0.3570991f}, + {0.3371337f, -3.7541580f, 2.2215877f, -0.3390516f, 0.1912718f, -4.1861577f, -1.2264019f, 2.8179801f, + 0.0667294f, -0.0093539f, 2.3029909f, 3.1814916f, 3.9780347f, 0.2310601f, 0.3986159f, -0.8544636f, + 0.4139664f, -0.1876569f, -0.2448732f, -2.8053334f, 4.0488625f, 2.1094146f, 
-6.7310257f, -4.9950023f, + -0.8315823f, 0.0555959f, 2.4573720f, -3.7234364f, -4.2910552f, -0.2995245f, -3.2605181f, 2.3620574f}, + {-1.5522735f, -0.1866350f, -0.0067679f, 0.3196557f, 1.4052233f, 2.8143549f, -0.9992948f, -0.5309914f, + -25.8852596f, -0.1218249f, 0.6625420f, 0.3007106f, -0.2767264f, -0.1847300f, -0.5313534f, -0.0383462f, + -0.1987552f, 0.0581405f, -0.3376078f, 1.2621028f, 0.0818709f, -0.1401216f, -0.4550788f, -0.1592657f, + 0.0597123f, 0.1344101f, -0.1005317f, -0.1538406f, 2.9142656f, -0.0806051f, -0.4267367f, -31.9512234f}, + {0.6859627f, 0.1212986f, 0.1291616f, 0.0459838f, -0.0899920f, 0.0287645f, 0.1987007f, -2.7079368f, + -0.2628384f, -0.1402464f, -0.6302179f, -0.2923960f, -0.1106663f, 0.8256195f, -2.8054097f, -0.0296494f, + -0.5632019f, -0.1335654f, -0.1558440f, -6.8611612f, 0.0203786f, 0.0046566f, -0.4401442f, -0.0471430f, + 0.4535986f, -0.8657981f, 0.0684740f, 0.0518814f, -0.0123748f, -0.2270164f, 0.0922878f, -0.3863277f}, + {0.0127175f, 2.3346109f, -0.4390767f, -0.4657893f, 0.1659466f, -0.1132782f, -0.4928388f, 0.7652873f, + 1.1510741f, -0.0879600f, 0.2721785f, -0.1878961f, -0.3477249f, -0.8473209f, -0.8931856f, -0.4328294f, + -11.9181929f, -0.0282545f, -0.0217915f, 1.6676594f, -0.2122232f, -0.6190930f, 1.9053432f, -0.7592348f, + -1.0739189f, -0.7170524f, 0.3864411f, -0.8849231f, 0.1393488f, 0.0738489f, 0.4460345f, 1.9020857f}, + {0.4453296f, -0.0767821f, 0.1638939f, 1.6997167f, -0.1098599f, -0.0551604f, 0.0040561f, -13.5290670f, + -0.1285677f, -0.0590394f, 0.6499141f, -0.7617344f, 0.0453151f, 0.3104213f, -1.0711143f, 0.1361838f, + -0.4365610f, -0.1300649f, 0.2013344f, -0.5308123f, 0.1451896f, 0.1030715f, -0.6487910f, -0.3136590f, + -0.0280079f, 0.5394178f, 0.1318262f, -0.0159292f, 0.0636870f, -0.3224248f, -0.1868187f, -0.2468304f}, + {-0.0333494f, -0.0834255f, -0.1221875f, 0.6861304f, 0.0521738f, -0.0416543f, -0.4437352f, -19.3246250f, + -0.1520821f, 0.0528602f, -0.6375434f, -0.5803806f, -0.0958465f, -2.0058544f, -0.8282642f, 0.0259000f, + 
0.4846996f, 0.1211179f, 0.0356884f, 1.0009497f, 0.0635682f, -0.0314105f, -0.0011147f, 0.0131714f, + -0.3410152f, 0.2798154f, 0.0961889f, 0.1266228f, -0.0934717f, -0.0904307f, 0.1355542f, 0.5722573f}, + {0.2146454f, 0.2143834f, 0.1290650f, -0.9063646f, 0.2100945f, 0.1331054f, -0.2620614f, -0.1264993f, + 0.1313979f, 0.0455465f, -0.8395286f, -0.4967833f, -0.0538581f, 0.9155380f, 0.6627046f, 0.1691243f, + 0.9887002f, -0.1597013f, -0.1236713f, -1.9041336f, 0.0427585f, 0.0849747f, -5.2559652f, -0.3133100f, + 0.0141170f, -0.1635530f, 0.4938746f, 0.0162943f, 0.2107756f, -0.3413893f, -0.0657575f, 1.0542560f}, + {-2.8868380f, -2.0837426f, -1.0611480f, -0.6143807f, -0.6398501f, -2.8018746f, 0.5166737f, -1.0814301f, + -1.9272422f, -0.1017482f, -0.4651161f, -1.4021232f, 1.8854499f, 0.1815407f, 0.5965426f, -2.3344259f, + -0.0690846f, -0.1678239f, -0.4219488f, 0.6215640f, 1.0270095f, -0.3473049f, -0.3926674f, -0.7942593f, + 1.1305071f, -1.4621233f, -0.8051161f, -0.7698632f, -2.6038630f, -0.3090037f, -1.6365144f, -1.0179478f}, + {0.0046026f, 1.1319581f, -2.6405678f, -2.0353596f, -2.1687336f, 0.3364883f, 2.1122196f, 0.2584647f, + -2.4344857f, -0.0378498f, 0.6158544f, -0.6060749f, -4.9598379f, 0.1570698f, 2.2436838f, -2.6198347f, + -2.0935996f, -0.1845744f, -0.0716080f, -1.9338604f, -4.1995640f, -3.6706774f, -1.6762524f, 3.9646862f, + -0.9677961f, 1.8319578f, -3.1916575f, 3.7312632f, 0.0820446f, -0.0497568f, -0.0898171f, -0.2499462f}, + {-0.0780375f, -0.0286571f, 0.1007227f, 0.0012229f, -0.0531285f, 0.0840718f, 0.1013894f, 0.1312424f, + -0.0673772f, 0.1603183f, 0.0074385f, -0.0718321f, -0.1549873f, 0.1616689f, 0.0405887f, -0.1558588f, + 0.0740745f, 0.1696893f, -0.0064026f, -0.1656420f, -0.1186674f, -0.1262667f, -0.0784757f, -0.1280154f, + 0.0909976f, 0.0853046f, -0.1075811f, 0.1310615f, 0.0610194f, 0.0647223f, 0.1360559f, 0.0440074f}, + {-0.2106480f, 0.0087131f, 0.1119385f, -1.0611318f, 0.5250220f, 0.0525479f, -0.2733742f, -1.0799565f, + -0.5601607f, -0.0651806f, -1.9793440f, 
-0.3373334f, -0.1550518f, 0.8932216f, 0.7264332f, -0.0450735f, + 1.2373760f, -0.1236272f, 0.0680048f, -3.0446634f, -0.1533586f, -0.0127355f, -0.3326311f, -0.0225603f, + -0.2265739f, -2.3752897f, -0.3771705f, -0.0728938f, 0.1741305f, 0.1111639f, 0.4131119f, 0.2239323f}, + {-2.5691276f, -1.4011253f, -2.0640867f, -3.7236946f, 1.5542637f, -0.9456654f, -1.7575809f, 3.6794879f, + -0.4439790f, -0.1009826f, 3.6702275f, -0.1935008f, -0.4423219f, -0.3825364f, -0.4784791f, 0.5927492f, + -2.3482494f, 0.0801714f, -0.1567418f, -1.7934613f, -0.1706410f, -0.6326947f, 0.6260155f, 0.3631033f, + -0.9325932f, 1.9647995f, -1.3409088f, 1.3501998f, 0.0367797f, -0.1744210f, 1.8690013f, -1.0737898f}, + {-0.5934777f, 0.6232591f, -0.3391055f, 0.2640936f, -0.2824444f, 0.4815128f, 0.6625078f, -0.1103976f, + 0.9555223f, -0.0624896f, -0.6778919f, 0.1181502f, -0.5425385f, 0.7297349f, -1.7261271f, -0.2917557f, + 1.1873137f, -0.2725933f, 0.0975242f, 1.7756181f, -0.5735835f, -0.4453230f, 0.9800369f, 0.9344145f, + -1.8692539f, 0.0120440f, -0.7315661f, 0.6250805f, 0.3839143f, -0.0376306f, 0.3816243f, 0.6059195f}, + {0.5522162f, -1.8043815f, -10.9379101f, 0.5719097f, -0.2246755f, -1.4856353f, 0.4877502f, 0.7163438f, + -11.8135147f, -0.0180790f, -0.9928634f, 0.1107815f, -0.0005064f, -0.3824990f, -0.7453306f, -1.9909632f, + -7.4362645f, -0.0245507f, -0.1815712f, -3.5507584f, -0.0075889f, -11.0296011f, -1.1292133f, -0.0710276f, + 0.5675677f, 0.2017778f, -0.0684891f, -0.0367653f, -1.6674192f, 0.0281711f, -0.8356591f, -0.0447807f}, + {0.2537312f, -3.0178010f, -0.3493635f, 1.8573236f, 0.4017631f, 0.9912633f, -0.8625028f, -0.7783228f, + -1.7815375f, -0.1204695f, 1.8551122f, 0.3344182f, -0.2828701f, -1.3226960f, -1.4470471f, 0.2895959f, + 0.6780876f, -0.2010069f, 0.0425280f, -2.1786852f, -0.1274053f, -0.2549899f, -0.2233993f, -0.1561645f, + -0.4640818f, 0.6375850f, 0.7733670f, -0.2388286f, 1.0447853f, -0.1503223f, 0.3823584f, -13.8176088f}, + {0.2575197f, -2.2127593f, -0.0389457f, -0.0215759f, 0.1659477f, 
-0.0097748f, -0.1935415f, -0.9091369f, + -0.1453371f, 0.0442428f, -0.1206519f, 0.1435609f, -0.0186047f, -5.0154042f, 0.0538177f, 0.0403250f, + 0.0240955f, 0.0331080f, 0.0517951f, 0.7422639f, 0.0069818f, 0.0248351f, -0.2205741f, -0.0082387f, + 0.2043269f, 0.0459435f, 0.0876343f, 0.0140607f, 0.1056308f, 0.0062555f, 0.0184278f, -0.5539715f}, + {-0.0398742f, 0.1075264f, 0.1725024f, -0.0755192f, -0.0360048f, 0.1325573f, 0.0903103f, -0.0882263f, + 0.1207692f, 0.0032722f, 0.0048489f, -0.1257241f, 0.1450990f, -0.0713558f, 0.1116815f, 0.1107689f, + -0.1447252f, 0.1581838f, -0.0160124f, -0.0425587f, 0.1411217f, 0.0865060f, -0.0643460f, -0.0431262f, + -0.1452804f, -0.0195101f, 0.1234572f, 0.0520887f, 0.1117576f, -0.0751791f, 0.1511539f, 0.1224861f}, + {0.7728126f, 2.3075340f, -0.0385258f, -3.1270287f, 0.9414487f, 3.5251477f, -0.8043440f, 0.7212446f, + -7.6850162f, -0.1609414f, -3.7687578f, -1.0751100f, -0.2052089f, 5.0728245f, 2.2835267f, 0.5930225f, + 0.1303335f, -0.1428799f, -0.3715075f, 0.5136011f, -0.4755619f, -0.2192461f, -3.8696294f, -0.0062392f, + -1.3774812f, -0.0034140f, -1.5944362f, 0.9773729f, 3.2859125f, -0.1616932f, -1.2785367f, -13.5732412f}, + {0.5535743f, 0.1461481f, -0.2218016f, -0.2971808f, -0.2169309f, 0.1564545f, -0.0390397f, 1.1558976f, + -0.0119933f, -0.0774637f, 1.1907971f, -0.5127968f, -0.0066028f, -1.6794037f, -0.3650940f, 0.2555613f, + -0.9488379f, 0.0449603f, -0.1620417f, 0.1583214f, 0.0000908f, 0.0152763f, -1.0660053f, -0.0139402f, + -1.7440189f, 0.2515209f, 0.3333162f, 0.1904725f, 0.1116094f, -0.2287960f, -0.0007165f, -1.7047704f}, + {-5.9897852f, -0.1316296f, -0.0218074f, -0.4602887f, 0.3288545f, -0.0882939f, -0.5929499f, 0.4294790f, + -0.0383545f, 0.0556869f, 0.1975944f, 0.1341491f, 0.0629570f, -2.2742157f, 0.0175826f, -0.1439869f, + -24.8701649f, -0.1582915f, -0.2460304f, -3.9643264f, 0.0863483f, 0.0180861f, -0.2210452f, -0.0868723f, + -0.4175525f, -0.8231756f, 0.0247534f, -0.1473545f, -0.0021330f, -0.0410253f, -1.1944869f, -1.1523768f}, + 
{0.1031547f, -3.3402514f, -4.3636522f, -0.1534714f, -0.0622189f, 0.0374694f, -0.0870097f, -4.1865788f, + -0.0555377f, 0.0252329f, 0.1339467f, 0.0461691f, -0.0503090f, 0.0289890f, -0.0095674f, -0.3289992f, + -0.0279080f, 0.0274977f, -0.0903500f, 0.5610157f, -0.0478177f, 0.4346960f, 0.4822784f, -0.1058945f, + -0.2026870f, -0.0560638f, 0.0910069f, -0.0818529f, 0.0819198f, -0.0292193f, 0.3040628f, -0.1275230f}, + {-5.8789845f, -17.1114635f, -4.6755161f, 0.1016624f, -0.8685016f, -0.3898779f, -2.3363957f, 0.1413794f, + -2.4254086f, -0.2171030f, -0.0901150f, 0.7058705f, 0.4166250f, -0.0231085f, -0.1789686f, -9.4244318f, + -0.6418229f, -0.0857969f, 0.1683681f, -0.0310597f, -0.0247807f, -5.3748040f, -7.4730940f, 0.1019564f, + -1.2126822f, -0.3726285f, -1.0287101f, 0.1803891f, -0.2227769f, -0.0791530f, -0.0159770f, -1.4883354f}, + {-17.9394970f, -0.5228514f, -11.3547935f, -0.0672671f, -2.0371394f, -0.9076943f, 2.4331825f, -6.9409127f, + 0.8286008f, 0.0208618f, -0.8009814f, 1.2268484f, 0.1943726f, -1.7297083f, -0.7668949f, -6.5505466f, + -0.6495168f, -0.0404727f, -0.1260914f, -3.5029383f, -0.0852898f, -2.9679556f, 1.6404767f, -0.0251449f, + 1.1460075f, -0.7877688f, -0.0586593f, -0.4741839f, -1.7420560f, 0.0295600f, -2.3574052f, 0.0974777f}, + {0.4443443f, 0.6384261f, 1.3317494f, -1.0085982f, 0.9508762f, 1.3168396f, -0.1862490f, -0.1801148f, + 1.1106120f, -0.0654911f, 0.1186706f, -0.7198273f, 0.5449172f, -0.5886080f, 0.7504217f, 1.8046317f, + -0.1294390f, -0.1939137f, -0.2383934f, 0.4131435f, 0.6910310f, 1.2821866f, -0.1088722f, -0.5660405f, + -0.1188610f, 0.0364403f, 0.3597929f, -0.6409024f, 1.2114668f, -0.0212278f, 0.8423592f, 0.4848156f}, + {-0.8772649f, -13.5265112f, -4.5540547f, -0.2856667f, 0.7604876f, -0.6829260f, -0.8320626f, 0.6541347f, + 0.4020181f, 0.0009324f, -10.9660740f, -0.3540186f, -0.2316812f, 0.3576394f, 0.0998953f, -1.5738430f, + 1.2089975f, 0.0706465f, -0.2538019f, 0.7016497f, -0.0282650f, -3.1291001f, -0.4375663f, -0.3979468f, + -0.1588882f, 0.3978875f, 
0.2038192f, -0.4281644f, -0.5787544f, -0.0922198f, 0.9595569f, 0.0212818f}, + {0.3392667f, 0.1170919f, -0.0705636f, -0.1025443f, -0.1192213f, -0.0495686f, 0.0284667f, -0.1226804f, + 0.0050191f, -0.0516545f, -1.0892097f, 0.0033689f, 0.0471462f, 1.4266804f, 0.0288870f, -0.0110408f, + -1.1283765f, -0.1299917f, -0.4318301f, -0.9854419f, -0.0190479f, -0.0269406f, 0.3697925f, -0.0757695f, + -0.3632923f, -0.1714077f, 0.0669245f, 0.0557428f, -0.1713906f, -0.4307863f, -0.1749060f, -2.1246362f}, + {0.8383662f, -3.8122442f, 0.1568939f, -2.2105119f, -0.7086993f, -0.4664145f, -0.3578597f, 0.5554636f, + 0.6965880f, -0.1506968f, 0.2646832f, 0.2874083f, 0.1901203f, -2.4997077f, -0.3519035f, -0.0518054f, + 1.0862818f, -0.2502540f, -0.3133347f, -0.7411230f, 0.1268138f, 0.1069811f, -0.8109779f, 0.0264679f, + 0.1604289f, -0.7534032f, -0.1419461f, 0.0688303f, -0.1570919f, -0.3055144f, -0.7415189f, 2.5547018f}, + }; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_4[1] = {1.4616280f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_4[32][1] = { + {0.0609813f}, {0.0685224f}, {0.1655236f}, {-0.0599842f}, {0.0669006f}, {-0.1817371f}, {-0.0539167f}, + {-0.0737955f}, {0.0654664f}, {0.0302955f}, {-0.0586768f}, {0.0717433f}, {0.1472274f}, {-0.0610073f}, + {-0.0601061f}, {0.2086218f}, {-0.0545418f}, {-0.0388369f}, {-0.0613536f}, {-0.1141072f}, {-0.2289097f}, + {-0.3354485f}, {0.0831025f}, {0.1333673f}, {0.0490410f}, {0.0484894f}, {0.0436755f}, {-0.1479877f}, + {0.1540713f}, {0.0021261f}, {-0.0845848f}, {-0.0564973f}, + }; +} // namespace T5DNN + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/PixelMap.h b/RecoTracker/LSTCore/src/alpaka/PixelMap.h new file mode 100644 index 0000000000000..d6cbdffebe096 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/PixelMap.h @@ -0,0 +1,39 @@ +#ifndef PixelMap_h +#define PixelMap_h + +#include +#include + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +namespace SDL { + // 
PixelMap is never allocated on the device. + // This is also not passed to any of the kernels, so we can combine the structs. + struct pixelMap { + uint16_t pixelModuleIndex; + + std::vector connectedPixelsIndex; + std::vector connectedPixelsSizes; + std::vector connectedPixelsIndexPos; + std::vector connectedPixelsSizesPos; + std::vector connectedPixelsIndexNeg; + std::vector connectedPixelsSizesNeg; + + int* pixelType; + + pixelMap(unsigned int sizef = size_superbins) + : pixelModuleIndex(0), + connectedPixelsIndex(sizef), + connectedPixelsSizes(sizef), + connectedPixelsIndexPos(sizef), + connectedPixelsSizesPos(sizef), + connectedPixelsIndexNeg(sizef), + connectedPixelsSizesNeg(sizef) {} + }; +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h new file mode 100644 index 0000000000000..bd048f9c819a2 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h @@ -0,0 +1,2789 @@ +#ifndef PixelTriplet_cuh +#define PixelTriplet_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "Triplet.h" +#include "Segment.h" +#include "MiniDoublet.h" +#include "Hit.h" +#include "Quintuplet.h" + +namespace SDL { + // One pixel segment, one outer tracker triplet! 
+ struct pixelTriplets { + unsigned int* pixelSegmentIndices; + unsigned int* tripletIndices; + unsigned int* nPixelTriplets; + unsigned int* totOccupancyPixelTriplets; + + float* pixelRadiusError; + float* rPhiChiSquared; + float* rPhiChiSquaredInwards; + float* rzChiSquared; + + FPX* pixelRadius; + FPX* tripletRadius; + FPX* pt; + FPX* eta; + FPX* phi; + FPX* eta_pix; + FPX* phi_pix; + FPX* score; + bool* isDup; + bool* partOfPT5; + + uint8_t* logicalLayers; + unsigned int* hitIndices; + uint16_t* lowerModuleIndices; + FPX* centerX; + FPX* centerY; + + template + void setData(TBuff& pixelTripletsBuffer) { + pixelSegmentIndices = alpaka::getPtrNative(pixelTripletsBuffer.pixelSegmentIndices_buf); + tripletIndices = alpaka::getPtrNative(pixelTripletsBuffer.tripletIndices_buf); + nPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.nPixelTriplets_buf); + totOccupancyPixelTriplets = alpaka::getPtrNative(pixelTripletsBuffer.totOccupancyPixelTriplets_buf); + pixelRadius = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadius_buf); + tripletRadius = alpaka::getPtrNative(pixelTripletsBuffer.tripletRadius_buf); + pt = alpaka::getPtrNative(pixelTripletsBuffer.pt_buf); + eta = alpaka::getPtrNative(pixelTripletsBuffer.eta_buf); + phi = alpaka::getPtrNative(pixelTripletsBuffer.phi_buf); + eta_pix = alpaka::getPtrNative(pixelTripletsBuffer.eta_pix_buf); + phi_pix = alpaka::getPtrNative(pixelTripletsBuffer.phi_pix_buf); + score = alpaka::getPtrNative(pixelTripletsBuffer.score_buf); + isDup = alpaka::getPtrNative(pixelTripletsBuffer.isDup_buf); + partOfPT5 = alpaka::getPtrNative(pixelTripletsBuffer.partOfPT5_buf); + logicalLayers = alpaka::getPtrNative(pixelTripletsBuffer.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(pixelTripletsBuffer.hitIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(pixelTripletsBuffer.lowerModuleIndices_buf); + centerX = alpaka::getPtrNative(pixelTripletsBuffer.centerX_buf); + centerY = 
alpaka::getPtrNative(pixelTripletsBuffer.centerY_buf); + pixelRadiusError = alpaka::getPtrNative(pixelTripletsBuffer.pixelRadiusError_buf); + rPhiChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquared_buf); + rPhiChiSquaredInwards = alpaka::getPtrNative(pixelTripletsBuffer.rPhiChiSquaredInwards_buf); + rzChiSquared = alpaka::getPtrNative(pixelTripletsBuffer.rzChiSquared_buf); + } + }; + + template + struct pixelTripletsBuffer : pixelTriplets { + Buf pixelSegmentIndices_buf; + Buf tripletIndices_buf; + Buf nPixelTriplets_buf; + Buf totOccupancyPixelTriplets_buf; + Buf pixelRadius_buf; + Buf tripletRadius_buf; + Buf pt_buf; + Buf eta_buf; + Buf phi_buf; + Buf eta_pix_buf; + Buf phi_pix_buf; + Buf score_buf; + Buf isDup_buf; + Buf partOfPT5_buf; + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf lowerModuleIndices_buf; + Buf centerX_buf; + Buf centerY_buf; + Buf pixelRadiusError_buf; + Buf rPhiChiSquared_buf; + Buf rPhiChiSquaredInwards_buf; + Buf rzChiSquared_buf; + + template + pixelTripletsBuffer(unsigned int maxPixelTriplets, TDevAcc const& devAccIn, TQueue& queue) + : pixelSegmentIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + tripletIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + nPixelTriplets_buf(allocBufWrapper(devAccIn, 1, queue)), + totOccupancyPixelTriplets_buf(allocBufWrapper(devAccIn, 1, queue)), + pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + tripletRadius_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + pt_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + eta_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + phi_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + eta_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + phi_pix_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + score_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + 
partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelTriplets * 5, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets * 10, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelTriplets * 5, queue)), + centerX_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + centerY_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + pixelRadiusError_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)), + rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelTriplets, queue)) { + alpaka::memset(queue, nPixelTriplets_buf, 0u); + alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0u); + alpaka::memset(queue, partOfPT5_buf, false); + alpaka::wait(queue); + } + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::triplets& tripletsInGPU, + struct SDL::pixelTriplets& pixelTripletsInGPU, + unsigned int pixelSegmentIndex, + unsigned int tripletIndex, + float pixelRadius, + float tripletRadius, + float centerX, + float centerY, + float rPhiChiSquared, + float rPhiChiSquaredInwards, + float rzChiSquared, + unsigned int pixelTripletIndex, + float pt, + float eta, + float phi, + float eta_pix, + float phi_pix, + float score) { + pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex] = pixelSegmentIndex; + pixelTripletsInGPU.tripletIndices[pixelTripletIndex] = tripletIndex; + pixelTripletsInGPU.pixelRadius[pixelTripletIndex] = __F2H(pixelRadius); + pixelTripletsInGPU.tripletRadius[pixelTripletIndex] = __F2H(tripletRadius); + pixelTripletsInGPU.pt[pixelTripletIndex] = __F2H(pt); + pixelTripletsInGPU.eta[pixelTripletIndex] = __F2H(eta); + pixelTripletsInGPU.phi[pixelTripletIndex] = 
__F2H(phi); + pixelTripletsInGPU.eta_pix[pixelTripletIndex] = __F2H(eta_pix); + pixelTripletsInGPU.phi_pix[pixelTripletIndex] = __F2H(phi_pix); + pixelTripletsInGPU.isDup[pixelTripletIndex] = false; + pixelTripletsInGPU.score[pixelTripletIndex] = __F2H(score); + + pixelTripletsInGPU.centerX[pixelTripletIndex] = __F2H(centerX); + pixelTripletsInGPU.centerY[pixelTripletIndex] = __F2H(centerY); + pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex] = 0; + pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex + 1] = 0; + pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex + 2] = tripletsInGPU.logicalLayers[tripletIndex * 3]; + pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex + 3] = tripletsInGPU.logicalLayers[tripletIndex * 3 + 1]; + pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex + 4] = tripletsInGPU.logicalLayers[tripletIndex * 3 + 2]; + + pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex] = + segmentsInGPU.innerLowerModuleIndices[pixelSegmentIndex]; + pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex + 1] = + segmentsInGPU.outerLowerModuleIndices[pixelSegmentIndex]; + pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex + 2] = + tripletsInGPU.lowerModuleIndices[3 * tripletIndex]; + pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex + 3] = + tripletsInGPU.lowerModuleIndices[3 * tripletIndex + 1]; + pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex + 4] = + tripletsInGPU.lowerModuleIndices[3 * tripletIndex + 2]; + + unsigned int pixelInnerMD = segmentsInGPU.mdIndices[2 * pixelSegmentIndex]; + unsigned int pixelOuterMD = segmentsInGPU.mdIndices[2 * pixelSegmentIndex + 1]; + + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex] = mdsInGPU.anchorHitIndices[pixelInnerMD]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 1] = mdsInGPU.outerHitIndices[pixelInnerMD]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 2] = mdsInGPU.anchorHitIndices[pixelOuterMD]; + 
pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 3] = mdsInGPU.outerHitIndices[pixelOuterMD]; + + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 4] = tripletsInGPU.hitIndices[6 * tripletIndex]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 5] = tripletsInGPU.hitIndices[6 * tripletIndex + 1]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 6] = tripletsInGPU.hitIndices[6 * tripletIndex + 2]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 7] = tripletsInGPU.hitIndices[6 * tripletIndex + 3]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 8] = tripletsInGPU.hitIndices[6 * tripletIndex + 4]; + pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex + 9] = tripletsInGPU.hitIndices[6 * tripletIndex + 5]; + pixelTripletsInGPU.rPhiChiSquared[pixelTripletIndex] = rPhiChiSquared; + pixelTripletsInGPU.rPhiChiSquaredInwards[pixelTripletIndex] = rPhiChiSquaredInwards; + pixelTripletsInGPU.rzChiSquared[pixelTripletIndex] = rzChiSquared; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::objectRanges& rangesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& pixelLowerModuleIndex, + uint16_t& outerInnerLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + float& zOut, + float& rtOut, + float& deltaPhiPos, + float& deltaPhi, + float& betaIn, + float& betaOut, + float& pt_beta, + float& zLo, + float& zHi, + float& rtLo, + float& rtHi, + float& zLoPointed, + float& zHiPointed, + float& sdlCut, + float& betaInCut, + float& betaOutCut, + float& deltaBetaCut, + float& kZ) { + zLo = -999; + zHi = -999; + rtLo = -999; + rtHi = -999; + zLoPointed = -999; + zHiPointed = -999; + kZ = -999; + betaInCut = -999; + + short outerInnerLowerModuleSubdet = modulesInGPU.subdets[outerInnerLowerModuleIndex]; + short 
outerOuterLowerModuleSubdet = modulesInGPU.subdets[outerOuterLowerModuleIndex]; + + unsigned int firstMDIndex = segmentsInGPU.mdIndices[2 * innerSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * innerSegmentIndex + 1]; + + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * outerSegmentIndex]; + unsigned int fourthMDIndex = segmentsInGPU.mdIndices[2 * outerSegmentIndex + 1]; + + if (outerInnerLowerModuleSubdet == SDL::Barrel and + (outerOuterLowerModuleSubdet == SDL::Barrel or outerOuterLowerModuleSubdet == SDL::Endcap)) { + return runTripletDefaultAlgoPPBB(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + pixelLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + zLoPointed, + zHiPointed, + sdlCut, + betaOutCut, + deltaBetaCut); + } else if (outerInnerLowerModuleSubdet == SDL::Endcap and outerOuterLowerModuleSubdet == SDL::Endcap) { + return runTripletDefaultAlgoPPEE(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + pixelLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + rtLo, + rtHi, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + } + return false; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RZChiSquaredCuts(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + float& rzChiSquared) { + const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap 
and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS); + const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS); + const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS); + + if (layer1 == 8 and layer2 == 9 and layer3 == 10) { + return rzChiSquared < 13.6067f; + } else if (layer1 == 8 and layer2 == 9 and layer3 == 15) { + return rzChiSquared < 5.5953f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) { + return rzChiSquared < 3.9263f; + } + /* + else if(layer1 == 7 and layer2 == 8 and layer3 == 14) + { + // PS+PS+2S in endcap layers 1+2+3, which is not really feasible in the current geometry, + // without skipping barrel layers 1 and 2 (not allowed by algorithm logic). + } + */ + else if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + return rzChiSquared < 9.4377f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + return rzChiSquared < 9.9975f; + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) { + return rzChiSquared < 8.6369f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) { + return rzChiSquared < 37.945f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 12) { + return rzChiSquared < 43.0167f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + return rzChiSquared < 8.6923f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + return rzChiSquared < 11.9672f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 13) { + return rzChiSquared < 16.2133f; + } + + //default - category not found! 
+ return true; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeChiSquaredpT3(TAcc const& acc, + unsigned int nPoints, + float* xs, + float* ys, + float* delta1, + float* delta2, + float* slopes, + bool* isFlat, + float g, + float f, + float radius) { + //given values of (g, f, radius) and a set of points (and its uncertainties) + //compute chi squared + float c = g * g + f * f - radius * radius; + float chiSquared = 0.f; + float absArctanSlope, angleM, xPrime, yPrime, sigma; + for (size_t i = 0; i < nPoints; i++) { + absArctanSlope = ((slopes[i] != SDL::SDL_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) + : 0.5f * float(M_PI)); + if (xs[i] > 0 and ys[i] > 0) { + angleM = 0.5f * float(M_PI) - absArctanSlope; + } else if (xs[i] < 0 and ys[i] > 0) { + angleM = absArctanSlope + 0.5f * float(M_PI); + } else if (xs[i] < 0 and ys[i] < 0) { + angleM = -(absArctanSlope + 0.5f * float(M_PI)); + } else if (xs[i] > 0 and ys[i] < 0) { + angleM = -(0.5f * float(M_PI) - absArctanSlope); + } else { + angleM = 0; + } + + if (not isFlat[i]) { + xPrime = xs[i] * alpaka::math::cos(acc, angleM) + ys[i] * alpaka::math::sin(acc, angleM); + yPrime = ys[i] * alpaka::math::cos(acc, angleM) - xs[i] * alpaka::math::sin(acc, angleM); + } else { + xPrime = xs[i]; + yPrime = ys[i]; + } + sigma = 2 * alpaka::math::sqrt( + acc, (xPrime * delta1[i]) * (xPrime * delta1[i]) + (yPrime * delta2[i]) * (yPrime * delta2[i])); + chiSquared += (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) * + (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) / (sigma * sigma); + } + return chiSquared; + }; + + //TODO: merge this one and the pT5 function later into a single function + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT3RPhiChiSquared(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t* lowerModuleIndices, + float& g, + float& f, + float& radius, + float* xs, + float* ys) { + float delta1[3]{}, delta2[3]{}, 
slopes[3]; + bool isFlat[3]{}; + float chiSquared = 0; + float inv1 = 0.01f / 0.009f; + float inv2 = 0.15f / 0.009f; + for (size_t i = 0; i < 3; i++) { + ModuleType moduleType = modulesInGPU.moduleType[lowerModuleIndices[i]]; + short moduleSubdet = modulesInGPU.subdets[lowerModuleIndices[i]]; + short moduleSide = modulesInGPU.sides[lowerModuleIndices[i]]; + float drdz = modulesInGPU.drdzs[lowerModuleIndices[i]]; + slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]]; + //category 1 - barrel PS flat + if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) { + delta1[i] = inv1; + delta2[i] = inv1; + slopes[i] = -999; + isFlat[i] = true; + } + //category 2 - barrel 2S + else if (moduleSubdet == Barrel and moduleType == TwoS) { + delta1[i] = 1; + delta2[i] = 1; + slopes[i] = -999; + isFlat[i] = true; + } + //category 3 - barrel PS tilted + else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) { + delta1[i] = inv1; + isFlat[i] = false; + delta2[i] = (inv2 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz)); + } + //category 4 - endcap PS + else if (moduleSubdet == Endcap and moduleType == PS) { + delta1[i] = inv1; + isFlat[i] = false; + + /* + despite the type of the module layer of the lower module index, all anchor + hits are on the pixel side and all non-anchor hits are on the strip side! + */ + delta2[i] = inv2; + } + //category 5 - endcap 2S + else if (moduleSubdet == Endcap and moduleType == TwoS) { + delta1[i] = 1; + delta2[i] = 500 * inv1; + isFlat[i] = false; + } +#ifdef Warnings + else { + printf("ERROR!!!!! I SHOULDN'T BE HERE!!!! 
subdet = %d, type = %d, side = %d\n", + moduleSubdet, + moduleType, + moduleSide); + } +#endif + } + chiSquared = computeChiSquaredpT3(acc, 3, xs, ys, delta1, delta2, slopes, isFlat, g, f, radius); + + return chiSquared; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT3RPhiChiSquaredInwards( + struct SDL::modules& modulesInGPU, float& g, float& f, float& r, float* xPix, float* yPix) { + float residual = (xPix[0] - g) * (xPix[0] - g) + (yPix[0] - f) * (yPix[0] - f) - r * r; + float chiSquared = residual * residual; + residual = (xPix[1] - g) * (xPix[1] - g) + (yPix[1] - f) * (yPix[1] - f) - r * r; + chiSquared += residual * residual; + + chiSquared *= 0.5f; + return chiSquared; + }; + + //90pc threshold + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredCuts(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + float& chiSquared) { + const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS); + const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS); + const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS); + + if (layer1 == 8 and layer2 == 9 and layer3 == 10) { + return chiSquared < 7.003f; + } else if (layer1 == 8 and layer2 == 9 and layer3 == 15) { + return chiSquared < 0.5f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) { + return chiSquared < 8.046f; + } else if (layer1 == 7 and layer2 == 8 
and layer3 == 14) { + return chiSquared < 0.575f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + return chiSquared < 5.304f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + return chiSquared < 10.6211f; + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) { + return chiSquared < 4.617f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + return chiSquared < 8.046f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 13) { + return chiSquared < 0.435f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) { + return chiSquared < 9.244f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 12) { + return chiSquared < 0.287f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + return chiSquared < 18.509f; + } + + return true; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredInwardsCuts(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + float& chiSquared) { + const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS); + const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS); + const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS); + + if (layer1 == 7 and layer2 == 8 and layer3 == 9) // endcap layer 1,2,3, ps + { + return chiSquared < 22016.8055f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 14) // endcap layer 1,2,3 layer3->2s + { + 
return chiSquared < 935179.56807f; + } else if (layer1 == 8 and layer2 == 9 and layer3 == 10) // endcap layer 2,3,4 + { + return chiSquared < 29064.12959f; + } else if (layer1 == 8 and layer2 == 9 and layer3 == 15) // endcap layer 2,3,4, layer3->2s + { + return chiSquared < 935179.5681f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3) // barrel 1,2,3 + { + return chiSquared < 1370.0113195101474f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) // barrel 1,2 endcap 1 + { + return chiSquared < 5492.110048314815f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) // barrel 2,3,4 + { + return chiSquared < 4160.410806470067f; + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) // barrel 1, endcap 1,2 + { + return chiSquared < 29064.129591225726f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) // barrel 2,3 endcap 1 + { + return chiSquared < 12634.215376250893f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 12) // barrel 2,3, endcap 1->2s + { + return chiSquared < 353821.69361145404f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) // barrel2, endcap 1,2 + { + return chiSquared < 33393.26076341235f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 13) //barrel 2, endcap 1, endcap2->2s + { + return chiSquared < 935179.5680742573f; + } + + return true; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool checkIntervalOverlappT3(const float& firstMin, + const float& firstMax, + const float& secondMin, + const float& secondMax) { + return ((firstMin <= secondMin) && (secondMin < firstMax)) || ((secondMin < firstMin) && (firstMin < secondMax)); + }; + + /*bounds for high Pt taken from : http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_efficiency/efficiencies/new_efficiencies/efficiencies_20210513_T5_recovering_high_Pt_efficiencies/highE_radius_matching/highE_bounds.txt */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRadiusCriterionBBB(TAcc const& acc, + float const& pixelRadius, + float const& pixelRadiusError, + 
float const& tripletRadius) { + float tripletInvRadiusErrorBound = 0.15624f; + float pixelInvRadiusErrorBound = 0.17235f; + + if (pixelRadius > 2.0f * kR1GeVf) { + pixelInvRadiusErrorBound = 0.6375f; + tripletInvRadiusErrorBound = 0.6588f; + } + + float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound) / tripletRadius; + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound) / tripletRadius, 0.0f); + + float pixelRadiusInvMax = + alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius - pixelRadiusError)); + float pixelRadiusInvMin = + alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius + pixelRadiusError)); + + return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRadiusCriterionBBE(TAcc const& acc, + float const& pixelRadius, + float const& pixelRadiusError, + float const& tripletRadius) { + float tripletInvRadiusErrorBound = 0.45972f; + float pixelInvRadiusErrorBound = 0.19644f; + + if (pixelRadius > 2.0f * kR1GeVf) { + pixelInvRadiusErrorBound = 0.6805f; + tripletInvRadiusErrorBound = 0.8557f; + } + + float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound) / tripletRadius; + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound) / tripletRadius, 0.0f); + + float pixelRadiusInvMax = + alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius - pixelRadiusError)); + float pixelRadiusInvMin = + alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius + pixelRadiusError)); + + return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRadiusCriterionBEE(TAcc const& acc, + float const& pixelRadius, + float const& pixelRadiusError, + float const& 
tripletRadius) { + float tripletInvRadiusErrorBound = 1.59294f; + float pixelInvRadiusErrorBound = 0.255181f; + + if (pixelRadius > 2.0f * kR1GeVf) //as good as not having selections + { + pixelInvRadiusErrorBound = 2.2091f; + tripletInvRadiusErrorBound = 2.3548f; + } + + float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound) / tripletRadius; + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound) / tripletRadius, 0.0f); + + float pixelRadiusInvMax = + alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius - pixelRadiusError)); + float pixelRadiusInvMin = + alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius + pixelRadiusError)); + pixelRadiusInvMin = alpaka::math::max(acc, pixelRadiusInvMin, 0.0f); + + return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, pixelRadiusInvMin, pixelRadiusInvMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRadiusCriterionEEE(TAcc const& acc, + float const& pixelRadius, + float const& pixelRadiusError, + float const& tripletRadius) { + float tripletInvRadiusErrorBound = 1.7006f; + float pixelInvRadiusErrorBound = 0.26367f; + + if (pixelRadius > 2.0f * kR1GeVf) //as good as not having selections + { + pixelInvRadiusErrorBound = 2.286f; + tripletInvRadiusErrorBound = 2.436f; + } + + float tripletRadiusInvMax = (1 + tripletInvRadiusErrorBound) / tripletRadius; + float tripletRadiusInvMin = alpaka::math::max(acc, (1 - tripletInvRadiusErrorBound) / tripletRadius, 0.0f); + + float pixelRadiusInvMax = + alpaka::math::max(acc, (1 + pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius - pixelRadiusError)); + float pixelRadiusInvMin = + alpaka::math::min(acc, (1 - pixelInvRadiusErrorBound) / pixelRadius, 1.f / (pixelRadius + pixelRadiusError)); + pixelRadiusInvMin = alpaka::math::max(acc, 0.0f, pixelRadiusInvMin); + + return checkIntervalOverlappT3(tripletRadiusInvMin, tripletRadiusInvMax, 
pixelRadiusInvMin, pixelRadiusInvMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRadiusCriterion(TAcc const& acc, + struct SDL::modules const& modulesInGPU, + float const& pixelRadius, + float const& pixelRadiusError, + float const& tripletRadius, + uint16_t const& lowerModuleIndex, + uint16_t const& middleModuleIndex, + uint16_t const& upperModuleIndex) { + if (modulesInGPU.subdets[lowerModuleIndex] == SDL::Endcap) { + return passRadiusCriterionEEE(acc, pixelRadius, pixelRadiusError, tripletRadius); + } else if (modulesInGPU.subdets[middleModuleIndex] == SDL::Endcap) { + return passRadiusCriterionBEE(acc, pixelRadius, pixelRadiusError, tripletRadius); + } else if (modulesInGPU.subdets[upperModuleIndex] == SDL::Endcap) { + return passRadiusCriterionBBE(acc, pixelRadius, pixelRadiusError, tripletRadius); + } else { + return passRadiusCriterionBBB(acc, pixelRadius, pixelRadiusError, tripletRadius); + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT3RZChiSquared(TAcc const& acc, + struct SDL::modules const& modulesInGPU, + const uint16_t* lowerModuleIndices, + const float* rtPix, + const float* xPix, + const float* yPix, + const float* zPix, + const float* rts, + const float* xs, + const float* ys, + const float* zs, + float pixelSegmentPt, + float pixelSegmentPx, + float pixelSegmentPy, + float pixelSegmentPz, + int pixelSegmentCharge) { + float residual = 0; + float error = 0; + float RMSE = 0; + + float Px = pixelSegmentPx, Py = pixelSegmentPy, Pz = pixelSegmentPz; + int charge = pixelSegmentCharge; + float x1 = xPix[1] / 100; + float y1 = yPix[1] / 100; + float z1 = zPix[1] / 100; + float r1 = rtPix[1] / 100; + + float Bz = SDL::magnetic_field; + float a = -0.299792 * Bz * charge; + + for (size_t i = 0; i < 3; i++) { + float zsi = zs[i] / 100; + float rtsi = rts[i] / 100; + uint16_t lowerModuleIndex = lowerModuleIndices[i]; + const int moduleType = modulesInGPU.moduleType[lowerModuleIndex]; + const int moduleSide = 
modulesInGPU.sides[lowerModuleIndex]; + const int moduleSubdet = modulesInGPU.subdets[lowerModuleIndex]; + + // calculation is detailed documented here https://indico.cern.ch/event/1185895/contributions/4982756/attachments/2526561/4345805/helix%20pT3%20summarize.pdf + float diffr, diffz; + float p = alpaka::math::sqrt(acc, Px * Px + Py * Py + Pz * Pz); + + float rou = a / p; + if (moduleSubdet == SDL::Endcap) { + float s = (zsi - z1) * p / Pz; + float x = x1 + Px / a * alpaka::math::sin(acc, rou * s) - Py / a * (1 - alpaka::math::cos(acc, rou * s)); + float y = y1 + Py / a * alpaka::math::sin(acc, rou * s) + Px / a * (1 - alpaka::math::cos(acc, rou * s)); + diffr = alpaka::math::abs(acc, rtsi - alpaka::math::sqrt(acc, x * x + y * y)) * 100; + } + + if (moduleSubdet == SDL::Barrel) { + float paraA = r1 * r1 + 2 * (Px * Px + Py * Py) / (a * a) + 2 * (y1 * Px - x1 * Py) / a - rtsi * rtsi; + float paraB = 2 * (x1 * Px + y1 * Py) / a; + float paraC = 2 * (y1 * Px - x1 * Py) / a + 2 * (Px * Px + Py * Py) / (a * a); + float A = paraB * paraB + paraC * paraC; + float B = 2 * paraA * paraB; + float C = paraA * paraA - paraC * paraC; + float sol1 = (-B + alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A); + float sol2 = (-B - alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A); + float solz1 = alpaka::math::asin(acc, sol1) / rou * Pz / p + z1; + float solz2 = alpaka::math::asin(acc, sol2) / rou * Pz / p + z1; + float diffz1 = alpaka::math::abs(acc, solz1 - zsi) * 100; + float diffz2 = alpaka::math::abs(acc, solz2 - zsi) * 100; + diffz = alpaka::math::min(acc, diffz1, diffz2); + } + + residual = moduleSubdet == SDL::Barrel ? diffz : diffr; + + //PS Modules + if (moduleType == 0) { + error = 0.15f; + } else //2S modules + { + error = 5.0f; + } + + //special dispensation to tilted PS modules! 
+ if (moduleType == 0 and moduleSubdet == SDL::Barrel and moduleSide != Center) { + float drdz = modulesInGPU.drdzs[lowerModuleIndex]; + error /= alpaka::math::sqrt(acc, 1 + drdz * drdz); + } + RMSE += (residual * residual) / (error * error); + } + + RMSE = alpaka::math::sqrt(acc, 0.2f * RMSE); //the constant doesn't really matter.... + + return RMSE; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::objectRanges& rangesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::triplets& tripletsInGPU, + unsigned int& pixelSegmentIndex, + unsigned int tripletIndex, + float& pixelRadius, + float& pixelRadiusError, + float& tripletRadius, + float& centerX, + float& centerY, + float& rzChiSquared, + float& rPhiChiSquared, + float& rPhiChiSquaredInwards, + bool runChiSquaredCuts = true) { + bool pass = true; + + //run pT4 compatibility between the pixel segment and inner segment, and between the pixel and outer segment of the triplet + uint16_t pixelModuleIndex = segmentsInGPU.innerLowerModuleIndices[pixelSegmentIndex]; + + uint16_t lowerModuleIndex = tripletsInGPU.lowerModuleIndices[3 * tripletIndex]; + uint16_t middleModuleIndex = tripletsInGPU.lowerModuleIndices[3 * tripletIndex + 1]; + uint16_t upperModuleIndex = tripletsInGPU.lowerModuleIndices[3 * tripletIndex + 2]; + + { + //placeholder + float zOut, rtOut, deltaPhiPos, deltaPhi, betaIn, betaOut, pt_beta; //temp stuff + float zLo, zHi, rtLo, rtHi, zLoPointed, zHiPointed, sdlCut, betaInCut, betaOutCut, deltaBetaCut, kZ; + + // pixel segment vs inner segment of the triplet + pass = pass and runPixelTrackletDefaultAlgopT3(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + pixelModuleIndex, + lowerModuleIndex, + middleModuleIndex, + pixelSegmentIndex, + tripletsInGPU.segmentIndices[2 * tripletIndex], + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + 
betaOut, + pt_beta, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + if (not pass) + return pass; + + //pixel segment vs outer segment of triplet + pass = pass and runPixelTrackletDefaultAlgopT3(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + pixelModuleIndex, + middleModuleIndex, + upperModuleIndex, + pixelSegmentIndex, + tripletsInGPU.segmentIndices[2 * tripletIndex + 1], + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + if (not pass) + return pass; + } + + //pt matching between the pixel ptin and the triplet circle pt + unsigned int pixelSegmentArrayIndex = pixelSegmentIndex - rangesInGPU.segmentModuleIndices[pixelModuleIndex]; + float pixelSegmentPt = segmentsInGPU.ptIn[pixelSegmentArrayIndex]; + float pixelSegmentPtError = segmentsInGPU.ptErr[pixelSegmentArrayIndex]; + float pixelSegmentPx = segmentsInGPU.px[pixelSegmentArrayIndex]; + float pixelSegmentPy = segmentsInGPU.py[pixelSegmentArrayIndex]; + float pixelSegmentPz = segmentsInGPU.pz[pixelSegmentArrayIndex]; + int pixelSegmentCharge = segmentsInGPU.charge[pixelSegmentArrayIndex]; + + float pixelG = segmentsInGPU.circleCenterX[pixelSegmentArrayIndex]; + float pixelF = segmentsInGPU.circleCenterY[pixelSegmentArrayIndex]; + float pixelRadiusPCA = segmentsInGPU.circleRadius[pixelSegmentArrayIndex]; + + unsigned int pixelInnerMDIndex = segmentsInGPU.mdIndices[2 * pixelSegmentIndex]; + unsigned int pixelOuterMDIndex = segmentsInGPU.mdIndices[2 * pixelSegmentIndex + 1]; + + pixelRadius = pixelSegmentPt * kR1GeVf; + pixelRadiusError = pixelSegmentPtError * kR1GeVf; + unsigned int tripletInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * tripletIndex]; + unsigned int tripletOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * tripletIndex + 1]; + + unsigned int firstMDIndex = 
segmentsInGPU.mdIndices[2 * tripletInnerSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * tripletInnerSegmentIndex + 1]; + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * tripletOuterSegmentIndex + 1]; + + float xs[3] = {mdsInGPU.anchorX[firstMDIndex], mdsInGPU.anchorX[secondMDIndex], mdsInGPU.anchorX[thirdMDIndex]}; + float ys[3] = {mdsInGPU.anchorY[firstMDIndex], mdsInGPU.anchorY[secondMDIndex], mdsInGPU.anchorY[thirdMDIndex]}; + + float g, f; + tripletRadius = tripletsInGPU.circleRadius[tripletIndex]; + g = tripletsInGPU.circleCenterX[tripletIndex]; + f = tripletsInGPU.circleCenterY[tripletIndex]; + + pass = pass and passRadiusCriterion(acc, + modulesInGPU, + pixelRadius, + pixelRadiusError, + tripletRadius, + lowerModuleIndex, + middleModuleIndex, + upperModuleIndex); + if (not pass) + return pass; + + uint16_t lowerModuleIndices[3] = {lowerModuleIndex, middleModuleIndex, upperModuleIndex}; + + if (runChiSquaredCuts and pixelSegmentPt < 5.0f) { + float rts[3] = { + mdsInGPU.anchorRt[firstMDIndex], mdsInGPU.anchorRt[secondMDIndex], mdsInGPU.anchorRt[thirdMDIndex]}; + float zs[3] = {mdsInGPU.anchorZ[firstMDIndex], mdsInGPU.anchorZ[secondMDIndex], mdsInGPU.anchorZ[thirdMDIndex]}; + float rtPix[2] = {mdsInGPU.anchorRt[pixelInnerMDIndex], mdsInGPU.anchorRt[pixelOuterMDIndex]}; + float xPix[2] = {mdsInGPU.anchorX[pixelInnerMDIndex], mdsInGPU.anchorX[pixelOuterMDIndex]}; + float yPix[2] = {mdsInGPU.anchorY[pixelInnerMDIndex], mdsInGPU.anchorY[pixelOuterMDIndex]}; + float zPix[2] = {mdsInGPU.anchorZ[pixelInnerMDIndex], mdsInGPU.anchorZ[pixelOuterMDIndex]}; + + rzChiSquared = computePT3RZChiSquared(acc, + modulesInGPU, + lowerModuleIndices, + rtPix, + xPix, + yPix, + zPix, + rts, + xs, + ys, + zs, + pixelSegmentPt, + pixelSegmentPx, + pixelSegmentPy, + pixelSegmentPz, + pixelSegmentCharge); + pass = pass and + passPT3RZChiSquaredCuts(modulesInGPU, lowerModuleIndex, middleModuleIndex, upperModuleIndex, rzChiSquared); + if (not pass) + 
return pass; + } else { + rzChiSquared = -1; + } + + rPhiChiSquared = + computePT3RPhiChiSquared(acc, modulesInGPU, lowerModuleIndices, pixelG, pixelF, pixelRadiusPCA, xs, ys); + + if (runChiSquaredCuts and pixelSegmentPt < 5.0f) { + pass = pass and passPT3RPhiChiSquaredCuts( + modulesInGPU, lowerModuleIndex, middleModuleIndex, upperModuleIndex, rPhiChiSquared); + if (not pass) + return pass; + } + + float xPix[2] = {mdsInGPU.anchorX[pixelInnerMDIndex], mdsInGPU.anchorX[pixelOuterMDIndex]}; + float yPix[2] = {mdsInGPU.anchorY[pixelInnerMDIndex], mdsInGPU.anchorY[pixelOuterMDIndex]}; + rPhiChiSquaredInwards = computePT3RPhiChiSquaredInwards(modulesInGPU, g, f, tripletRadius, xPix, yPix); + + if (runChiSquaredCuts and pixelSegmentPt < 5.0f) { + pass = pass and passPT3RPhiChiSquaredInwardsCuts( + modulesInGPU, lowerModuleIndex, middleModuleIndex, upperModuleIndex, rPhiChiSquaredInwards); + if (not pass) + return pass; + } + centerX = 0; + centerY = 0; + return pass; + }; + + struct createPixelTripletsInGPUFromMapv2 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::pixelTriplets pixelTripletsInGPU, + unsigned int* connectedPixelSize, + unsigned int* connectedPixelIndex, + unsigned int nPixelSegments) const { + auto const globalBlockIdx = alpaka::getIdx(acc); + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridBlockExtent = alpaka::getWorkDiv(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (unsigned int i_pLS = globalThreadIdx[1]; i_pLS < nPixelSegments; i_pLS += gridThreadExtent[1]) { + auto iLSModule_max = connectedPixelIndex[i_pLS] + connectedPixelSize[i_pLS]; + + for (unsigned int iLSModule = connectedPixelIndex[i_pLS] + globalBlockIdx[0]; iLSModule < iLSModule_max; + iLSModule += gridBlockExtent[0]) { + uint16_t 
tripletLowerModuleIndex = + modulesInGPU + .connectedPixels[iLSModule]; //connected pixels will have the appropriate lower module index by default! +#ifdef Warnings + if (tripletLowerModuleIndex >= *modulesInGPU.nLowerModules) { + printf("tripletLowerModuleIndex %d >= modulesInGPU.nLowerModules %d \n", + tripletLowerModuleIndex, + *modulesInGPU.nLowerModules); + continue; //sanity check + } +#endif + //Removes 2S-2S :FIXME: filter these out in the pixel map + if (modulesInGPU.moduleType[tripletLowerModuleIndex] == SDL::TwoS) + continue; + + uint16_t pixelModuleIndex = *modulesInGPU.nLowerModules; + unsigned int nOuterTriplets = tripletsInGPU.nTriplets[tripletLowerModuleIndex]; + if (nOuterTriplets == 0) + continue; + + unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + i_pLS; + + if (segmentsInGPU.isDup[i_pLS]) + continue; + if (segmentsInGPU.partOfPT5[i_pLS]) + continue; //don't make pT3s for those pixels that are part of pT5 + + short layer2_adjustment; + if (modulesInGPU.layers[tripletLowerModuleIndex] == 1) { + layer2_adjustment = 1; + } //get upper segment to be in second layer + else if (modulesInGPU.layers[tripletLowerModuleIndex] == 2) { + layer2_adjustment = 0; + } // get lower segment to be in second layer + else { + continue; + } + + //fetch the triplet + for (unsigned int outerTripletArrayIndex = globalThreadIdx[2]; outerTripletArrayIndex < nOuterTriplets; + outerTripletArrayIndex += gridThreadExtent[2]) { + unsigned int outerTripletIndex = + rangesInGPU.tripletModuleIndices[tripletLowerModuleIndex] + outerTripletArrayIndex; + if (modulesInGPU.moduleType[tripletsInGPU.lowerModuleIndices[3 * outerTripletIndex + 1]] == SDL::TwoS) + continue; //REMOVES PS-2S + + if (tripletsInGPU.partOfPT5[outerTripletIndex]) + continue; //don't create pT3s for T3s accounted in pT5s + + float pixelRadius, pixelRadiusError, tripletRadius, rPhiChiSquared, rzChiSquared, rPhiChiSquaredInwards, + centerX, centerY; + bool success = 
runPixelTripletDefaultAlgo(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + pixelSegmentIndex, + outerTripletIndex, + pixelRadius, + pixelRadiusError, + tripletRadius, + centerX, + centerY, + rzChiSquared, + rPhiChiSquared, + rPhiChiSquaredInwards); + + if (success) { + float phi = + mdsInGPU.anchorPhi[segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * outerTripletIndex] + + layer2_adjustment]]; + float eta = + mdsInGPU.anchorEta[segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * outerTripletIndex] + + layer2_adjustment]]; + float eta_pix = segmentsInGPU.eta[i_pLS]; + float phi_pix = segmentsInGPU.phi[i_pLS]; + float pt = segmentsInGPU.ptIn[i_pLS]; + float score = rPhiChiSquared + rPhiChiSquaredInwards; + unsigned int totOccupancyPixelTriplets = + alpaka::atomicOp(acc, pixelTripletsInGPU.totOccupancyPixelTriplets, 1u); + if (totOccupancyPixelTriplets >= N_MAX_PIXEL_TRIPLETS) { +#ifdef Warnings + printf("Pixel Triplet excess alert!\n"); +#endif + } else { + unsigned int pixelTripletIndex = + alpaka::atomicOp(acc, pixelTripletsInGPU.nPixelTriplets, 1u); + addPixelTripletToMemory(modulesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + pixelTripletsInGPU, + pixelSegmentIndex, + outerTripletIndex, + pixelRadius, + tripletRadius, + centerX, + centerY, + rPhiChiSquared, + rPhiChiSquaredInwards, + rzChiSquared, + pixelTripletIndex, + pt, + eta, + phi, + eta_pix, + phi_pix, + score); + tripletsInGPU.partOfPT3[outerTripletIndex] = true; + } + } + } // for outerTripletArrayIndex + } // for iLSModule < iLSModule_max + } // for i_pLS + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationspT3(TAcc const& acc, + float& betaIn, + float& betaOut, + float& betaAv, + float& pt_beta, + float sdIn_dr, + float sdOut_dr, + float dr, + float lIn) { + if (lIn == 0) { + betaOut += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, 
pt_beta), SDL::sinAlphaMax)), + betaOut); + return; + } + + if (betaIn * betaOut > 0.f and + (alpaka::math::abs(acc, pt_beta) < 4.f * SDL::pt_betaMax or + (lIn >= 11 and alpaka::math::abs(acc, pt_beta) < + 8.f * SDL::pt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap + { + const float betaInUpd = + betaIn + + SDL::copysignf(alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + const float betaOutUpd = + betaOut + + SDL::copysignf(alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaOut); //FIXME: need a faster version + betaAv = 0.5f * (betaInUpd + betaOutUpd); + + //1st update + const float pt_beta_inv = + 1.f / alpaka::math::abs(acc, dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv)); //get a better pt estimate + + betaIn += SDL::copysignf( + alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * SDL::k2Rinv1GeVf * pt_beta_inv, SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaOut += SDL::copysignf( + alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf * pt_beta_inv, SDL::sinAlphaMax)), + betaOut); //FIXME: need a faster version + //update the av and pt + betaAv = 0.5f * (betaIn + betaOut); + //2nd update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + } else if (lIn < 11 && alpaka::math::abs(acc, betaOut) < 0.2f * alpaka::math::abs(acc, betaIn) && + alpaka::math::abs(acc, pt_beta) < 12.f * SDL::pt_betaMax) //use betaIn sign as ref + { + const float pt_betaIn = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaIn); + + const float betaInUpd = + betaIn + SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), SDL::sinAlphaMax)), + betaIn); //FIXME: need a 
faster version + const float betaOutUpd = + betaOut + + SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaAv = (alpaka::math::abs(acc, betaOut) > 0.2f * alpaka::math::abs(acc, betaIn)) + ? (0.5f * (betaInUpd + betaOutUpd)) + : betaInUpd; + + //1st update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + betaIn += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaOut += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + //update the av and pt + betaAv = 0.5f * (betaIn + betaOut); + //2nd update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::objectRanges& rangesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& pixelModuleIndex, + uint16_t& outerInnerLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int thirdMDIndex, + unsigned int& fourthMDIndex, + float& /*z_OutLo*/, + float& /*rt_OutLo*/, + float& dPhiPos, + float& dPhi, + float& betaIn, + float& betaOut, + float& pt_beta, + float& zLo, + float& zHi, + float& zLoPointed, + float& zHiPointed, + float& sdlCut, + float& betaOutCut, + float& deltaBetaCut) // pixel to BB and BE segments + { + bool pass = true; + + bool isPS_OutLo = 
(modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); + + float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; + float rt_InUp = mdsInGPU.anchorRt[secondMDIndex]; + float rt_OutLo = mdsInGPU.anchorRt[thirdMDIndex]; + float rt_OutUp = mdsInGPU.anchorRt[fourthMDIndex]; + + float z_InUp = mdsInGPU.anchorZ[secondMDIndex]; + float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; + + float x_InLo = mdsInGPU.anchorX[firstMDIndex]; + float x_InUp = mdsInGPU.anchorX[secondMDIndex]; + float x_OutLo = mdsInGPU.anchorX[thirdMDIndex]; + float x_OutUp = mdsInGPU.anchorX[fourthMDIndex]; + + float y_InLo = mdsInGPU.anchorY[firstMDIndex]; + float y_InUp = mdsInGPU.anchorY[secondMDIndex]; + float y_OutLo = mdsInGPU.anchorY[thirdMDIndex]; + float y_OutUp = mdsInGPU.anchorY[fourthMDIndex]; + + float rt_InOut = rt_InUp; + + pass = + pass and (alpaka::math::abs(acc, SDL::deltaPhi(acc, x_InUp, y_InUp, x_OutLo, y_OutLo)) <= 0.5f * float(M_PI)); + if (not pass) + return pass; + + unsigned int pixelSegmentArrayIndex = innerSegmentIndex - rangesInGPU.segmentModuleIndices[pixelModuleIndex]; + float ptIn = segmentsInGPU.ptIn[pixelSegmentArrayIndex]; + float ptSLo = ptIn; + float px = segmentsInGPU.px[pixelSegmentArrayIndex]; + float py = segmentsInGPU.py[pixelSegmentArrayIndex]; + float pz = segmentsInGPU.pz[pixelSegmentArrayIndex]; + float ptErr = segmentsInGPU.ptErr[pixelSegmentArrayIndex]; + float etaErr = segmentsInGPU.etaErr[pixelSegmentArrayIndex]; + ptSLo = alpaka::math::max(acc, ptCut, ptSLo - 10.0f * alpaka::math::max(acc, ptErr, 0.005f * ptSLo)); + ptSLo = alpaka::math::min(acc, 10.0f, ptSLo); + + float alpha1GeV_OutLo = + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, sinAlphaMax)); + const float rtRatio_OutLoInOut = + rt_OutLo / rt_InOut; // Outer segment beginning rt divided by inner segment beginning rt; + + float dzDrtScale = + alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly + const float 
zpitch_InLo = 0.05f; + const float zpitch_InOut = 0.05f; + float zpitch_OutLo = (isPS_OutLo ? pixelPSZpitch : strip2SZpitch); + float zGeom = zpitch_InLo + zpitch_OutLo; + zHi = z_InUp + (z_InUp + deltaZLum) * (rtRatio_OutLoInOut - 1.f) * (z_InUp < 0.f ? 1.f : dzDrtScale) + + (zpitch_InOut + zpitch_OutLo); + zLo = z_InUp + (z_InUp - deltaZLum) * (rtRatio_OutLoInOut - 1.f) * (z_InUp > 0.f ? 1.f : dzDrtScale) - + (zpitch_InOut + zpitch_OutLo); //slope-correction only on outer end + + pass = pass and ((z_OutLo >= zLo) && (z_OutLo <= zHi)); + if (not pass) + return pass; + + const float coshEta = alpaka::math::sqrt(acc, ptIn * ptIn + pz * pz) / ptIn; + + const float drt_OutLo_InUp = (rt_OutLo - rt_InUp); + + const float r3_InUp = alpaka::math::sqrt(acc, z_InUp * z_InUp + rt_InUp * rt_InUp); + + float drt_InSeg = rt_InOut - rt_InLo; + + const float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rt_OutLo - rt_InUp) / 50.f) * + alpaka::math::sqrt(acc, r3_InUp / rt_InUp); + const float sdlMuls = sdlThetaMulsF * 3.f / ptCut * 4.f; // will need a better guess than x4? + + float dzErr = drt_OutLo_InUp * etaErr * coshEta; //FIXME: check with the calc in the endcap + dzErr *= dzErr; + dzErr += 0.03f * 0.03f; // pixel size x2. ... 
random for now + dzErr *= 9.f; //3 sigma + dzErr += sdlMuls * sdlMuls * drt_OutLo_InUp * drt_OutLo_InUp / 3.f * coshEta * coshEta; //sloppy + dzErr += zGeom * zGeom; + dzErr = alpaka::math::sqrt(acc, dzErr); + + const float dzDrIn = pz / ptIn; + const float zWindow = dzErr / drt_InSeg * drt_OutLo_InUp + zGeom; + const float dzMean = dzDrIn * drt_OutLo_InUp * + (1.f + drt_OutLo_InUp * drt_OutLo_InUp * 4 * k2Rinv1GeVf * k2Rinv1GeVf / ptIn / ptIn / + 24.f); // with curved path correction + // Constructing upper and lower bound + zLoPointed = z_InUp + dzMean - zWindow; + zHiPointed = z_InUp + dzMean + zWindow; + + pass = pass and ((z_OutLo >= zLoPointed) && (z_OutLo <= zHiPointed)); + if (not pass) + return pass; + + const float sdlPVoff = 0.1f / rt_OutLo; + sdlCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, sdlMuls * sdlMuls + sdlPVoff * sdlPVoff); + +#ifdef CUT_VALUE_DEBUG + dPhiPos = SDL::deltaPhi(acc, x_InUp, y_InUp, x_OutUp, y_OutUp); +#endif + + //no dphipos cut + float midPointX = 0.5f * (x_InLo + x_OutLo); + float midPointY = 0.5f * (y_InLo + y_OutLo); + + float diffX = x_OutLo - x_InLo; + float diffY = y_OutLo - y_InLo; + + dPhi = SDL::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + + pass = pass and (alpaka::math::abs(acc, dPhi) <= sdlCut); + if (not pass) + return pass; + + //lots of array accesses below this... 
+ + float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); + + bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == SDL::Endcap and + modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::TwoS; + + float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge; + alpha_OutUp = SDL::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); + + alpha_OutUp_highEdge = alpha_OutUp; + alpha_OutUp_lowEdge = alpha_OutUp; + + float tl_axis_x = x_OutUp - x_InUp; + float tl_axis_y = y_OutUp - y_InUp; + + float tl_axis_highEdge_x = tl_axis_x; + float tl_axis_highEdge_y = tl_axis_y; + + float tl_axis_lowEdge_x = tl_axis_x; + float tl_axis_lowEdge_y = tl_axis_y; + + betaIn = -SDL::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); + float betaInRHmin = betaIn; + float betaInRHmax = betaIn; + + betaOut = -alpha_OutUp + SDL::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); + + float betaOutRHmin = betaOut; + float betaOutRHmax = betaOut; + + if (isEC_lastLayer) { + alpha_OutUp_highEdge = SDL::deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_lowEdge = SDL::deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); + + tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_InUp; + tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_InUp; + tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_InUp; + tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_InUp; + + betaOutRHmin = -alpha_OutUp_highEdge + SDL::deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], 
+ tl_axis_highEdge_x, + tl_axis_highEdge_y); + betaOutRHmax = -alpha_OutUp_lowEdge + SDL::deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + tl_axis_lowEdge_x, + tl_axis_lowEdge_y); + } + + //beta computation + float drt_tl_axis = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + + //innerOuterAnchor - innerInnerAnchor + const float rt_InSeg = + alpaka::math::sqrt(acc, (x_InUp - x_InLo) * (x_InUp - x_InLo) + (y_InUp - y_InLo) * (y_InUp - y_InLo)); + + //no betaIn cut for the pixels + float betaAv = 0.5f * (betaIn + betaOut); + pt_beta = ptIn; + + int lIn = 0; + int lOut = isEC_lastLayer ? 11 : 5; + float sdOut_dr = + alpaka::math::sqrt(acc, (x_OutUp - x_OutLo) * (x_OutUp - x_OutLo) + (y_OutUp - y_OutLo) * (y_OutUp - y_OutLo)); + float sdOut_d = rt_OutUp - rt_OutLo; + + runDeltaBetaIterationspT3(acc, betaIn, betaOut, betaAv, pt_beta, rt_InSeg, sdOut_dr, drt_tl_axis, lIn); + + const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) + ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) + : 0.; //mean value of min,max is the old betaIn + const float betaOutMMSF = (alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax) > 0) + ? 
(2.f * betaOut / alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax)) + : 0.; + betaInRHmin *= betaInMMSF; + betaInRHmax *= betaInMMSF; + betaOutRHmin *= betaOutMMSF; + betaOutRHmax *= betaOutMMSF; + + const float dBetaMuls = + sdlThetaMulsF * 4.f / + alpaka::math::min( + acc, alpaka::math::abs(acc, pt_beta), SDL::pt_betaMax); //need to confirm the range-out value of 7 GeV + const float alphaInAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_InLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, sinAlphaMax))); + const float alphaOutAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_OutLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, sinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * deltaZLum / z_InUp); + const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * deltaZLum / z_OutLo); + const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); + + const float sinDPhi = alpaka::math::sin(acc, dPhi); + const float dBetaRIn2 = 0; // TODO-RH + + float dBetaROut = 0; + if (isEC_lastLayer) { + dBetaROut = + (alpaka::math::sqrt(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex] * mdsInGPU.anchorHighEdgeX[fourthMDIndex] + + mdsInGPU.anchorHighEdgeY[fourthMDIndex] * mdsInGPU.anchorHighEdgeY[fourthMDIndex]) - + alpaka::math::sqrt(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex] * mdsInGPU.anchorLowEdgeX[fourthMDIndex] + + mdsInGPU.anchorLowEdgeY[fourthMDIndex] * mdsInGPU.anchorLowEdgeY[fourthMDIndex])) * + sinDPhi / drt_tl_axis; + } + + const float dBetaROut2 = dBetaROut * dBetaROut; + + //FIXME: need faster version + betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, sinAlphaMax)) + + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls * dBetaMuls); + + //Cut #6: The real beta cut + pass = pass and (alpaka::math::abs(acc, betaOut) 
< betaOutCut); + if (not pass) + return pass; + const float dBetaRes = 0.02f / alpaka::math::min(acc, sdOut_d, drt_InSeg); + const float dBetaCut2 = + (dBetaRes * dBetaRes * 2.0f + dBetaMuls * dBetaMuls + dBetaLum2 + dBetaRIn2 + dBetaROut2 + + 0.25f * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)) * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax))); + float dBeta = betaIn - betaOut; + +#ifdef CUT_VALUE_DEBUG + deltaBetaCut = alpaka::math::sqrt(acc, dBetaCut2); +#endif + pass = pass and (dBeta * dBeta <= dBetaCut2); + + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::objectRanges& rangesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& pixelModuleIndex, + uint16_t& outerInnerLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + unsigned int& fourthMDIndex, + float& /*z_OutLo*/, + float& /*rt_OutLo*/, + float& deltaPhiPos, + float& dPhi, + float& betaIn, + float& betaOut, + float& pt_beta, + float& zLo, + float& rtLo, + float& rtHi, + float& sdlCut, + float& betaInCut, + float& betaOutCut, + float& deltaBetaCut, + float& kZ) // pixel to EE segments + { + bool pass = true; + bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS); + + float z_InUp = mdsInGPU.anchorZ[secondMDIndex]; + float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; + + pass = pass and (z_InUp * z_OutLo > 0); + if (not pass) + return pass; + + float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; + float rt_InUp = mdsInGPU.anchorRt[secondMDIndex]; + float rt_OutLo = mdsInGPU.anchorRt[thirdMDIndex]; + float rt_OutUp = 
mdsInGPU.anchorRt[fourthMDIndex]; + + float x_InLo = mdsInGPU.anchorX[firstMDIndex]; + float x_InUp = mdsInGPU.anchorX[secondMDIndex]; + float x_OutLo = mdsInGPU.anchorX[thirdMDIndex]; + float x_OutUp = mdsInGPU.anchorX[fourthMDIndex]; + + float y_InLo = mdsInGPU.anchorY[firstMDIndex]; + float y_InUp = mdsInGPU.anchorY[secondMDIndex]; + float y_OutLo = mdsInGPU.anchorY[thirdMDIndex]; + float y_OutUp = mdsInGPU.anchorY[fourthMDIndex]; + + unsigned int pixelSegmentArrayIndex = innerSegmentIndex - rangesInGPU.segmentModuleIndices[pixelModuleIndex]; + + float ptIn = segmentsInGPU.ptIn[pixelSegmentArrayIndex]; + float ptSLo = ptIn; + float px = segmentsInGPU.px[pixelSegmentArrayIndex]; + float py = segmentsInGPU.py[pixelSegmentArrayIndex]; + float pz = segmentsInGPU.pz[pixelSegmentArrayIndex]; + float ptErr = segmentsInGPU.ptErr[pixelSegmentArrayIndex]; + float etaErr = segmentsInGPU.etaErr[pixelSegmentArrayIndex]; + + ptSLo = alpaka::math::max(acc, ptCut, ptSLo - 10.0f * alpaka::math::max(acc, ptErr, 0.005f * ptSLo)); + ptSLo = alpaka::math::min(acc, 10.0f, ptSLo); + + float rtOut_o_rtIn = rt_OutLo / rt_InUp; + const float zpitch_InLo = 0.05f; + float zpitch_OutLo = (isPS_OutLo ? pixelPSZpitch : strip2SZpitch); + float zGeom = zpitch_InLo + zpitch_OutLo; + + const float sdlSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, sinAlphaMax)); + const float dzDrtScale = alpaka::math::tan(acc, sdlSlope) / sdlSlope; //FIXME: need approximate value + zLo = z_InUp + (z_InUp - deltaZLum) * (rtOut_o_rtIn - 1.f) * (z_InUp > 0.f ? 1.f : dzDrtScale) - + zGeom; //slope-correction only on outer end + + const float dLum = SDL::copysignf(deltaZLum, z_InUp); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS; + + const float rtGeom1 = isOutSgInnerMDPS + ? 
pixelPSZpitch + : strip2SZpitch; //FIXME: make this chosen by configuration for lay11,12 full PS + const float zGeom1 = SDL::copysignf(zGeom, z_InUp); //used in B-E region + rtLo = rt_InUp * (1.f + (z_OutLo - z_InUp - zGeom1) / (z_InUp + zGeom1 + dLum) / dzDrtScale) - + rtGeom1; //slope correction only on the lower end + + float zInForHi = z_InUp - zGeom1 - dLum; + if (zInForHi * z_InUp < 0) + zInForHi = SDL::copysignf(0.1f, z_InUp); + rtHi = rt_InUp * (1.f + (z_OutLo - z_InUp + zGeom1) / zInForHi) + rtGeom1; + + // Cut #2: rt condition + pass = pass and ((rt_OutLo >= rtLo) && (rt_OutLo <= rtHi)); + if (not pass) + return pass; + + const float dzOutInAbs = alpaka::math::abs(acc, z_OutLo - z_InUp); + const float coshEta = alpaka::math::sqrt(acc, ptIn * ptIn + pz * pz) / ptIn; + const float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); + const float r3_InUp = alpaka::math::sqrt(acc, z_InUp * z_InUp + rt_InUp * rt_InUp); + const float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rt_OutLo - rt_InUp) / 50.f) * + alpaka::math::sqrt(acc, r3_InUp / rt_InUp); + const float sdlMuls = sdlThetaMulsF * 3.f / ptCut * 4.f; // will need a better guess than x4? + + float drtErr = etaErr * multDzDr; + drtErr *= drtErr; + drtErr += 0.03f * 0.03f; // pixel size x2. ... random for now + drtErr *= 9.f; //3 sigma + drtErr += + sdlMuls * sdlMuls * multDzDr * multDzDr / 3.f * coshEta * coshEta; //sloppy: relative muls is 1/3 of total muls + drtErr = alpaka::math::sqrt(acc, drtErr); + const float drtDzIn = alpaka::math::abs(acc, ptIn / pz); //all tracks are out-going in endcaps? 
+ + const float drt_OutLo_InUp = (rt_OutLo - rt_InUp); // drOutIn + + const float rtWindow = drtErr + rtGeom1; + const float drtMean = drtDzIn * dzOutInAbs * + (1.f - drt_OutLo_InUp * drt_OutLo_InUp * 4 * k2Rinv1GeVf * k2Rinv1GeVf / ptIn / ptIn / + 24.f); // with curved path correction + const float rtLo_point = rt_InUp + drtMean - rtWindow; + const float rtHi_point = rt_InUp + drtMean + rtWindow; + + // Cut #3: rt-z pointed + pass = pass and ((rt_OutLo >= rtLo_point) && (rt_OutLo <= rtHi_point)); + if (not pass) + return pass; + + const float alpha1GeV_OutLo = + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, sinAlphaMax)); + const float sdlPVoff = 0.1f / rt_OutLo; + sdlCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, sdlMuls * sdlMuls + sdlPVoff * sdlPVoff); + + deltaPhiPos = SDL::deltaPhi(acc, x_InUp, y_InUp, x_OutUp, y_OutUp); + + float midPointX = 0.5f * (x_InLo + x_OutLo); + float midPointY = 0.5f * (y_InLo + y_OutLo); + + float diffX = x_OutLo - x_InLo; + float diffY = y_OutLo - y_InLo; + + dPhi = SDL::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + + // Cut #5: deltaPhiChange + pass = pass and (alpaka::math::abs(acc, dPhi) <= sdlCut); + if (not pass) + return pass; + + float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); + + bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == SDL::Endcap and + modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::TwoS; + + float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge; + + alpha_OutUp = SDL::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); + alpha_OutUp_highEdge = alpha_OutUp; + alpha_OutUp_lowEdge = alpha_OutUp; + + float tl_axis_x = x_OutUp - x_InUp; + float tl_axis_y = y_OutUp - y_InUp; + + float tl_axis_highEdge_x = tl_axis_x; + float tl_axis_highEdge_y = tl_axis_y; + + float tl_axis_lowEdge_x = tl_axis_x; + float 
tl_axis_lowEdge_y = tl_axis_y; + + betaIn = -SDL::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); + float betaInRHmin = betaIn; + float betaInRHmax = betaIn; + + betaOut = -alpha_OutUp + SDL::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); + float betaOutRHmin = betaOut; + float betaOutRHmax = betaOut; + + if (isEC_lastLayer) { + alpha_OutUp_highEdge = SDL::deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_lowEdge = SDL::deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); + + tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_InUp; + tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_InUp; + tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_InUp; + tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_InUp; + + betaOutRHmin = -alpha_OutUp_highEdge + SDL::deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + tl_axis_highEdge_x, + tl_axis_highEdge_y); + betaOutRHmax = -alpha_OutUp_lowEdge + SDL::deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + tl_axis_lowEdge_x, + tl_axis_lowEdge_y); + } + + //beta computation + float drt_tl_axis = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + //no betaIn cut for the pixels + const float rt_InSeg = + alpaka::math::sqrt(acc, (x_InUp - x_InLo) * (x_InUp - x_InLo) + (y_InUp - y_InLo) * (y_InUp - y_InLo)); + + float betaAv = 0.5f * (betaIn + betaOut); + pt_beta = ptIn; + + int lIn = 0; + int lOut = isEC_lastLayer ? 
11 : 5; + float sdOut_dr = + alpaka::math::sqrt(acc, (x_OutUp - x_OutLo) * (x_OutUp - x_OutLo) + (y_OutUp - y_OutLo) * (y_OutUp - y_OutLo)); + float sdOut_d = rt_OutUp - rt_OutLo; + + runDeltaBetaIterationspT3(acc, betaIn, betaOut, betaAv, pt_beta, rt_InSeg, sdOut_dr, drt_tl_axis, lIn); + + const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) + ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) + : 0.; //mean value of min,max is the old betaIn + const float betaOutMMSF = (alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax) > 0) + ? (2.f * betaOut / alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax)) + : 0.; + betaInRHmin *= betaInMMSF; + betaInRHmax *= betaInMMSF; + betaOutRHmin *= betaOutMMSF; + betaOutRHmax *= betaOutMMSF; + + const float dBetaMuls = + sdlThetaMulsF * 4.f / + alpaka::math::min( + acc, alpaka::math::abs(acc, pt_beta), SDL::pt_betaMax); //need to confirm the range-out value of 7 GeV + + const float alphaInAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_InLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InUp * k2Rinv1GeVf / 3.0f, sinAlphaMax))); + const float alphaOutAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_OutLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, sinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * deltaZLum / z_InUp); + const float dBetaOutLum = lOut < 11 ? 
0.0f : alpaka::math::abs(acc, alphaOutAbsReg * deltaZLum / z_OutLo); + const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); + + const float sinDPhi = alpaka::math::sin(acc, dPhi); + const float dBetaRIn2 = 0; // TODO-RH + + float dBetaROut = 0; + if (isEC_lastLayer) { + dBetaROut = + (alpaka::math::sqrt(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex] * mdsInGPU.anchorHighEdgeX[fourthMDIndex] + + mdsInGPU.anchorHighEdgeY[fourthMDIndex] * mdsInGPU.anchorHighEdgeY[fourthMDIndex]) - + alpaka::math::sqrt(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex] * mdsInGPU.anchorLowEdgeX[fourthMDIndex] + + mdsInGPU.anchorLowEdgeY[fourthMDIndex] * mdsInGPU.anchorLowEdgeY[fourthMDIndex])) * + sinDPhi / drt_tl_axis; + } + + const float dBetaROut2 = dBetaROut * dBetaROut; + + betaOutCut = + alpaka::math::asin( + acc, alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, sinAlphaMax)) //FIXME: need faster version + + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls * dBetaMuls); + + //Cut #6: The real beta cut + pass = pass and (alpaka::math::abs(acc, betaOut) < betaOutCut); + if (not pass) + return pass; + + float drt_InSeg = rt_InUp - rt_InLo; + + const float dBetaRes = 0.02f / alpaka::math::min(acc, sdOut_d, drt_InSeg); + const float dBetaCut2 = + (dBetaRes * dBetaRes * 2.0f + dBetaMuls * dBetaMuls + dBetaLum2 + dBetaRIn2 + dBetaROut2 + + 0.25f * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)) * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax))); + float dBeta = betaIn - betaOut; +#ifdef CUT_VALUE_DEBUG + deltaBetaCut = alpaka::math::sqrt(acc, dBetaCut2); +#endif + pass = pass and (dBeta * dBeta <= dBetaCut2); + return pass; + }; +} // namespace SDL +#endif + +#ifndef PixelQuintuplet_cuh +#define PixelQuintuplet_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include 
"RecoTracker/LSTCore/interface/alpaka/Module.h"
#else
#include "Constants.h"
#include "Module.h"
#endif

#include "Segment.h"
#include "MiniDoublet.h"
#include "Hit.h"
#include "Triplet.h"
#include "Quintuplet.h"
#include "PixelTriplet.h"

namespace SDL {
  // Structure-of-arrays view over the pixel-quintuplet (pT5) collection.
  // Each entry pairs one pixel (line) segment with one outer-tracker
  // quintuplet (T5). All members are raw device pointers into buffers owned
  // by a pixelQuintupletsBuffer; setData() must be called before use.
  struct pixelQuintuplets {
    unsigned int* pixelIndices;                  // per pT5: index of the paired pixel segment
    unsigned int* T5Indices;                     // per pT5: index of the paired quintuplet
    unsigned int* nPixelQuintuplets;             // scalar: number of stored pT5s
    unsigned int* totOccupancyPixelQuintuplets;  // scalar: total candidates attempted
    bool* isDup;   // duplicate flag (set by duplicate cleaning)
    FPX* score;    // half-precision quality score
    FPX* eta;
    FPX* phi;
    uint8_t* logicalLayers;        // 7 entries per pT5 (2 pixel + 5 outer)
    unsigned int* hitIndices;      // 14 entries per pT5 (4 pixel hits + 10 T5 hits)
    uint16_t* lowerModuleIndices;  // 7 entries per pT5
    FPX* pixelRadius;
    FPX* quintupletRadius;
    FPX* centerX;
    FPX* centerY;
    float* rzChiSquared;
    float* rPhiChiSquared;
    float* rPhiChiSquaredInwards;

    // Rebind every view pointer to the matching *_buf member of the given
    // buffer object.
    // NOTE(review): the template parameter list (presumably <typename TBuff>)
    // appears to have been lost in extraction -- confirm against the
    // original header.
    template
    void setData(TBuff& pixelQuintupletsBuffer) {
      pixelIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelIndices_buf);
      T5Indices = alpaka::getPtrNative(pixelQuintupletsBuffer.T5Indices_buf);
      nPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.nPixelQuintuplets_buf);
      totOccupancyPixelQuintuplets = alpaka::getPtrNative(pixelQuintupletsBuffer.totOccupancyPixelQuintuplets_buf);
      isDup = alpaka::getPtrNative(pixelQuintupletsBuffer.isDup_buf);
      score = alpaka::getPtrNative(pixelQuintupletsBuffer.score_buf);
      eta = alpaka::getPtrNative(pixelQuintupletsBuffer.eta_buf);
      phi = alpaka::getPtrNative(pixelQuintupletsBuffer.phi_buf);
      logicalLayers = alpaka::getPtrNative(pixelQuintupletsBuffer.logicalLayers_buf);
      hitIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.hitIndices_buf);
      lowerModuleIndices = alpaka::getPtrNative(pixelQuintupletsBuffer.lowerModuleIndices_buf);
      pixelRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.pixelRadius_buf);
      quintupletRadius = alpaka::getPtrNative(pixelQuintupletsBuffer.quintupletRadius_buf);
      centerX = alpaka::getPtrNative(pixelQuintupletsBuffer.centerX_buf);
      centerY = alpaka::getPtrNative(pixelQuintupletsBuffer.centerY_buf);
      rzChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rzChiSquared_buf);
      rPhiChiSquared = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquared_buf);
      rPhiChiSquaredInwards = alpaka::getPtrNative(pixelQuintupletsBuffer.rPhiChiSquaredInwards_buf);
    }
  };

  // Owning counterpart of the view: one alpaka buffer per column, sized for
  // maxPixelQuintuplets entries (x7 / x14 for the per-object arrays). The two
  // scalar counters are zeroed on construction.
  // NOTE(review): the template parameter lists and the Buf<...> element types
  // were stripped during extraction -- confirm against the original header.
  template
  struct pixelQuintupletsBuffer : pixelQuintuplets {
    Buf pixelIndices_buf;
    Buf T5Indices_buf;
    Buf nPixelQuintuplets_buf;
    Buf totOccupancyPixelQuintuplets_buf;
    Buf isDup_buf;
    Buf score_buf;
    Buf eta_buf;
    Buf phi_buf;
    Buf logicalLayers_buf;
    Buf hitIndices_buf;
    Buf lowerModuleIndices_buf;
    Buf pixelRadius_buf;
    Buf quintupletRadius_buf;
    Buf centerX_buf;
    Buf centerY_buf;
    Buf rzChiSquared_buf;
    Buf rPhiChiSquared_buf;
    Buf rPhiChiSquaredInwards_buf;

    template
    pixelQuintupletsBuffer(unsigned int maxPixelQuintuplets, TDevAcc const& devAccIn, TQueue& queue)
        : pixelIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          T5Indices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          nPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1, queue)),
          totOccupancyPixelQuintuplets_buf(allocBufWrapper(devAccIn, 1, queue)),
          isDup_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          score_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          eta_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          phi_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          logicalLayers_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets * 7, queue)),    // 7 layers per pT5
          hitIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets * 14, queue)),      // 14 hits per pT5
          lowerModuleIndices_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets * 7, queue)),
          pixelRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          quintupletRadius_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          centerX_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          centerY_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          rzChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          rPhiChiSquared_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)),
          rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)) {
      // Counters must start at zero before any kernel increments them.
      alpaka::memset(queue, nPixelQuintuplets_buf, 0u);
      alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0u);
      alpaka::wait(queue);  // make sure the memsets completed before first use
    }
  };

  // Write one accepted pT5 candidate into slot pixelQuintupletIndex, copying
  // its pixel-segment and T5 constituents plus the precomputed quality
  // variables. score/eta/phi and the circle parameters are stored in half
  // precision (__F2H); the chi^2 values stay full float.
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(struct SDL::modules& modulesInGPU,
                                                                 struct SDL::miniDoublets& mdsInGPU,
                                                                 struct SDL::segments& segmentsInGPU,
                                                                 struct SDL::quintuplets& quintupletsInGPU,
                                                                 struct SDL::pixelQuintuplets& pixelQuintupletsInGPU,
                                                                 unsigned int pixelIndex,
                                                                 unsigned int T5Index,
                                                                 unsigned int pixelQuintupletIndex,
                                                                 float& rzChiSquared,
                                                                 float& rPhiChiSquared,
                                                                 float& rPhiChiSquaredInwards,
                                                                 float score,
                                                                 float eta,
                                                                 float phi,
                                                                 float& pixelRadius,
                                                                 float& quintupletRadius,
                                                                 float& centerX,
                                                                 float& centerY) {
    pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex] = pixelIndex;
    pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex] = T5Index;
    pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = false;
    pixelQuintupletsInGPU.score[pixelQuintupletIndex] = __F2H(score);
    pixelQuintupletsInGPU.eta[pixelQuintupletIndex] = __F2H(eta);
    pixelQuintupletsInGPU.phi[pixelQuintupletIndex] = __F2H(phi);

    pixelQuintupletsInGPU.pixelRadius[pixelQuintupletIndex] = __F2H(pixelRadius);
    pixelQuintupletsInGPU.quintupletRadius[pixelQuintupletIndex] = __F2H(quintupletRadius);
    pixelQuintupletsInGPU.centerX[pixelQuintupletIndex] = __F2H(centerX);
    pixelQuintupletsInGPU.centerY[pixelQuintupletIndex] = __F2H(centerY);

    // Logical layers: slots 0-1 are the pixel part (always 0), slots 2-6
    // come from the five T5 layers.
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex] = 0;
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 1] = 0;
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 2] = quintupletsInGPU.logicalLayers[T5Index * 5];
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 3] = quintupletsInGPU.logicalLayers[T5Index * 5 + 1];
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 4] = quintupletsInGPU.logicalLayers[T5Index * 5 + 2];
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 5] = quintupletsInGPU.logicalLayers[T5Index * 5 + 3];
    pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex + 6] = quintupletsInGPU.logicalLayers[T5Index * 5 + 4];

    // Lower modules: slots 0-1 from the pixel segment, slots 2-6 from the T5.
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex] =
        segmentsInGPU.innerLowerModuleIndices[pixelIndex];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 1] =
        segmentsInGPU.outerLowerModuleIndices[pixelIndex];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 2] =
        quintupletsInGPU.lowerModuleIndices[T5Index * 5];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 3] =
        quintupletsInGPU.lowerModuleIndices[T5Index * 5 + 1];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 4] =
        quintupletsInGPU.lowerModuleIndices[T5Index * 5 + 2];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 5] =
        quintupletsInGPU.lowerModuleIndices[T5Index * 5 + 3];
    pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex + 6] =
        quintupletsInGPU.lowerModuleIndices[T5Index * 5 + 4];

    unsigned int pixelInnerMD = segmentsInGPU.mdIndices[2 * pixelIndex];
    unsigned int pixelOuterMD = segmentsInGPU.mdIndices[2 * pixelIndex + 1];

    // Hits: slots 0-3 are the four pixel hits, slots 4-13 the ten T5 hits.
    pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex] = mdsInGPU.anchorHitIndices[pixelInnerMD];
    pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 1] = mdsInGPU.outerHitIndices[pixelInnerMD];
    pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 2] = mdsInGPU.anchorHitIndices[pixelOuterMD];
    pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 3] = mdsInGPU.outerHitIndices[pixelOuterMD];

    pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 4] = quintupletsInGPU.hitIndices[10 * T5Index];
    pixelQuintupletsInGPU.hitIndices[14
* pixelQuintupletIndex + 5] = quintupletsInGPU.hitIndices[10 * T5Index + 1]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 6] = quintupletsInGPU.hitIndices[10 * T5Index + 2]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 7] = quintupletsInGPU.hitIndices[10 * T5Index + 3]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 8] = quintupletsInGPU.hitIndices[10 * T5Index + 4]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 9] = quintupletsInGPU.hitIndices[10 * T5Index + 5]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 10] = quintupletsInGPU.hitIndices[10 * T5Index + 6]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 11] = quintupletsInGPU.hitIndices[10 * T5Index + 7]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 12] = quintupletsInGPU.hitIndices[10 * T5Index + 8]; + pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex + 13] = quintupletsInGPU.hitIndices[10 * T5Index + 9]; + + pixelQuintupletsInGPU.rzChiSquared[pixelQuintupletIndex] = rzChiSquared; + pixelQuintupletsInGPU.rPhiChiSquared[pixelQuintupletIndex] = rPhiChiSquared; + pixelQuintupletsInGPU.rPhiChiSquaredInwards[pixelQuintupletIndex] = rPhiChiSquaredInwards; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RZChiSquaredCuts(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + uint16_t& lowerModuleIndex4, + uint16_t& lowerModuleIndex5, + float& rzChiSquared) { + const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS); + const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + 
modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS); + const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS); + const int layer4 = modulesInGPU.layers[lowerModuleIndex4] + + 6 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == SDL::TwoS); + const int layer5 = modulesInGPU.layers[lowerModuleIndex5] + + 6 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == SDL::TwoS); + + if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + if (layer4 == 12 and layer5 == 13) { + return rzChiSquared < 451.141f; + } else if (layer4 == 4 and layer5 == 12) { + return rzChiSquared < 392.654f; + } else if (layer4 == 4 and layer5 == 5) { + return rzChiSquared < 225.322f; + } else if (layer4 == 7 and layer5 == 13) { + return rzChiSquared < 595.546f; + } else if (layer4 == 7 and layer5 == 8) { + return rzChiSquared < 196.111f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + if (layer4 == 13 and layer5 == 14) { + return rzChiSquared < 297.446f; + } else if (layer4 == 8 and layer5 == 14) { + return rzChiSquared < 451.141f; + } else if (layer4 == 8 and layer5 == 9) { + return rzChiSquared < 518.339f; + } + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) { + if (layer4 == 9 and layer5 == 10) { + return rzChiSquared < 341.75f; + } else if (layer4 == 9 and layer5 == 15) { + return rzChiSquared < 341.75f; + } + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + if (layer4 == 12 and layer5 == 13) { + return rzChiSquared < 392.655f; + } else if (layer4 == 5 and layer5 == 12) { + return rzChiSquared < 341.75f; + } else if 
(layer4 == 5 and layer5 == 6) { + return rzChiSquared < 112.537f; + } + } else if (layer1 == 2 and layer2 == 3 and layer4 == 7) { + if (layer4 == 13 and layer5 == 14) { + return rzChiSquared < 595.545f; + } else if (layer4 == 8 and layer5 == 14) { + return rzChiSquared < 74.198f; + } + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + if (layer4 == 14 and layer5 == 15) { + return rzChiSquared < 518.339f; + } else if (layer4 == 9 and layer5 == 10) { + return rzChiSquared < 8.046f; + } else if (layer4 == 9 and layer5 == 15) { + return rzChiSquared < 451.141f; + } + } else if (layer1 == 3 and layer2 == 7 and layer3 == 8 and layer4 == 14 and layer5 == 15) { + return rzChiSquared < 56.207f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) { + if (layer4 == 10 and layer5 == 11) { + return rzChiSquared < 64.578f; + } else if (layer4 == 10 and layer5 == 16) { + return rzChiSquared < 85.250f; + } else if (layer4 == 15 and layer5 == 16) { + return rzChiSquared < 85.250f; + } + } + return true; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredCuts(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + uint16_t& lowerModuleIndex4, + uint16_t& lowerModuleIndex5, + float rPhiChiSquared) { + const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS); + const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS); + const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap 
                            and
                            modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS);
    const int layer4 = modulesInGPU.layers[lowerModuleIndex4] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex4] == SDL::TwoS);
    const int layer5 = modulesInGPU.layers[lowerModuleIndex5] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex5] == SDL::TwoS);

    // Tuned r-phi chi^2 thresholds per (layer1..layer5) pattern; any pattern
    // without a tuned threshold passes unconditionally.
    if (layer1 == 1 and layer2 == 2 and layer3 == 3) {
      if (layer4 == 12 and layer5 == 13) {
        return rPhiChiSquared < 48.921f;
      } else if (layer4 == 4 and layer5 == 12) {
        return rPhiChiSquared < 97.948f;
      } else if (layer4 == 4 and layer5 == 5) {
        return rPhiChiSquared < 129.3f;
      } else if (layer4 == 7 and layer5 == 13) {
        return rPhiChiSquared < 56.21f;
      } else if (layer4 == 7 and layer5 == 8) {
        return rPhiChiSquared < 74.198f;
      }
    } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) {
      if (layer4 == 13 and layer5 == 14) {
        return rPhiChiSquared < 21.265f;
      } else if (layer4 == 8 and layer5 == 14) {
        return rPhiChiSquared < 37.058f;
      } else if (layer4 == 8 and layer5 == 9) {
        return rPhiChiSquared < 42.578f;
      }
    } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) {
      if (layer4 == 9 and layer5 == 10) {
        return rPhiChiSquared < 32.253f;
      } else if (layer4 == 9 and layer5 == 15) {
        return rPhiChiSquared < 37.058f;
      }
    } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) {
      if (layer4 == 12 and layer5 == 13) {
        return rPhiChiSquared < 97.947f;
      } else if (layer4 == 5 and layer5 == 12) {
        return rPhiChiSquared < 129.3f;
      } else if (layer4 == 5 and layer5 == 6) {
        return rPhiChiSquared < 170.68f;
      }
    } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) {
      if (layer4 == 13 and layer5 == 14) {
        return rPhiChiSquared < 48.92f;
      } else if (layer4 == 8 and layer5 == 14) {
        return rPhiChiSquared < 74.2f;
      }
    } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) {
      if (layer4 == 14 and layer5 == 15) {
        return rPhiChiSquared < 42.58f;
      } else if (layer4 == 9 and layer5 == 10) {
        return rPhiChiSquared < 37.06f;
      } else if (layer4 == 9 and layer5 == 15) {
        return rPhiChiSquared < 48.92f;
      }
    } else if (layer1 == 3 and layer2 == 7 and layer3 == 8 and layer4 == 14 and layer5 == 15) {
      return rPhiChiSquared < 85.25f;
    } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) {
      if (layer4 == 10 and layer5 == 11) {
        return rPhiChiSquared < 42.58f;
      } else if (layer4 == 10 and layer5 == 16) {
        return rPhiChiSquared < 37.06f;
      } else if (layer4 == 15 and layer5 == 16) {
        return rPhiChiSquared < 37.06f;
      }
    }
    return true;
  };

  // Chi^2 of nPoints (x,y) hits against a circle with center (g,f) and the
  // given radius. Per-hit uncertainties delta1/delta2 are combined after
  // rotating each hit into its module frame (slopes/isFlat, as filled by
  // computeSigmasForRegression_pT5).
  // NOTE(review): the template parameter list (presumably <typename TAcc>)
  // was lost in extraction -- confirm against the original header.
  template
  ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeChiSquaredpT5(TAcc const& acc,
                                                            unsigned int nPoints,
                                                            float* xs,
                                                            float* ys,
                                                            float* delta1,
                                                            float* delta2,
                                                            float* slopes,
                                                            bool* isFlat,
                                                            float g,
                                                            float f,
                                                            float radius) {
    /*
    Given values of (g, f, radius) and a set of points (and its uncertainties) compute chi squared
    */
    float c = g * g + f * f - radius * radius;
    float chiSquared = 0.f;
    float absArctanSlope, angleM, xPrime, yPrime, sigma;
    for (size_t i = 0; i < nPoints; i++) {
      // Strip angle; sentinel slope SDL_INF means a vertical strip -> pi/2.
      absArctanSlope = ((slopes[i] != SDL::SDL_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i]))
                                                    : 0.5f * float(M_PI));
      // Quadrant-resolved rotation angle for this hit position.
      if (xs[i] > 0 and ys[i] > 0) {
        angleM = 0.5f * float(M_PI) - absArctanSlope;
      } else if (xs[i] < 0 and ys[i] > 0) {
        angleM = absArctanSlope + 0.5f * float(M_PI);
      } else if (xs[i] < 0 and ys[i] < 0) {
        angleM = -(absArctanSlope + 0.5f * float(M_PI));
      } else if (xs[i] > 0 and ys[i] < 0) {
        angleM = -(0.5f * float(M_PI) - absArctanSlope);
      } else {
        angleM = 0;
      }
      // Rotate non-flat (tilted/endcap) hits into the frame where
      // delta1/delta2 are defined; flat barrel hits need no rotation.
      if (not isFlat[i]) {
        xPrime = xs[i] * alpaka::math::cos(acc, angleM) + ys[i] * alpaka::math::sin(acc, angleM);
        yPrime = ys[i] * alpaka::math::cos(acc, angleM) - xs[i] * alpaka::math::sin(acc, angleM);
      } else {
        xPrime = xs[i];
        yPrime = ys[i];
      }
      sigma = 2 * alpaka::math::sqrt(
                      acc, (xPrime * delta1[i]) * (xPrime * delta1[i]) + (yPrime * delta2[i]) * (yPrime * delta2[i]));
      chiSquared += (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) *
                    (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) / (sigma * sigma);
    }
    return chiSquared;
  };

  // Fill per-hit uncertainties (delta1/delta2), strip slopes and flatness
  // flags for the modules used in the pT5 circle regression.
  // NOTE(review): the template parameter list (presumably <typename TAcc>)
  // was lost in extraction -- confirm against the original header.
  template
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeSigmasForRegression_pT5(TAcc const& acc,
                                                                     SDL::modules& modulesInGPU,
                                                                     const uint16_t* lowerModuleIndices,
                                                                     float* delta1,
                                                                     float* delta2,
                                                                     float* slopes,
                                                                     bool* isFlat,
                                                                     unsigned int nPoints = 5,
                                                                     bool anchorHits = true) {
    /*
    bool anchorHits required to deal with a weird edge case wherein
    the hits ultimately used in the regression are anchor hits, but the
    lower modules need not all be Pixel Modules (in case of PS). Similarly,
    when we compute the chi squared for the non-anchor hits, the "partner module"
    need not always be a PS strip module, but all non-anchor hits sit on strip
    modules.
    */
    ModuleType moduleType;
    short moduleSubdet, moduleSide;
    // Uncertainty scale factors expressed in units of the pixel pitch
    // (0.009 cm): inv1 ~ pixel, inv2 ~ PS strip, inv3 ~ 2S strip.
    float inv1 = 0.01f / 0.009f;
    float inv2 = 0.15f / 0.009f;
    float inv3 = 2.4f / 0.009f;
    for (size_t i = 0; i < nPoints; i++) {
      moduleType = modulesInGPU.moduleType[lowerModuleIndices[i]];
      moduleSubdet = modulesInGPU.subdets[lowerModuleIndices[i]];
      moduleSide = modulesInGPU.sides[lowerModuleIndices[i]];
      const float& drdz = modulesInGPU.drdzs[lowerModuleIndices[i]];
      slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]];
      //category 1 - barrel PS flat
      if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) {
        delta1[i] = inv1;
        delta2[i] = inv1;
        slopes[i] = -999.f;  // sentinel: slope unused for flat modules
        isFlat[i] = true;
      }
      //category 2 - barrel 2S
      else if (moduleSubdet == Barrel and moduleType == TwoS) {
        delta1[i] = 1.f;
        delta2[i] = 1.f;
        slopes[i] = -999.f;
        isFlat[i] = true;
      }
      //category 3 - barrel PS tilted
      else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) {
        delta1[i] = inv1;
        isFlat[i] = false;

        // Project the strip uncertainty along the tilt (drdz) direction.
        if (anchorHits) {
          delta2[i] = (inv2 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz));
        } else {
          delta2[i] = (inv3 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz));
        }
      }
      //category 4 - endcap PS
      else if (moduleSubdet == Endcap and moduleType == PS) {
        delta1[i] = inv1;
        isFlat[i] = false;
        /*
        despite the type of the module layer of the lower module index,
        all anchor hits are on the pixel side and all non-anchor hits are
        on the strip side!
        */
        if (anchorHits) {
          delta2[i] = inv2;
        } else {
          delta2[i] = inv3;
        }
      }
      //category 5 - endcap 2S
      else if (moduleSubdet == Endcap and moduleType == TwoS) {
        delta1[i] = 1.f;
        delta2[i] = 500.f * inv1;  // large weight: 2S endcap strips are very coarse here
        isFlat[i] = false;
      }
#ifdef Warnings
      else {
        printf("ERROR!!!!! I SHOULDN'T BE HERE!!!! subdet = %d, type = %d, side = %d\n",
               moduleSubdet,
               moduleType,
               moduleSide);
      }
#endif
    }
  };

  // r-phi chi^2 of the five outer-tracker anchor hits against the circle
  // with center (g,f) and the given radius.
  // NOTE(review): the template parameter list (presumably <typename TAcc>)
  // was lost in extraction -- confirm against the original header.
  template
  ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RPhiChiSquared(TAcc const& acc,
                                                                struct SDL::modules& modulesInGPU,
                                                                uint16_t* lowerModuleIndices,
                                                                float& g,
                                                                float& f,
                                                                float& radius,
                                                                float* xs,
                                                                float* ys) {
    /*
    Compute circle parameters from 3 pixel hits, and then use them to compute the chi squared for the outer hits
    */

    float delta1[5], delta2[5], slopes[5];
    bool isFlat[5];
    float chiSquared = 0;

    computeSigmasForRegression_pT5(acc, modulesInGPU, lowerModuleIndices, delta1, delta2, slopes, isFlat);
    chiSquared = computeChiSquaredpT5(acc, 5, xs, ys, delta1, delta2, slopes, isFlat, g, f, radius);

    return chiSquared;
  };

  // Chi^2 of the two pixel hits against the regression circle (center (g,f),
  // radius r); sum of squared circle-equation residuals, halved.
  ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RPhiChiSquaredInwards(
      struct SDL::modules& modulesInGPU, float& g, float& f, float& r, float* xPix, float* yPix) {
    /*
    Using the computed regression center and radius, compute the chi squared for the pixels
    */

    float chiSquared = 0;
    for (size_t i = 0; i < 2; i++) {
      float residual = (xPix[i] - g) * (xPix[i] - g) + (yPix[i] - f) * (yPix[i] - f) - r * r;
      chiSquared += residual * residual;
    }
    chiSquared *= 0.5f;
    return chiSquared;
  };

  // Layer-pattern-dependent upper bound on the inward r-phi chi^2 (same
  // layer encoding as the other pass* helpers in this header).
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredInwardsCuts(struct SDL::modules& modulesInGPU,
                                                                       uint16_t& lowerModuleIndex1,
                                                                       uint16_t& lowerModuleIndex2,
                                                                       uint16_t& lowerModuleIndex3,
                                                                       uint16_t& lowerModuleIndex4,
                                                                       uint16_t& lowerModuleIndex5,
                                                                       float rPhiChiSquared) {
    const int layer1 = modulesInGPU.layers[lowerModuleIndex1] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS);
    const int layer2 = modulesInGPU.layers[lowerModuleIndex2] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS);
    const int layer3 = modulesInGPU.layers[lowerModuleIndex3] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS);
    const int layer4 = modulesInGPU.layers[lowerModuleIndex4] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex4] == SDL::TwoS);
    const int layer5 = modulesInGPU.layers[lowerModuleIndex5] +
                       6 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) +
                       5 * (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap and
                            modulesInGPU.moduleType[lowerModuleIndex5] == SDL::TwoS);

    if (layer1 == 1 and layer2 == 2 and layer3 == 3) {
      if (layer4 == 12 and layer5 == 13) {
        return rPhiChiSquared < 451.141f;
      } else if (layer4 == 4 and layer5 == 12) {
        return rPhiChiSquared < 786.173f;
      } else if (layer4 == 4 and layer5 == 5) {
        return rPhiChiSquared < 595.545f;
      } else if (layer4 == 7 and layer5 == 13) {
        return rPhiChiSquared < 581.339f;
      } else if (layer4 == 7 and layer5 == 8) {
        return rPhiChiSquared < 112.537f;
      }
    } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) {
      if (layer4 == 13 and layer5 == 14) {
        return rPhiChiSquared < 225.322f;
      } else if (layer4 == 8 and layer5 == 14) {
        return rPhiChiSquared < 1192.402f;
      } else if (layer4 == 8 and layer5 == 9) {
        return rPhiChiSquared < 786.173f;
      }
    } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) {
      if (layer4 == 9 and layer5 == 10) {
        return rPhiChiSquared < 1037.817f;
      } else if (layer4 == 9 and layer5 == 15) {
        return rPhiChiSquared < 1808.536f;
      }
    } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) {
      if (layer4 == 12 and layer5 == 13) {
        return rPhiChiSquared < 684.253f;
      } else if (layer4 == 5 and layer5 == 12) {
        return rPhiChiSquared < 684.253f;
      } else if (layer4 == 5 and layer5 == 6) {
        return rPhiChiSquared < 684.253f;
      }
    } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) {
      if (layer4 == 13 and layer5 == 14) {
        return rPhiChiSquared < 451.141f;
      } else if (layer4 == 8 and layer5 == 14) {
        return rPhiChiSquared < 518.34f;
      }
    } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) {
      if (layer4 == 14 and layer5 == 15) {
        return rPhiChiSquared < 2077.92f;
      } else if (layer4 == 9 and layer5 == 10) {
        return rPhiChiSquared < 74.20f;
      } else if (layer4 == 9 and layer5 == 15) {
        return rPhiChiSquared < 1808.536f;
      }
    } else if (layer1 == 3 and layer2 == 7 and layer3 == 8 and layer4 == 14 and layer5 == 15) {
      return rPhiChiSquared < 786.173f;
    } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) {
      if (layer4 == 10 and layer5 == 11) {
        return rPhiChiSquared < 1574.076f;
      } else if (layer4 == 10 and layer5 == 16) {
        return rPhiChiSquared < 5492.11f;
      } else if (layer4 == 15 and layer5 == 16) {
        return rPhiChiSquared < 2743.037f;
      }
    }
    return true;
  };

  // Full pT5 selection: runs the pixel-triplet algorithm on the inner T3,
  // then the rz / r-phi chi^2 cuts. (Definition continues beyond this span.)
  // NOTE(review): the template parameter list (presumably <typename TAcc>)
  // was lost in extraction -- confirm against the original header.
  template
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const& acc,
                                                                    struct SDL::modules& modulesInGPU,
                                                                    struct SDL::objectRanges& rangesInGPU,
                                                                    struct SDL::miniDoublets& mdsInGPU,
                                                                    struct SDL::segments& segmentsInGPU,
                                                                    struct SDL::triplets& tripletsInGPU,
                                                                    struct SDL::quintuplets& quintupletsInGPU,
                                                                    unsigned int& pixelSegmentIndex,
                                                                    unsigned int& quintupletIndex,
                                                                    float& rzChiSquared,
                                                                    float& rPhiChiSquared,
                                                                    float& rPhiChiSquaredInwards,
                                                                    float& pixelRadius,
                                                                    float& quintupletRadius,
                                                                    float& centerX,
                                                                    float& centerY,
                                                                    unsigned int pixelSegmentArrayIndex) {
    bool pass = true;

    unsigned int T5InnerT3Index = quintupletsInGPU.tripletIndices[2 * quintupletIndex];
    unsigned int T5OuterT3Index = quintupletsInGPU.tripletIndices[2 * quintupletIndex + 1];

    float pixelRadiusTemp, pixelRadiusError,
tripletRadius, rPhiChiSquaredTemp, rzChiSquaredTemp, + rPhiChiSquaredInwardsTemp, centerXTemp, centerYTemp; + + pass = pass and runPixelTripletDefaultAlgo(acc, + modulesInGPU, + rangesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + pixelSegmentIndex, + T5InnerT3Index, + pixelRadiusTemp, + pixelRadiusError, + tripletRadius, + centerXTemp, + centerYTemp, + rzChiSquaredTemp, + rPhiChiSquaredTemp, + rPhiChiSquaredInwardsTemp, + false); + if (not pass) + return false; + + unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * T5InnerT3Index]; + unsigned int secondSegmentIndex = tripletsInGPU.segmentIndices[2 * T5InnerT3Index + 1]; + unsigned int thirdSegmentIndex = tripletsInGPU.segmentIndices[2 * T5OuterT3Index]; + unsigned int fourthSegmentIndex = tripletsInGPU.segmentIndices[2 * T5OuterT3Index + 1]; + + unsigned int pixelInnerMDIndex = segmentsInGPU.mdIndices[2 * pixelSegmentIndex]; + unsigned int pixelOuterMDIndex = segmentsInGPU.mdIndices[2 * pixelSegmentIndex + 1]; + unsigned int firstMDIndex = segmentsInGPU.mdIndices[2 * firstSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * secondSegmentIndex]; + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * secondSegmentIndex + 1]; + unsigned int fourthMDIndex = segmentsInGPU.mdIndices[2 * thirdSegmentIndex + 1]; + unsigned int fifthMDIndex = segmentsInGPU.mdIndices[2 * fourthSegmentIndex + 1]; + + uint16_t lowerModuleIndex1 = quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex]; + uint16_t lowerModuleIndex2 = quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 1]; + uint16_t lowerModuleIndex3 = quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 2]; + uint16_t lowerModuleIndex4 = quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 3]; + uint16_t lowerModuleIndex5 = quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 4]; + + uint16_t lowerModuleIndices[5] = { + lowerModuleIndex1, lowerModuleIndex2, lowerModuleIndex3, lowerModuleIndex4, 
lowerModuleIndex5}; + + float zPix[2] = {mdsInGPU.anchorZ[pixelInnerMDIndex], mdsInGPU.anchorZ[pixelOuterMDIndex]}; + float rtPix[2] = {mdsInGPU.anchorRt[pixelInnerMDIndex], mdsInGPU.anchorRt[pixelOuterMDIndex]}; + float zs[5] = {mdsInGPU.anchorZ[firstMDIndex], + mdsInGPU.anchorZ[secondMDIndex], + mdsInGPU.anchorZ[thirdMDIndex], + mdsInGPU.anchorZ[fourthMDIndex], + mdsInGPU.anchorZ[fifthMDIndex]}; + float rts[5] = {mdsInGPU.anchorRt[firstMDIndex], + mdsInGPU.anchorRt[secondMDIndex], + mdsInGPU.anchorRt[thirdMDIndex], + mdsInGPU.anchorRt[fourthMDIndex], + mdsInGPU.anchorRt[fifthMDIndex]}; + + rzChiSquared = computePT5RZChiSquared(acc, modulesInGPU, lowerModuleIndices, rtPix, zPix, rts, zs); + + if (/*pixelRadius*/ 0 < 5.0f * kR1GeVf) { // FIXME: pixelRadius is not defined yet + pass = pass and passPT5RZChiSquaredCuts(modulesInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + lowerModuleIndex5, + rzChiSquared); + if (not pass) + return pass; + } + + //outer T5 + float xs[5] = {mdsInGPU.anchorX[firstMDIndex], + mdsInGPU.anchorX[secondMDIndex], + mdsInGPU.anchorX[thirdMDIndex], + mdsInGPU.anchorX[fourthMDIndex], + mdsInGPU.anchorX[fifthMDIndex]}; + float ys[5] = {mdsInGPU.anchorY[firstMDIndex], + mdsInGPU.anchorY[secondMDIndex], + mdsInGPU.anchorY[thirdMDIndex], + mdsInGPU.anchorY[fourthMDIndex], + mdsInGPU.anchorY[fifthMDIndex]}; + + //get the appropriate radii and centers + centerX = segmentsInGPU.circleCenterX[pixelSegmentArrayIndex]; + centerY = segmentsInGPU.circleCenterY[pixelSegmentArrayIndex]; + pixelRadius = segmentsInGPU.circleRadius[pixelSegmentArrayIndex]; + + float T5CenterX = quintupletsInGPU.regressionG[quintupletIndex]; + float T5CenterY = quintupletsInGPU.regressionF[quintupletIndex]; + quintupletRadius = quintupletsInGPU.regressionRadius[quintupletIndex]; + + rPhiChiSquared = + computePT5RPhiChiSquared(acc, modulesInGPU, lowerModuleIndices, centerX, centerY, pixelRadius, xs, ys); + + if (pixelRadius < 5.0f * 
kR1GeVf) { + pass = pass and passPT5RPhiChiSquaredCuts(modulesInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + lowerModuleIndex5, + rPhiChiSquared); + if (not pass) + return pass; + } + + float xPix[] = {mdsInGPU.anchorX[pixelInnerMDIndex], mdsInGPU.anchorX[pixelOuterMDIndex]}; + float yPix[] = {mdsInGPU.anchorY[pixelInnerMDIndex], mdsInGPU.anchorY[pixelOuterMDIndex]}; + rPhiChiSquaredInwards = + computePT5RPhiChiSquaredInwards(modulesInGPU, T5CenterX, T5CenterY, quintupletRadius, xPix, yPix); + + if (quintupletsInGPU.regressionRadius[quintupletIndex] < 5.0f * kR1GeVf) { + pass = pass and passPT5RPhiChiSquaredInwardsCuts(modulesInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + lowerModuleIndex5, + rPhiChiSquaredInwards); + if (not pass) + return pass; + } + //trusting the T5 regression center to also be a good estimate.. + centerX = (centerX + T5CenterX) / 2; + centerY = (centerY + T5CenterY) / 2; + + //other cuts will be filled here! + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RZChiSquared(TAcc const& acc, + struct SDL::modules& modulesInGPU, + uint16_t* lowerModuleIndices, + float* rtPix, + float* zPix, + float* rts, + float* zs) { + //use the two anchor hits of the pixel segment to compute the slope + //then compute the pseudo chi squared of the five outer hits + + float slope = (zPix[1] - zPix[0]) / (rtPix[1] - rtPix[0]); + float residual = 0; + float error = 0; + //hardcoded array indices!!! + float RMSE = 0; + for (size_t i = 0; i < 5; i++) { + uint16_t& lowerModuleIndex = lowerModuleIndices[i]; + const int moduleType = modulesInGPU.moduleType[lowerModuleIndex]; + const int moduleSide = modulesInGPU.sides[lowerModuleIndex]; + const int moduleSubdet = modulesInGPU.subdets[lowerModuleIndex]; + + residual = (moduleSubdet == SDL::Barrel) ? 
(zs[i] - zPix[0]) - slope * (rts[i] - rtPix[0]) + : (rts[i] - rtPix[0]) - (zs[i] - zPix[0]) / slope; + const float& drdz = modulesInGPU.drdzs[lowerModuleIndex]; + //PS Modules + if (moduleType == 0) { + error = 0.15f; + } else //2S modules + { + error = 5.0f; + } + + //special dispensation to tilted PS modules! + if (moduleType == 0 and moduleSubdet == SDL::Barrel and moduleSide != Center) { + error /= alpaka::math::sqrt(acc, 1.f + drdz * drdz); + } + RMSE += (residual * residual) / (error * error); + } + + RMSE = alpaka::math::sqrt(acc, 0.2f * RMSE); + return RMSE; + }; + + struct createPixelQuintupletsInGPUFromMapv2 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::pixelQuintuplets pixelQuintupletsInGPU, + unsigned int* connectedPixelSize, + unsigned int* connectedPixelIndex, + unsigned int nPixelSegments, + struct SDL::objectRanges rangesInGPU) const { + auto const globalBlockIdx = alpaka::getIdx(acc); + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridBlockExtent = alpaka::getWorkDiv(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (unsigned int i_pLS = globalThreadIdx[1]; i_pLS < nPixelSegments; i_pLS += gridThreadExtent[1]) { + auto iLSModule_max = connectedPixelIndex[i_pLS] + connectedPixelSize[i_pLS]; + for (unsigned int iLSModule = connectedPixelIndex[i_pLS] + globalBlockIdx[0]; iLSModule < iLSModule_max; + iLSModule += gridBlockExtent[0]) { + //these are actual module indices + uint16_t quintupletLowerModuleIndex = modulesInGPU.connectedPixels[iLSModule]; + if (quintupletLowerModuleIndex >= *modulesInGPU.nLowerModules) + continue; + if (modulesInGPU.moduleType[quintupletLowerModuleIndex] == SDL::TwoS) + continue; + uint16_t pixelModuleIndex = *modulesInGPU.nLowerModules; + if 
(segmentsInGPU.isDup[i_pLS])
              continue;  // skip duplicate pixel segments
            unsigned int nOuterQuintuplets = quintupletsInGPU.nQuintuplets[quintupletLowerModuleIndex];

            if (nOuterQuintuplets == 0)
              continue;  // no T5s seeded on this module

            // Global index of this pixel segment in the segments SoA.
            unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + i_pLS;

            //fetch the quintuplet
            for (unsigned int outerQuintupletArrayIndex = globalThreadIdx[2];
                 outerQuintupletArrayIndex < nOuterQuintuplets;
                 outerQuintupletArrayIndex += gridThreadExtent[2]) {
              unsigned int quintupletIndex =
                  rangesInGPU.quintupletModuleIndices[quintupletLowerModuleIndex] + outerQuintupletArrayIndex;

              if (quintupletsInGPU.isDup[quintupletIndex])
                continue;

              float rzChiSquared, rPhiChiSquared, rPhiChiSquaredInwards, pixelRadius, quintupletRadius, centerX, centerY;

              // Run the full pT5 selection; the chi2/radius/center locals are
              // filled through the out-parameters.
              // NOTE(review): template arguments of static_cast / atomicOp below
              // appear stripped by extraction (e.g. static_cast<unsigned int>).
              bool success = runPixelQuintupletDefaultAlgo(acc,
                                                           modulesInGPU,
                                                           rangesInGPU,
                                                           mdsInGPU,
                                                           segmentsInGPU,
                                                           tripletsInGPU,
                                                           quintupletsInGPU,
                                                           pixelSegmentIndex,
                                                           quintupletIndex,
                                                           rzChiSquared,
                                                           rPhiChiSquared,
                                                           rPhiChiSquaredInwards,
                                                           pixelRadius,
                                                           quintupletRadius,
                                                           centerX,
                                                           centerY,
                                                           static_cast(i_pLS));
              if (success) {
                // Reserve one slot; occupancy counter is bumped even when the
                // container is full so the overflow can be detected/reported.
                unsigned int totOccupancyPixelQuintuplets =
                    alpaka::atomicOp(acc, pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, 1u);
                if (totOccupancyPixelQuintuplets >= N_MAX_PIXEL_QUINTUPLETS) {
#ifdef Warnings
                  printf("Pixel Quintuplet excess alert!\n");
#endif
                } else {
                  unsigned int pixelQuintupletIndex =
                      alpaka::atomicOp(acc, pixelQuintupletsInGPU.nPixelQuintuplets, 1u);
                  // eta/phi are stored as half precision in the T5 SoA.
                  float eta = __H2F(quintupletsInGPU.eta[quintupletIndex]);
                  float phi = __H2F(quintupletsInGPU.phi[quintupletIndex]);

                  // NOTE(review): rPhiChiSquared is passed twice below --
                  // presumably the second occurrence fills a score slot;
                  // confirm against the addPixelQuintupletToMemory signature.
                  addPixelQuintupletToMemory(modulesInGPU,
                                             mdsInGPU,
                                             segmentsInGPU,
                                             quintupletsInGPU,
                                             pixelQuintupletsInGPU,
                                             pixelSegmentIndex,
                                             quintupletIndex,
                                             pixelQuintupletIndex,
                                             rzChiSquared,
                                             rPhiChiSquared,
                                             rPhiChiSquaredInwards,
                                             rPhiChiSquared,
                                             eta,
                                             phi,
                                             pixelRadius,
                                             quintupletRadius,
                                             centerX,
                                             centerY);
tripletsInGPU.partOfPT5[quintupletsInGPU.tripletIndices[2 * quintupletIndex]] = true; + tripletsInGPU.partOfPT5[quintupletsInGPU.tripletIndices[2 * quintupletIndex + 1]] = true; + segmentsInGPU.partOfPT5[i_pLS] = true; + quintupletsInGPU.partOfPT5[quintupletIndex] = true; + } // tot occupancy + } // end success + } // end T5 + } // end iLS + } // end i_pLS + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h new file mode 100644 index 0000000000000..cc17012019d6d --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -0,0 +1,3237 @@ +#ifndef Quintuplet_cuh +#define Quintuplet_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "NeuralNetwork.h" +#include "EndcapGeometry.h" +#include "Segment.h" +#include "MiniDoublet.h" +#include "Hit.h" +#include "Triplet.h" + +namespace SDL { + struct quintuplets { + unsigned int* tripletIndices; + uint16_t* lowerModuleIndices; + unsigned int* nQuintuplets; + unsigned int* totOccupancyQuintuplets; + unsigned int* nMemoryLocations; + + FPX* innerRadius; + FPX* bridgeRadius; + FPX* outerRadius; + FPX* pt; + FPX* eta; + FPX* phi; + FPX* score_rphisum; + uint8_t* layer; + char* isDup; + bool* TightCutFlag; + bool* partOfPT5; + + float* regressionRadius; + float* regressionG; + float* regressionF; + + uint8_t* logicalLayers; + unsigned int* hitIndices; + float* rzChiSquared; + float* chiSquared; + float* nonAnchorChiSquared; + + template + void setData(TBuff& quintupletsbuf) { + tripletIndices = alpaka::getPtrNative(quintupletsbuf.tripletIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(quintupletsbuf.lowerModuleIndices_buf); + nQuintuplets = alpaka::getPtrNative(quintupletsbuf.nQuintuplets_buf); + totOccupancyQuintuplets = 
alpaka::getPtrNative(quintupletsbuf.totOccupancyQuintuplets_buf); + nMemoryLocations = alpaka::getPtrNative(quintupletsbuf.nMemoryLocations_buf); + innerRadius = alpaka::getPtrNative(quintupletsbuf.innerRadius_buf); + bridgeRadius = alpaka::getPtrNative(quintupletsbuf.bridgeRadius_buf); + outerRadius = alpaka::getPtrNative(quintupletsbuf.outerRadius_buf); + pt = alpaka::getPtrNative(quintupletsbuf.pt_buf); + eta = alpaka::getPtrNative(quintupletsbuf.eta_buf); + phi = alpaka::getPtrNative(quintupletsbuf.phi_buf); + score_rphisum = alpaka::getPtrNative(quintupletsbuf.score_rphisum_buf); + layer = alpaka::getPtrNative(quintupletsbuf.layer_buf); + isDup = alpaka::getPtrNative(quintupletsbuf.isDup_buf); + TightCutFlag = alpaka::getPtrNative(quintupletsbuf.TightCutFlag_buf); + partOfPT5 = alpaka::getPtrNative(quintupletsbuf.partOfPT5_buf); + regressionRadius = alpaka::getPtrNative(quintupletsbuf.regressionRadius_buf); + regressionG = alpaka::getPtrNative(quintupletsbuf.regressionG_buf); + regressionF = alpaka::getPtrNative(quintupletsbuf.regressionF_buf); + logicalLayers = alpaka::getPtrNative(quintupletsbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(quintupletsbuf.hitIndices_buf); + rzChiSquared = alpaka::getPtrNative(quintupletsbuf.rzChiSquared_buf); + chiSquared = alpaka::getPtrNative(quintupletsbuf.chiSquared_buf); + nonAnchorChiSquared = alpaka::getPtrNative(quintupletsbuf.nonAnchorChiSquared_buf); + } + }; + + template + struct quintupletsBuffer : quintuplets { + Buf tripletIndices_buf; + Buf lowerModuleIndices_buf; + Buf nQuintuplets_buf; + Buf totOccupancyQuintuplets_buf; + Buf nMemoryLocations_buf; + + Buf innerRadius_buf; + Buf bridgeRadius_buf; + Buf outerRadius_buf; + Buf pt_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_rphisum_buf; + Buf layer_buf; + Buf isDup_buf; + Buf TightCutFlag_buf; + Buf partOfPT5_buf; + + Buf regressionRadius_buf; + Buf regressionG_buf; + Buf regressionF_buf; + + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf 
rzChiSquared_buf; + Buf chiSquared_buf; + Buf nonAnchorChiSquared_buf; + + template + quintupletsBuffer(unsigned int nTotalQuintuplets, unsigned int nLowerModules, TDevAcc const& devAccIn, TQueue& queue) + : tripletIndices_buf(allocBufWrapper(devAccIn, 2 * nTotalQuintuplets, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets, queue)), + nQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + totOccupancyQuintuplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + innerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + bridgeRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + outerRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + pt_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + eta_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + phi_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + score_rphisum_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + layer_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + isDup_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + TightCutFlag_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionRadius_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionG_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + regressionF_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, 5 * nTotalQuintuplets, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, 10 * nTotalQuintuplets, queue)), + rzChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + chiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)), + nonAnchorChiSquared_buf(allocBufWrapper(devAccIn, nTotalQuintuplets, queue)) { + alpaka::memset(queue, nQuintuplets_buf, 0u); + 
alpaka::memset(queue, totOccupancyQuintuplets_buf, 0u);
      alpaka::memset(queue, isDup_buf, 0u);
      alpaka::memset(queue, TightCutFlag_buf, false);
      alpaka::memset(queue, partOfPT5_buf, false);
      // Block until the zero-initialisation above has completed.
      alpaka::wait(queue);
    }
  };

  // Returns true when the two intervals [firstMin, firstMax) and
  // [secondMin, secondMax) overlap.
  // NOTE(review): the boundary treatment is asymmetric (<= on firstMin vs
  // strict < on secondMin); verify this is intentional before relying on
  // exact-boundary cases.
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool checkIntervalOverlap(const float& firstMin,
                                                           const float& firstMax,
                                                           const float& secondMin,
                                                           const float& secondMax) {
    return ((firstMin <= secondMin) && (secondMin < firstMax)) || ((secondMin < firstMin) && (firstMin < secondMax));
  };

  // Writes one fitted quintuplet into the quintuplets SoA at quintupletIndex:
  // its two constituent triplet indices, five lower-module ids, fitted radii
  // and kinematics (several fields compressed to FPX via __F2H on store),
  // plus scoring/bookkeeping fields (continued past this chunk).
  ALPAKA_FN_ACC ALPAKA_FN_INLINE void addQuintupletToMemory(struct SDL::triplets& tripletsInGPU,
                                                            struct SDL::quintuplets& quintupletsInGPU,
                                                            unsigned int innerTripletIndex,
                                                            unsigned int outerTripletIndex,
                                                            uint16_t& lowerModule1,
                                                            uint16_t& lowerModule2,
                                                            uint16_t& lowerModule3,
                                                            uint16_t& lowerModule4,
                                                            uint16_t& lowerModule5,
                                                            float& innerRadius,
                                                            float& bridgeRadius,
                                                            float& outerRadius,
                                                            float& regressionG,
                                                            float& regressionF,
                                                            float& regressionRadius,
                                                            float& rzChiSquared,
                                                            float& rPhiChiSquared,
                                                            float& nonAnchorChiSquared,
                                                            float pt,
                                                            float eta,
                                                            float phi,
                                                            float scores,
                                                            uint8_t layer,
                                                            unsigned int quintupletIndex,
                                                            bool TightCutFlag) {
    quintupletsInGPU.tripletIndices[2 * quintupletIndex] = innerTripletIndex;
    quintupletsInGPU.tripletIndices[2 * quintupletIndex + 1] = outerTripletIndex;

    quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex] = lowerModule1;
    quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 1] = lowerModule2;
    quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 2] = lowerModule3;
    quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 3] = lowerModule4;
    quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex + 4] = lowerModule5;
    // Radii and kinematics are compressed to half precision (FPX) on store.
    quintupletsInGPU.innerRadius[quintupletIndex] = __F2H(innerRadius);
    quintupletsInGPU.outerRadius[quintupletIndex] = __F2H(outerRadius);
    quintupletsInGPU.pt[quintupletIndex] = __F2H(pt);
    quintupletsInGPU.eta[quintupletIndex] = __F2H(eta);
quintupletsInGPU.phi[quintupletIndex] = __F2H(phi); + quintupletsInGPU.score_rphisum[quintupletIndex] = __F2H(scores); + quintupletsInGPU.layer[quintupletIndex] = layer; + quintupletsInGPU.isDup[quintupletIndex] = 0; + quintupletsInGPU.TightCutFlag[quintupletIndex] = TightCutFlag; + quintupletsInGPU.regressionRadius[quintupletIndex] = regressionRadius; + quintupletsInGPU.regressionG[quintupletIndex] = regressionG; + quintupletsInGPU.regressionF[quintupletIndex] = regressionF; + quintupletsInGPU.logicalLayers[5 * quintupletIndex] = tripletsInGPU.logicalLayers[3 * innerTripletIndex]; + quintupletsInGPU.logicalLayers[5 * quintupletIndex + 1] = tripletsInGPU.logicalLayers[3 * innerTripletIndex + 1]; + quintupletsInGPU.logicalLayers[5 * quintupletIndex + 2] = tripletsInGPU.logicalLayers[3 * innerTripletIndex + 2]; + quintupletsInGPU.logicalLayers[5 * quintupletIndex + 3] = tripletsInGPU.logicalLayers[3 * outerTripletIndex + 1]; + quintupletsInGPU.logicalLayers[5 * quintupletIndex + 4] = tripletsInGPU.logicalLayers[3 * outerTripletIndex + 2]; + + quintupletsInGPU.hitIndices[10 * quintupletIndex] = tripletsInGPU.hitIndices[6 * innerTripletIndex]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 1] = tripletsInGPU.hitIndices[6 * innerTripletIndex + 1]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 2] = tripletsInGPU.hitIndices[6 * innerTripletIndex + 2]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 3] = tripletsInGPU.hitIndices[6 * innerTripletIndex + 3]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 4] = tripletsInGPU.hitIndices[6 * innerTripletIndex + 4]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 5] = tripletsInGPU.hitIndices[6 * innerTripletIndex + 5]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 6] = tripletsInGPU.hitIndices[6 * outerTripletIndex + 2]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 7] = tripletsInGPU.hitIndices[6 * outerTripletIndex + 3]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 8] = 
tripletsInGPU.hitIndices[6 * outerTripletIndex + 4]; + quintupletsInGPU.hitIndices[10 * quintupletIndex + 9] = tripletsInGPU.hitIndices[6 * outerTripletIndex + 5]; + quintupletsInGPU.bridgeRadius[quintupletIndex] = bridgeRadius; + quintupletsInGPU.rzChiSquared[quintupletIndex] = rzChiSquared; + quintupletsInGPU.chiSquared[quintupletIndex] = rPhiChiSquared; + quintupletsInGPU.nonAnchorChiSquared[quintupletIndex] = nonAnchorChiSquared; + }; + + //90% constraint + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passChiSquaredConstraint(struct SDL::modules& modulesInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + uint16_t& lowerModuleIndex4, + uint16_t& lowerModuleIndex5, + float& chiSquared) { + //following Philip's layer number prescription + const int layer1 = modulesInGPU.sdlLayers[lowerModuleIndex1]; + const int layer2 = modulesInGPU.sdlLayers[lowerModuleIndex2]; + const int layer3 = modulesInGPU.sdlLayers[lowerModuleIndex3]; + const int layer4 = modulesInGPU.sdlLayers[lowerModuleIndex4]; + const int layer5 = modulesInGPU.sdlLayers[lowerModuleIndex5]; + + if (layer1 == 7 and layer2 == 8 and layer3 == 9) { + if (layer4 == 10 and layer5 == 11) { + return chiSquared < 0.01788f; + } else if (layer4 == 10 and layer5 == 16) { + return chiSquared < 0.04725f; + } else if (layer4 == 15 and layer5 == 16) { + return chiSquared < 0.04725f; + } + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) { + if (layer4 == 9 and layer5 == 10) { + return chiSquared < 0.01788f; + } else if (layer4 == 9 and layer5 == 15) { + return chiSquared < 0.08234f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + if (layer4 == 8 and layer5 == 9) { + return chiSquared < 0.02360f; + } else if (layer4 == 8 and layer5 == 14) { + return chiSquared < 0.07167f; + } else if (layer4 == 13 and layer5 == 14) { + return chiSquared < 0.08234f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + if (layer4 == 7 and layer5 == 8) { + return 
chiSquared < 0.01026f; + } else if (layer4 == 7 and layer5 == 13) { + return chiSquared < 0.06238f; + } else if (layer4 == 12 and layer5 == 13) { + return chiSquared < 0.06238f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3 and layer4 == 4) { + if (layer5 == 5) { + return chiSquared < 0.04725f; + } else if (layer5 == 12) { + return chiSquared < 0.09461f; + } + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + if (layer4 == 9 and layer5 == 10) { + return chiSquared < 0.00512f; + } + if (layer4 == 9 and layer5 == 15) { + return chiSquared < 0.04112f; + } else if (layer4 == 14 and layer5 == 15) { + return chiSquared < 0.06238f; + } + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) { + if (layer4 == 8 and layer5 == 14) { + return chiSquared < 0.07167f; + } else if (layer4 == 13 and layer5 == 14) { + return chiSquared < 0.06238f; + } + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + if (layer4 == 5 and layer5 == 6) { + return chiSquared < 0.08234f; + } else if (layer4 == 5 and layer5 == 12) { + return chiSquared < 0.10870f; + } else if (layer4 == 12 and layer5 == 13) { + return chiSquared < 0.10870f; + } + } else if (layer1 == 3 and layer2 == 7 and layer3 == 8 and layer4 == 14 and layer5 == 15) { + return chiSquared < 0.09461f; + } else if (layer1 == 3 and layer2 == 4 and layer3 == 5 and layer4 == 12 and layer5 == 13) { + return chiSquared < 0.09461f; + } + + return true; + }; + + //bounds can be found at http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_RZFix/t5_rz_thresholds.txt + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passT5RZConstraint(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + unsigned int firstMDIndex, + unsigned int secondMDIndex, + unsigned int thirdMDIndex, + unsigned int fourthMDIndex, + unsigned int fifthMDIndex, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + uint16_t& lowerModuleIndex4, + uint16_t& 
lowerModuleIndex5, + float& rzChiSquared, + float inner_pt, + float innerRadius, + float g, + float f, + bool& TightCutFlag) { + //(g,f) is the center of the circle fitted by the innermost 3 points on x,y coordinates + const float& rt1 = mdsInGPU.anchorRt[firstMDIndex] / 100; //in the unit of m instead of cm + const float& rt2 = mdsInGPU.anchorRt[secondMDIndex] / 100; + const float& rt3 = mdsInGPU.anchorRt[thirdMDIndex] / 100; + const float& rt4 = mdsInGPU.anchorRt[fourthMDIndex] / 100; + const float& rt5 = mdsInGPU.anchorRt[fifthMDIndex] / 100; + + const float& z1 = mdsInGPU.anchorZ[firstMDIndex] / 100; + const float& z2 = mdsInGPU.anchorZ[secondMDIndex] / 100; + const float& z3 = mdsInGPU.anchorZ[thirdMDIndex] / 100; + const float& z4 = mdsInGPU.anchorZ[fourthMDIndex] / 100; + const float& z5 = mdsInGPU.anchorZ[fifthMDIndex] / 100; + + //following Philip's layer number prescription + const int layer1 = modulesInGPU.sdlLayers[lowerModuleIndex1]; + const int layer2 = modulesInGPU.sdlLayers[lowerModuleIndex2]; + const int layer3 = modulesInGPU.sdlLayers[lowerModuleIndex3]; + const int layer4 = modulesInGPU.sdlLayers[lowerModuleIndex4]; + const int layer5 = modulesInGPU.sdlLayers[lowerModuleIndex5]; + + //slope computed using the internal T3s + const int moduleType1 = modulesInGPU.moduleType[lowerModuleIndex1]; //0 is ps, 1 is 2s + const int moduleType2 = modulesInGPU.moduleType[lowerModuleIndex2]; + const int moduleType3 = modulesInGPU.moduleType[lowerModuleIndex3]; + const int moduleType4 = modulesInGPU.moduleType[lowerModuleIndex4]; + const int moduleType5 = modulesInGPU.moduleType[lowerModuleIndex5]; + + const float& x1 = mdsInGPU.anchorX[firstMDIndex] / 100; + const float& x2 = mdsInGPU.anchorX[secondMDIndex] / 100; + const float& x3 = mdsInGPU.anchorX[thirdMDIndex] / 100; + const float& x4 = mdsInGPU.anchorX[fourthMDIndex] / 100; + const float& y1 = mdsInGPU.anchorY[firstMDIndex] / 100; + const float& y2 = mdsInGPU.anchorY[secondMDIndex] / 100; + const float& 
y3 = mdsInGPU.anchorY[thirdMDIndex] / 100; + const float& y4 = mdsInGPU.anchorY[fourthMDIndex] / 100; + + float residual = 0; + float error = 0; + float x_center = g / 100, y_center = f / 100; + float x_init = mdsInGPU.anchorX[thirdMDIndex] / 100; + float y_init = mdsInGPU.anchorY[thirdMDIndex] / 100; + float z_init = mdsInGPU.anchorZ[thirdMDIndex] / 100; + float rt_init = mdsInGPU.anchorRt[thirdMDIndex] / 100; //use the second MD as initial point + + if (moduleType3 == 1) // 1: if MD3 is in 2s layer + { + x_init = mdsInGPU.anchorX[secondMDIndex] / 100; + y_init = mdsInGPU.anchorY[secondMDIndex] / 100; + z_init = mdsInGPU.anchorZ[secondMDIndex] / 100; + rt_init = mdsInGPU.anchorRt[secondMDIndex] / 100; + } + + // start from a circle of inner T3. + // to determine the charge + int charge = 0; + float slope3c = (y3 - y_center) / (x3 - x_center); + float slope1c = (y1 - y_center) / (x1 - x_center); + // these 4 "if"s basically separate the x-y plane into 4 quarters. It determines geometrically how a circle and line slope goes and their positions, and we can get the charges correspondingly. 
+ if ((y3 - y_center) > 0 && (y1 - y_center) > 0) { + if (slope1c > 0 && slope3c < 0) + charge = -1; // on x axis of a quarter, 3 hits go anti-clockwise + else if (slope1c < 0 && slope3c > 0) + charge = 1; // on x axis of a quarter, 3 hits go clockwise + else if (slope3c > slope1c) + charge = -1; + else if (slope3c < slope1c) + charge = 1; + } else if ((y3 - y_center) < 0 && (y1 - y_center) < 0) { + if (slope1c < 0 && slope3c > 0) + charge = 1; + else if (slope1c > 0 && slope3c < 0) + charge = -1; + else if (slope3c > slope1c) + charge = -1; + else if (slope3c < slope1c) + charge = 1; + } else if ((y3 - y_center) < 0 && (y1 - y_center) > 0) { + if ((x3 - x_center) > 0 && (x1 - x_center) > 0) + charge = 1; + else if ((x3 - x_center) < 0 && (x1 - x_center) < 0) + charge = -1; + } else if ((y3 - y_center) > 0 && (y1 - y_center) < 0) { + if ((x3 - x_center) > 0 && (x1 - x_center) > 0) + charge = -1; + else if ((x3 - x_center) < 0 && (x1 - x_center) < 0) + charge = 1; + } + + float pseudo_phi = alpaka::math::atan( + acc, (y_init - y_center) / (x_init - x_center)); //actually represent pi/2-phi, wrt helix axis z + float Pt = inner_pt, Px = Pt * alpaka::math::abs(acc, alpaka::math::sin(acc, pseudo_phi)), + Py = Pt * alpaka::math::abs(acc, cos(pseudo_phi)); + + // Above line only gives you the correct value of Px and Py, but signs of Px and Py calculated below. + // We look at if the circle is clockwise or anti-clock wise, to make it simpler, we separate the x-y plane into 4 quarters. 
+ if (x_init > x_center && y_init > y_center) //1st quad + { + if (charge == 1) + Py = -Py; + if (charge == -1) + Px = -Px; + } + if (x_init < x_center && y_init > y_center) //2nd quad + { + if (charge == -1) { + Px = -Px; + Py = -Py; + } + } + if (x_init < x_center && y_init < y_center) //3rd quad + { + if (charge == 1) + Px = -Px; + if (charge == -1) + Py = -Py; + } + if (x_init > x_center && y_init < y_center) //4th quad + { + if (charge == 1) { + Px = -Px; + Py = -Py; + } + } + + // But if the initial T5 curve goes across quarters(i.e. cross axis to separate the quarters), need special redeclaration of Px,Py signs on these to avoid errors + if (moduleType3 == 0) { // 0 is ps + if (x4 < x3 && x3 < x2) + Px = -alpaka::math::abs(acc, Px); + if (x4 > x3 && x3 > x2) + Px = alpaka::math::abs(acc, Px); + if (y4 < y3 && y3 < y2) + Py = -alpaka::math::abs(acc, Py); + if (y4 > y3 && y3 > y2) + Py = alpaka::math::abs(acc, Py); + } else if (moduleType3 == 1) // 1 is 2s + { + if (x3 < x2 && x2 < x1) + Px = -alpaka::math::abs(acc, Px); + if (x3 > x2 && x2 > x1) + Px = alpaka::math::abs(acc, Px); + if (y3 < y2 && y2 < y1) + Py = -alpaka::math::abs(acc, Py); + if (y3 > y2 && y2 > y1) + Py = alpaka::math::abs(acc, Py); + } + + //to get Pz, we use pt/pz=ds/dz, ds is the arclength between MD1 and MD3. 
+ float AO = alpaka::math::sqrt(acc, (x1 - x_center) * (x1 - x_center) + (y1 - y_center) * (y1 - y_center)); + float BO = + alpaka::math::sqrt(acc, (x_init - x_center) * (x_init - x_center) + (y_init - y_center) * (y_init - y_center)); + float AB = alpaka::math::sqrt(acc, (x1 - x_init) * (x1 - x_init) + (y1 - y_init) * (y1 - y_init)); + float dPhi = alpaka::math::acos(acc, (AO * AO + BO * BO - AB * AB) / (2 * AO * BO)); + float ds = innerRadius / 100 * dPhi; + + float Pz = (z_init - z1) / ds * Pt; + float p = alpaka::math::sqrt(acc, Px * Px + Py * Py + Pz * Pz); + + float Bz = SDL::magnetic_field; + float a = -0.299792 * Bz * charge; + + float zsi, rtsi; + int layeri, moduleTypei; + rzChiSquared = 0; + for (size_t i = 2; i < 6; i++) { + if (i == 2) { + zsi = z2; + rtsi = rt2; + layeri = layer2; + moduleTypei = moduleType2; + } else if (i == 3) { + zsi = z3; + rtsi = rt3; + layeri = layer3; + moduleTypei = moduleType3; + } else if (i == 4) { + zsi = z4; + rtsi = rt4; + layeri = layer4; + moduleTypei = moduleType4; + } else if (i == 5) { + zsi = z5; + rtsi = rt5; + layeri = layer5; + moduleTypei = moduleType5; + } + + if (moduleType3 == 0) { //0: ps + if (i == 3) + continue; + } else { + if (i == 2) + continue; + } + + // calculation is copied from PixelTriplet.cc SDL::computePT3RZChiSquared + float diffr = 0, diffz = 0; + + float rou = a / p; + // for endcap + float s = (zsi - z_init) * p / Pz; + float x = x_init + Px / a * alpaka::math::sin(acc, rou * s) - Py / a * (1 - alpaka::math::cos(acc, rou * s)); + float y = y_init + Py / a * alpaka::math::sin(acc, rou * s) + Px / a * (1 - alpaka::math::cos(acc, rou * s)); + diffr = (rtsi - alpaka::math::sqrt(acc, x * x + y * y)) * 100; + + // for barrel + if (layeri <= 6) { + float paraA = + rt_init * rt_init + 2 * (Px * Px + Py * Py) / (a * a) + 2 * (y_init * Px - x_init * Py) / a - rtsi * rtsi; + float paraB = 2 * (x_init * Px + y_init * Py) / a; + float paraC = 2 * (y_init * Px - x_init * Py) / a + 2 * (Px * Px + Py * 
Py) / (a * a); + float A = paraB * paraB + paraC * paraC; + float B = 2 * paraA * paraB; + float C = paraA * paraA - paraC * paraC; + float sol1 = (-B + alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A); + float sol2 = (-B - alpaka::math::sqrt(acc, B * B - 4 * A * C)) / (2 * A); + float solz1 = alpaka::math::asin(acc, sol1) / rou * Pz / p + z_init; + float solz2 = alpaka::math::asin(acc, sol2) / rou * Pz / p + z_init; + float diffz1 = (solz1 - zsi) * 100; + float diffz2 = (solz2 - zsi) * 100; + // Alpaka : Needs to be moved over + if (alpaka::math::isnan(acc, diffz1)) + diffz = diffz2; + else if (alpaka::math::isnan(acc, diffz2)) + diffz = diffz1; + else { + diffz = (alpaka::math::abs(acc, diffz1) < alpaka::math::abs(acc, diffz2)) ? diffz1 : diffz2; + } + } + residual = (layeri > 6) ? diffr : diffz; + + //PS Modules + if (moduleTypei == 0) { + error = 0.15f; + } else //2S modules + { + error = 5.0f; + } + + //check the tilted module, side: PosZ, NegZ, Center(for not tilted) + float drdz; + short side, subdets; + if (i == 2) { + drdz = alpaka::math::abs(acc, modulesInGPU.drdzs[lowerModuleIndex2]); + side = modulesInGPU.sides[lowerModuleIndex2]; + subdets = modulesInGPU.subdets[lowerModuleIndex2]; + } + if (i == 3) { + drdz = alpaka::math::abs(acc, modulesInGPU.drdzs[lowerModuleIndex3]); + side = modulesInGPU.sides[lowerModuleIndex3]; + subdets = modulesInGPU.subdets[lowerModuleIndex3]; + } + if (i == 2 || i == 3) { + residual = (layeri <= 6 && ((side == SDL::Center) or (drdz < 1))) ? diffz : diffr; + float projection_missing = 1; + if (drdz < 1) + projection_missing = ((subdets == SDL::Endcap) or (side == SDL::Center)) + ? 1.f + : 1 / alpaka::math::sqrt(acc, 1 + drdz * drdz); // cos(atan(drdz)), if dr/dz<1 + if (drdz > 1) + projection_missing = ((subdets == SDL::Endcap) or (side == SDL::Center)) + ? 
1.f + : drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz); //sin(atan(drdz)), if dr/dz>1 + error = error * projection_missing; + } + rzChiSquared += 12 * (residual * residual) / (error * error); + } + // for set rzchi2 cut + // if the 5 points are linear, helix calculation gives nan + // Alpaka : Needs to be moved over + if (inner_pt > 100 || alpaka::math::isnan(acc, rzChiSquared)) { + float slope; + if (moduleType1 == 0 and moduleType2 == 0 and moduleType3 == 1) //PSPS2S + { + slope = (z2 - z1) / (rt2 - rt1); + } else { + slope = (z3 - z1) / (rt3 - rt1); + } + float residual4_linear = (layer4 <= 6) ? ((z4 - z1) - slope * (rt4 - rt1)) : ((rt4 - rt1) - (z4 - z1) / slope); + float residual5_linear = (layer4 <= 6) ? ((z5 - z1) - slope * (rt5 - rt1)) : ((rt5 - rt1) - (z5 - z1) / slope); + + // creating a chi squared type quantity + // 0-> PS, 1->2S + residual4_linear = (moduleType4 == 0) ? residual4_linear / 0.15f : residual4_linear / 5.0f; + residual5_linear = (moduleType5 == 0) ? residual5_linear / 0.15f : residual5_linear / 5.0f; + residual4_linear = residual4_linear * 100; + residual5_linear = residual5_linear * 100; + + rzChiSquared = 12 * (residual4_linear * residual4_linear + residual5_linear * residual5_linear); + return rzChiSquared < 4.677f; + } + + // when building T5, apply 99% chi2 cuts as default, and add to pT5 collection. But when adding T5 to TC collections, apply 95% cut to reduce the fake rate + TightCutFlag = false; + // The category numbers are related to module regions and layers, decoding of the region numbers can be found here in slide 2 table. https://github.com/SegmentLinking/TrackLooper/files/11420927/part.2.pdf + // The commented numbers after each case is the region code, and can look it up from the table to see which category it belongs to. 
For example, //0 means T5 built with Endcap 1,2,3,4,5 ps modules + if (layer1 == 7 and layer2 == 8 and layer3 == 9 and layer4 == 10 and layer5 == 11) //0 + { + if (rzChiSquared < 94.470f) + TightCutFlag = true; + return true; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9 and layer4 == 10 and layer5 == 16) //1 + { + if (rzChiSquared < 22.099f) + TightCutFlag = true; + return rzChiSquared < 37.956f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9 and layer4 == 15 and layer5 == 16) //2 + { + if (rzChiSquared < 7.992f) + TightCutFlag = true; + return rzChiSquared < 11.622f; + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8 and layer4 == 9) { + if (layer5 == 10) //3 + { + if (rzChiSquared < 111.390f) + TightCutFlag = true; + return true; + } + if (layer5 == 15) //4 + { + if (rzChiSquared < 18.351f) + TightCutFlag = true; + return rzChiSquared < 37.941f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + if (layer4 == 8 and layer5 == 9) //5 + { + if (rzChiSquared < 116.148f) + TightCutFlag = true; + return true; + } + if (layer4 == 8 and layer5 == 14) //6 + { + if (rzChiSquared < 19.352f) + TightCutFlag = true; + return rzChiSquared < 52.561f; + } else if (layer4 == 13 and layer5 == 14) //7 + { + if (rzChiSquared < 10.392f) + TightCutFlag = true; + return rzChiSquared < 13.76f; + } + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + if (layer4 == 7 and layer5 == 8) //8 + { + if (rzChiSquared < 27.824f) + TightCutFlag = true; + return rzChiSquared < 44.247f; + } else if (layer4 == 7 and layer5 == 13) //9 + { + if (rzChiSquared < 18.145f) + TightCutFlag = true; + return rzChiSquared < 33.752f; + } else if (layer4 == 12 and layer5 == 13) //10 + { + if (rzChiSquared < 13.308f) + TightCutFlag = true; + return rzChiSquared < 21.213f; + } else if (layer4 == 4 and layer5 == 5) //11 + { + if (rzChiSquared < 15.627f) + TightCutFlag = true; + return rzChiSquared < 29.035f; + } else if (layer4 == 4 and layer5 == 12) //12 + { + if 
(rzChiSquared < 14.64f) + TightCutFlag = true; + return rzChiSquared < 23.037f; + } + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + if (layer4 == 9 and layer5 == 15) //14 + { + if (rzChiSquared < 24.662f) + TightCutFlag = true; + return rzChiSquared < 41.036f; + } else if (layer4 == 14 and layer5 == 15) //15 + { + if (rzChiSquared < 8.866f) + TightCutFlag = true; + return rzChiSquared < 14.092f; + } + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) { + if (layer4 == 8 and layer5 == 14) //16 + { + if (rzChiSquared < 23.730f) + TightCutFlag = true; + return rzChiSquared < 23.748f; + } + if (layer4 == 13 and layer5 == 14) //17 + { + if (rzChiSquared < 10.772f) + TightCutFlag = true; + return rzChiSquared < 17.945f; + } + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + if (layer4 == 5 and layer5 == 6) //18 + { + if (rzChiSquared < 6.065f) + TightCutFlag = true; + return rzChiSquared < 8.803f; + } else if (layer4 == 5 and layer5 == 12) //19 + { + if (rzChiSquared < 5.693f) + TightCutFlag = true; + return rzChiSquared < 7.930f; + } + + else if (layer4 == 12 and layer5 == 13) //20 + { + if (rzChiSquared < 5.473f) + TightCutFlag = true; + return rzChiSquared < 7.626f; + } + } + return true; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(struct SDL::triplets& tripletsInGPU, + struct SDL::segments& segmentsInGPU, + unsigned int innerTripletIndex, + unsigned int outerTripletIndex) { + unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; + unsigned int outerInnerSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex]; + unsigned int innerOuterOuterMiniDoubletIndex = + segmentsInGPU.mdIndices[2 * innerOuterSegmentIndex + 1]; //inner triplet outer segment outer MD index + unsigned int outerInnerInnerMiniDoubletIndex = + segmentsInGPU.mdIndices[2 * outerInnerSegmentIndex]; //outer triplet inner segment inner MD index + + return (innerOuterOuterMiniDoubletIndex == 
outerInnerInnerMiniDoubletIndex); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeErrorInRadius(TAcc const& acc, + float* x1Vec, + float* y1Vec, + float* x2Vec, + float* y2Vec, + float* x3Vec, + float* y3Vec, + float& minimumRadius, + float& maximumRadius) { + //brute force + float candidateRadius; + float g, f; + minimumRadius = SDL::SDL_INF; + maximumRadius = 0.f; + for (size_t i = 0; i < 3; i++) { + float x1 = x1Vec[i]; + float y1 = y1Vec[i]; + for (size_t j = 0; j < 3; j++) { + float x2 = x2Vec[j]; + float y2 = y2Vec[j]; + for (size_t k = 0; k < 3; k++) { + float x3 = x3Vec[k]; + float y3 = y3Vec[k]; + candidateRadius = computeRadiusFromThreeAnchorHits(acc, x1, y1, x2, y2, x3, y3, g, f); + maximumRadius = alpaka::math::max(acc, candidateRadius, maximumRadius); + minimumRadius = alpaka::math::min(acc, candidateRadius, minimumRadius); + } + } + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE12378(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.178f; + float bridgeInvRadiusErrorBound = 0.507f; + float outerInvRadiusErrorBound = 7.655f; + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = 
alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, + innerInvRadiusMax, + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); + }; + + /*bounds for high Pt taken from : http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_efficiency/efficiencies/new_efficiencies/efficiencies_20210513_T5_recovering_high_Pt_efficiencies/highE_radius_matching/highE_bounds.txt */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBBB(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.1512f; + float bridgeInvRadiusErrorBound = 0.1781f; + float outerInvRadiusErrorBound = 0.1840f; + + if (innerRadius > 2.0f / (2.f * k2Rinv1GeVf)) { + innerInvRadiusErrorBound = 0.4449f; + bridgeInvRadiusErrorBound = 0.4033f; + outerInvRadiusErrorBound = 0.8016f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, innerInvRadiusMax, bridgeInvRadiusMin, bridgeInvRadiusMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBBE(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, 
+ const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.1781f; + float bridgeInvRadiusErrorBound = 0.2167f; + float outerInvRadiusErrorBound = 1.1116f; + + if (innerRadius > 2.0f / (2.f * k2Rinv1GeVf)) { + innerInvRadiusErrorBound = 0.4750f; + bridgeInvRadiusErrorBound = 0.3903f; + outerInvRadiusErrorBound = 15.2120f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, innerInvRadiusMax, bridgeInvRadiusMin, bridgeInvRadiusMax); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.1840f; + float bridgeInvRadiusErrorBound = 0.5971f; + float outerInvRadiusErrorBound = 11.7102f; + + if (innerRadius > 2.0f / (2.f * 
k2Rinv1GeVf)) //as good as no selections + { + innerInvRadiusErrorBound = 1.0412f; + outerInvRadiusErrorBound = 32.2737f; + bridgeInvRadiusErrorBound = 10.9688f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, + innerInvRadiusMax, + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE23478(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.2097f; + float bridgeInvRadiusErrorBound = 0.8557f; + float outerInvRadiusErrorBound = 24.0450f; + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / 
outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, + innerInvRadiusMax, + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE34578(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.066f; + float bridgeInvRadiusErrorBound = 0.617f; + float outerInvRadiusErrorBound = 2.688f; + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, + innerInvRadiusMax, + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBEEE(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const 
float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 0.6376f; + float bridgeInvRadiusErrorBound = 2.1381f; + float outerInvRadiusErrorBound = 20.4179f; + + if (innerRadius > 2.0f / (2.f * k2Rinv1GeVf)) //as good as no selections! + { + innerInvRadiusErrorBound = 12.9173f; + outerInvRadiusErrorBound = 25.6702f; + bridgeInvRadiusErrorBound = 5.1700f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(innerInvRadiusMin, + innerInvRadiusMax, + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBEEEE(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 1.9382f; + float bridgeInvRadiusErrorBound = 
3.7280f; + float outerInvRadiusErrorBound = 5.7030f; + + if (innerRadius > 2.0f / (2.f * k2Rinv1GeVf)) { + innerInvRadiusErrorBound = 23.2713f; + outerInvRadiusErrorBound = 24.0450f; + bridgeInvRadiusErrorBound = 21.7980f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(alpaka::math::min(acc, innerInvRadiusMin, 1.0 / innerRadiusMax2S), + alpaka::math::max(acc, innerInvRadiusMax, 1.0 / innerRadiusMin2S), + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0 / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0 / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiEEEEE(TAcc const& acc, + const float& innerRadius, + const float& bridgeRadius, + const float& outerRadius, + const float& innerRadiusMin2S, + const float& innerRadiusMax2S, + const float& bridgeRadiusMin2S, + const float& bridgeRadiusMax2S, + const float& outerRadiusMin2S, + const float& outerRadiusMax2S, + float& innerInvRadiusMin, + float& innerInvRadiusMax, + float& bridgeInvRadiusMin, + float& bridgeInvRadiusMax, + float& outerInvRadiusMin, + float& outerInvRadiusMax) { + float innerInvRadiusErrorBound = 1.9382f; + float bridgeInvRadiusErrorBound = 2.2091f; + float outerInvRadiusErrorBound = 7.4084f; + + if (innerRadius > 2.0f / (2.f * k2Rinv1GeVf)) { + innerInvRadiusErrorBound = 22.5226f; + bridgeInvRadiusErrorBound = 21.0966f; + outerInvRadiusErrorBound = 19.1252f; + } + + innerInvRadiusMax = (1.f + innerInvRadiusErrorBound) / innerRadius; + 
innerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - innerInvRadiusErrorBound) / innerRadius); + + bridgeInvRadiusMax = (1.f + bridgeInvRadiusErrorBound) / bridgeRadius; + bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); + + outerInvRadiusMax = (1.f + outerInvRadiusErrorBound) / outerRadius; + outerInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - outerInvRadiusErrorBound) / outerRadius); + + return checkIntervalOverlap(alpaka::math::min(acc, innerInvRadiusMin, 1.0 / innerRadiusMax2S), + alpaka::math::max(acc, innerInvRadiusMax, 1.0 / innerRadiusMin2S), + alpaka::math::min(acc, bridgeInvRadiusMin, 1.0 / bridgeRadiusMax2S), + alpaka::math::max(acc, bridgeInvRadiusMax, 1.0 / bridgeRadiusMin2S)); + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeSigmasForRegression(TAcc const& acc, + SDL::modules& modulesInGPU, + const uint16_t* lowerModuleIndices, + float* delta1, + float* delta2, + float* slopes, + bool* isFlat, + unsigned int nPoints = 5, + bool anchorHits = true) { + /* + Bool anchorHits required to deal with a weird edge case wherein + the hits ultimately used in the regression are anchor hits, but the + lower modules need not all be Pixel Modules (in case of PS). Similarly, + when we compute the chi squared for the non-anchor hits, the "partner module" + need not always be a PS strip module, but all non-anchor hits sit on strip + modules. 
+ */ + + ModuleType moduleType; + short moduleSubdet, moduleSide; + float inv1 = 0.01f / 0.009f; + float inv2 = 0.15f / 0.009f; + float inv3 = 2.4f / 0.009f; + for (size_t i = 0; i < nPoints; i++) { + moduleType = modulesInGPU.moduleType[lowerModuleIndices[i]]; + moduleSubdet = modulesInGPU.subdets[lowerModuleIndices[i]]; + moduleSide = modulesInGPU.sides[lowerModuleIndices[i]]; + const float& drdz = modulesInGPU.drdzs[lowerModuleIndices[i]]; + slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]]; + //category 1 - barrel PS flat + if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) { + delta1[i] = inv1; + delta2[i] = inv1; + slopes[i] = -999.f; + isFlat[i] = true; + } + //category 2 - barrel 2S + else if (moduleSubdet == Barrel and moduleType == TwoS) { + delta1[i] = 1.f; + delta2[i] = 1.f; + slopes[i] = -999.f; + isFlat[i] = true; + } + //category 3 - barrel PS tilted + else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) { + delta1[i] = inv1; + isFlat[i] = false; + + if (anchorHits) { + delta2[i] = (inv2 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz)); + } else { + delta2[i] = (inv3 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz)); + } + } + //category 4 - endcap PS + else if (moduleSubdet == Endcap and moduleType == PS) { + delta1[i] = inv1; + isFlat[i] = false; + + /* + despite the type of the module layer of the lower module index, + all anchor hits are on the pixel side and all non-anchor hits are + on the strip side! + */ + if (anchorHits) { + delta2[i] = inv2; + } else { + delta2[i] = inv3; + } + } + //category 5 - endcap 2S + else if (moduleSubdet == Endcap and moduleType == TwoS) { + delta1[i] = 1.f; + delta2[i] = 500.f * inv1; + isFlat[i] = false; + } else { +#ifdef Warnings + printf("ERROR!!!!! I SHOULDN'T BE HERE!!!! 
subdet = %d, type = %d, side = %d\n", + moduleSubdet, + moduleType, + moduleSide); +#endif + } + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeRadiusUsingRegression(TAcc const& acc, + unsigned int nPoints, + float* xs, + float* ys, + float* delta1, + float* delta2, + float* slopes, + bool* isFlat, + float& g, + float& f, + float* sigmas, + float& chiSquared) { + float radius = 0.f; + + // Some extra variables + // the two variables will be called x1 and x2, and y (which is x^2 + y^2) + + float sigmaX1Squared = 0.f; + float sigmaX2Squared = 0.f; + float sigmaX1X2 = 0.f; + float sigmaX1y = 0.f; + float sigmaX2y = 0.f; + float sigmaY = 0.f; + float sigmaX1 = 0.f; + float sigmaX2 = 0.f; + float sigmaOne = 0.f; + + float xPrime, yPrime, absArctanSlope, angleM; + for (size_t i = 0; i < nPoints; i++) { + // Computing sigmas is a very tricky affair + // if the module is tilted or endcap, we need to use the slopes properly! + + absArctanSlope = ((slopes[i] != SDL::SDL_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) + : 0.5f * float(M_PI)); + + if (xs[i] > 0 and ys[i] > 0) { + angleM = 0.5f * float(M_PI) - absArctanSlope; + } else if (xs[i] < 0 and ys[i] > 0) { + angleM = absArctanSlope + 0.5f * float(M_PI); + } else if (xs[i] < 0 and ys[i] < 0) { + angleM = -(absArctanSlope + 0.5f * float(M_PI)); + } else if (xs[i] > 0 and ys[i] < 0) { + angleM = -(0.5f * float(M_PI) - absArctanSlope); + } else { + angleM = 0; + } + + if (not isFlat[i]) { + xPrime = xs[i] * alpaka::math::cos(acc, angleM) + ys[i] * alpaka::math::sin(acc, angleM); + yPrime = ys[i] * alpaka::math::cos(acc, angleM) - xs[i] * alpaka::math::sin(acc, angleM); + } else { + xPrime = xs[i]; + yPrime = ys[i]; + } + sigmas[i] = + 2 * alpaka::math::sqrt( + acc, (xPrime * delta1[i]) * (xPrime * delta1[i]) + (yPrime * delta2[i]) * (yPrime * delta2[i])); + + sigmaX1Squared += (xs[i] * xs[i]) / (sigmas[i] * sigmas[i]); + sigmaX2Squared += (ys[i] * ys[i]) / (sigmas[i] * sigmas[i]); + 
sigmaX1X2 += (xs[i] * ys[i]) / (sigmas[i] * sigmas[i]); + sigmaX1y += (xs[i] * (xs[i] * xs[i] + ys[i] * ys[i])) / (sigmas[i] * sigmas[i]); + sigmaX2y += (ys[i] * (xs[i] * xs[i] + ys[i] * ys[i])) / (sigmas[i] * sigmas[i]); + sigmaY += (xs[i] * xs[i] + ys[i] * ys[i]) / (sigmas[i] * sigmas[i]); + sigmaX1 += xs[i] / (sigmas[i] * sigmas[i]); + sigmaX2 += ys[i] / (sigmas[i] * sigmas[i]); + sigmaOne += 1.0f / (sigmas[i] * sigmas[i]); + } + float denominator = (sigmaX1X2 - sigmaX1 * sigmaX2) * (sigmaX1X2 - sigmaX1 * sigmaX2) - + (sigmaX1Squared - sigmaX1 * sigmaX1) * (sigmaX2Squared - sigmaX2 * sigmaX2); + + float twoG = ((sigmaX2y - sigmaX2 * sigmaY) * (sigmaX1X2 - sigmaX1 * sigmaX2) - + (sigmaX1y - sigmaX1 * sigmaY) * (sigmaX2Squared - sigmaX2 * sigmaX2)) / + denominator; + float twoF = ((sigmaX1y - sigmaX1 * sigmaY) * (sigmaX1X2 - sigmaX1 * sigmaX2) - + (sigmaX2y - sigmaX2 * sigmaY) * (sigmaX1Squared - sigmaX1 * sigmaX1)) / + denominator; + + float c = -(sigmaY - twoG * sigmaX1 - twoF * sigmaX2) / sigmaOne; + g = 0.5f * twoG; + f = 0.5f * twoF; + if (g * g + f * f - c < 0) { +#ifdef Warnings + printf("FATAL! 
r^2 < 0!\n"); +#endif + chiSquared = -1; + return -1; + } + + radius = alpaka::math::sqrt(acc, g * g + f * f - c); + // compute chi squared + chiSquared = 0.f; + for (size_t i = 0; i < nPoints; i++) { + chiSquared += (xs[i] * xs[i] + ys[i] * ys[i] - twoG * xs[i] - twoF * ys[i] + c) * + (xs[i] * xs[i] + ys[i] * ys[i] - twoG * xs[i] - twoF * ys[i] + c) / (sigmas[i] * sigmas[i]); + } + return radius; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeChiSquared(TAcc const& acc, + unsigned int nPoints, + float* xs, + float* ys, + float* delta1, + float* delta2, + float* slopes, + bool* isFlat, + float g, + float f, + float radius) { + // given values of (g, f, radius) and a set of points (and its uncertainties) + // compute chi squared + float c = g * g + f * f - radius * radius; + float chiSquared = 0.f; + float absArctanSlope, angleM, xPrime, yPrime, sigma; + for (size_t i = 0; i < nPoints; i++) { + absArctanSlope = ((slopes[i] != SDL::SDL_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) + : 0.5f * float(M_PI)); + if (xs[i] > 0 and ys[i] > 0) { + angleM = 0.5f * float(M_PI) - absArctanSlope; + } else if (xs[i] < 0 and ys[i] > 0) { + angleM = absArctanSlope + 0.5f * float(M_PI); + } else if (xs[i] < 0 and ys[i] < 0) { + angleM = -(absArctanSlope + 0.5f * float(M_PI)); + } else if (xs[i] > 0 and ys[i] < 0) { + angleM = -(0.5f * float(M_PI) - absArctanSlope); + } else { + angleM = 0; + } + + if (not isFlat[i]) { + xPrime = xs[i] * alpaka::math::cos(acc, angleM) + ys[i] * alpaka::math::sin(acc, angleM); + yPrime = ys[i] * alpaka::math::cos(acc, angleM) - xs[i] * alpaka::math::sin(acc, angleM); + } else { + xPrime = xs[i]; + yPrime = ys[i]; + } + sigma = 2 * alpaka::math::sqrt( + acc, (xPrime * delta1[i]) * (xPrime * delta1[i]) + (yPrime * delta2[i]) * (yPrime * delta2[i])); + chiSquared += (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) * + (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) / (sigma * 
sigma); + } + return chiSquared; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT5(TAcc const& acc, + float& betaIn, + float& betaOut, + float& betaAv, + float& pt_beta, + float sdIn_dr, + float sdOut_dr, + float dr, + float lIn) { + if (lIn == 0) { + betaOut += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaOut); + return; + } + + if (betaIn * betaOut > 0.f and + (alpaka::math::abs(acc, pt_beta) < 4.f * SDL::pt_betaMax or + (lIn >= 11 and alpaka::math::abs(acc, pt_beta) < + 8.f * SDL::pt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap + { + const float betaInUpd = + betaIn + + SDL::copysignf(alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + const float betaOutUpd = + betaOut + + SDL::copysignf(alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaOut); //FIXME: need a faster version + betaAv = 0.5f * (betaInUpd + betaOutUpd); + + //1st update + const float pt_beta_inv = + 1.f / alpaka::math::abs(acc, dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv)); //get a better pt estimate + + betaIn += SDL::copysignf( + alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * SDL::k2Rinv1GeVf * pt_beta_inv, SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaOut += SDL::copysignf( + alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf * pt_beta_inv, SDL::sinAlphaMax)), + betaOut); //FIXME: need a faster version + //update the av and pt + betaAv = 0.5f * (betaIn + betaOut); + //2nd update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + } else if (lIn < 11 && alpaka::math::abs(acc, betaOut) < 0.2f * 
alpaka::math::abs(acc, betaIn) && + alpaka::math::abs(acc, pt_beta) < 12.f * SDL::pt_betaMax) //use betaIn sign as ref + { + const float pt_betaIn = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaIn); + + const float betaInUpd = + betaIn + SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + const float betaOutUpd = + betaOut + + SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min( + acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaAv = (alpaka::math::abs(acc, betaOut) > 0.2f * alpaka::math::abs(acc, betaIn)) + ? (0.5f * (betaInUpd + betaOutUpd)) + : betaInUpd; + + //1st update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + betaIn += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdIn_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + betaOut += SDL::copysignf( + alpaka::math::asin( + acc, + alpaka::math::min(acc, sdOut_dr * SDL::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), SDL::sinAlphaMax)), + betaIn); //FIXME: need a faster version + //update the av and pt + betaAv = 0.5f * (betaIn + betaOut); + //2nd update + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& innerOuterLowerModuleIndex, + uint16_t& outerInnerLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + unsigned int& firstMDIndex, + 
                                                                   unsigned int& secondMDIndex,
                                                                   unsigned int& thirdMDIndex,
                                                                   unsigned int& fourthMDIndex,
                                                                   float& zOut,
                                                                   float& rtOut,
                                                                   float& deltaPhiPos,
                                                                   float& dPhi,
                                                                   float& betaIn,
                                                                   float& betaOut,
                                                                   float& pt_beta,
                                                                   float& zLo,
                                                                   float& zHi,
                                                                   float& zLoPointed,
                                                                   float& zHiPointed,
                                                                   float& sdlCut,
                                                                   float& betaInCut,
                                                                   float& betaOutCut,
                                                                   float& deltaBetaCut) {
    bool pass = true;

    bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
    bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);

    // Anchor-hit transverse radii and z positions of the first three MDs.
    float rt_InLo = mdsInGPU.anchorRt[firstMDIndex];
    float rt_InOut = mdsInGPU.anchorRt[secondMDIndex];
    float rt_OutLo = mdsInGPU.anchorRt[thirdMDIndex];

    float z_InLo = mdsInGPU.anchorZ[firstMDIndex];
    float z_InOut = mdsInGPU.anchorZ[secondMDIndex];
    float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex];

    // Maximum bending angle a track at the pt threshold can have at rt_OutLo.
    float alpha1GeV_OutLo =
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax));

    float rtRatio_OutLoInLo = rt_OutLo / rt_InLo;  // Outer segment beginning rt divided by inner segment beginning rt;
    float dzDrtScale =
        alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo;  // The track can bend in r-z plane slightly
    float zpitch_InLo = (isPS_InLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch);
    float zpitch_OutLo = (isPS_OutLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch);

    // z window for the outer segment's inner MD, extrapolated from the inner MD
    // allowing for the luminous-region spread (deltaZLum) and module pitch.
    zHi = z_InLo + (z_InLo + SDL::deltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo < 0.f ? 1.f : dzDrtScale) +
          (zpitch_InLo + zpitch_OutLo);
    zLo = z_InLo + (z_InLo - SDL::deltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo > 0.f ? 1.f : dzDrtScale) -
          (zpitch_InLo + zpitch_OutLo);

    //Cut 1 - z compatibility
    zOut = z_OutLo;
    rtOut = rt_OutLo;
    pass = pass and ((z_OutLo >= zLo) && (z_OutLo <= zHi));
    if (not pass)
      return pass;

    float drt_OutLo_InLo = (rt_OutLo - rt_InLo);
    float r3_InLo = alpaka::math::sqrt(acc, z_InLo * z_InLo + rt_InLo * rt_InLo);
    float drt_InSeg = rt_InOut - rt_InLo;
    float dz_InSeg = z_InOut - z_InLo;
    float dr3_InSeg = alpaka::math::sqrt(acc, rt_InOut * rt_InOut + z_InOut * z_InOut) -
                      alpaka::math::sqrt(acc, rt_InLo * rt_InLo + z_InLo * z_InLo);

    float coshEta = dr3_InSeg / drt_InSeg;
    // Base z uncertainty from the module pitches (squared, summed in quadrature).
    float dzErr = (zpitch_InLo + zpitch_OutLo) * (zpitch_InLo + zpitch_OutLo) * 2.f;

    // Multiple-scattering contribution to the angular uncertainty.
    float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f) *
                          alpaka::math::sqrt(acc, r3_InLo / rt_InLo);
    float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f;  // will need a better guess than x4?
    dzErr += sdlMuls * sdlMuls * drt_OutLo_InLo * drt_OutLo_InLo / 3.f * coshEta * coshEta;  //sloppy
    dzErr = alpaka::math::sqrt(acc, dzErr);

    // Constructing upper and lower bound
    const float dzMean = dz_InSeg / drt_InSeg * drt_OutLo_InLo;
    const float zWindow =
        dzErr / drt_InSeg * drt_OutLo_InLo +
        (zpitch_InLo + zpitch_OutLo);  //FIXME for SDL::ptCut lower than ~0.8 need to add curv path correction
    zLoPointed = z_InLo + dzMean * (z_InLo > 0.f ? 1.f : dzDrtScale) - zWindow;
    zHiPointed = z_InLo + dzMean * (z_InLo < 0.f ? 1.f : dzDrtScale) + zWindow;

    // Cut #2: Pointed Z (Inner segment two MD points to outer segment inner MD)
    pass = pass and ((z_OutLo >= zLoPointed) && (z_OutLo <= zHiPointed));
    if (not pass)
      return pass;

    float sdlPVoff = 0.1f / rt_OutLo;  // primary-vertex offset contribution
    sdlCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, sdlMuls * sdlMuls + sdlPVoff * sdlPVoff);

    deltaPhiPos = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]);
    // Cut #3: FIXME:deltaPhiPos can be tighter
    pass = pass and (alpaka::math::abs(acc, deltaPhiPos) <= sdlCut);
    if (not pass)
      return pass;

    // dPhi of the first->third MD chord measured at its midpoint.
    float midPointX = 0.5f * (mdsInGPU.anchorX[firstMDIndex] + mdsInGPU.anchorX[thirdMDIndex]);
    float midPointY = 0.5f * (mdsInGPU.anchorY[firstMDIndex] + mdsInGPU.anchorY[thirdMDIndex]);
    float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex];
    float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex];

    dPhi = SDL::deltaPhi(acc, midPointX, midPointY, diffX, diffY);

    // Cut #4: deltaPhiChange
    pass = pass and (alpaka::math::abs(acc, dPhi) <= sdlCut);
    //lots of array accesses below. Cut here!
    if (not pass)
      return pass;

    // First obtaining the raw betaIn and betaOut values without any correction and just purely based on the mini-doublet hit positions

    float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]);
    float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]);

    // Endcap 2S module as the last layer needs the high/low strip-edge variants.
    bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == SDL::Endcap and
                          modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::TwoS;

    float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge;

    alpha_OutUp = SDL::phi_mpi_pi(acc,
                                  SDL::phi(acc,
                                           mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex],
                                           mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) -
                                      mdsInGPU.anchorPhi[fourthMDIndex]);

    alpha_OutUp_highEdge = alpha_OutUp;
    alpha_OutUp_lowEdge = alpha_OutUp;

    // Axis from the first to the fourth mini-doublet (the "tracklet" axis).
    float tl_axis_x = mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex];
    float tl_axis_y = mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex];
    float tl_axis_highEdge_x = tl_axis_x;
    float tl_axis_highEdge_y = tl_axis_y;
    float tl_axis_lowEdge_x = tl_axis_x;
    float tl_axis_lowEdge_y = tl_axis_y;

    betaIn = alpha_InLo - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]);

    float betaInRHmin = betaIn;
    float betaInRHmax = betaIn;
    betaOut =
        -alpha_OutUp + SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]);

    float betaOutRHmin = betaOut;
    float betaOutRHmax = betaOut;

    if (isEC_lastLayer) {
      // Recompute the outer angles using the 2S strip edges to bracket betaOut.
      alpha_OutUp_highEdge =
          SDL::phi_mpi_pi(acc,
                          SDL::phi(acc,
                                   mdsInGPU.anchorHighEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex],
                                   mdsInGPU.anchorHighEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) -
                              mdsInGPU.anchorHighEdgePhi[fourthMDIndex]);
      alpha_OutUp_lowEdge =
          SDL::phi_mpi_pi(acc,
                          SDL::phi(acc,
                                   mdsInGPU.anchorLowEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex],
                                   mdsInGPU.anchorLowEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) -
                              mdsInGPU.anchorLowEdgePhi[fourthMDIndex]);

      tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex];
      tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex];
      tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex];
      tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex];

      betaOutRHmin = -alpha_OutUp_highEdge + SDL::phi_mpi_pi(acc,
                                                             SDL::phi(acc, tl_axis_highEdge_x, tl_axis_highEdge_y) -
                                                                 mdsInGPU.anchorHighEdgePhi[fourthMDIndex]);
      betaOutRHmax = -alpha_OutUp_lowEdge + SDL::phi_mpi_pi(acc,
                                                            SDL::phi(acc, tl_axis_lowEdge_x, tl_axis_lowEdge_y) -
                                                                mdsInGPU.anchorLowEdgePhi[fourthMDIndex]);
    }

    //beta computation
    float drt_tl_axis = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y);

    float corrF = 1.f;
    //innerOuterAnchor - innerInnerAnchor
    const float rt_InSeg =
        alpaka::math::sqrt(acc,
                           (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) *
                                   (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) +
                               (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) *
                                   (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]));
    betaInCut = alpaka::math::asin(
                    acc,
                    alpaka::math::min(
                        acc, (-rt_InSeg * corrF + drt_tl_axis) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) +
                (0.02f / drt_InSeg);

    //Cut #5: first beta cut
    pass = pass and (alpaka::math::abs(acc, betaInRHmin) < betaInCut);
    if (not pass)
      return pass;

    float betaAv = 0.5f * (betaIn + betaOut);
    pt_beta = drt_tl_axis * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv);
    // Layer codes fed to the beta iteration: 5 = barrel, 11 = endcap.
    int lIn = 5;
    int lOut = isEC_lastLayer ? 11 : 5;
    float sdOut_dr = alpaka::math::sqrt(acc,
                                        (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) *
                                                (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) +
                                            (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) *
                                                (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]));
    float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex];

    // Refine betaIn/betaOut/betaAv/pt_beta in place.
    SDL::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, rt_InSeg, sdOut_dr, drt_tl_axis, lIn);

    // Rescale the min/max brackets so their mean matches the refined beta.
    const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0)
                                 ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax))
                                 : 0.f;  //mean value of min,max is the old betaIn
    const float betaOutMMSF = (alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax) > 0)
                                  ? (2.f * betaOut / alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax))
                                  : 0.f;
    betaInRHmin *= betaInMMSF;
    betaInRHmax *= betaInMMSF;
    betaOutRHmin *= betaOutMMSF;
    betaOutRHmax *= betaOutMMSF;

    const float dBetaMuls =
        sdlThetaMulsF * 4.f /
        alpaka::math::min(
            acc, alpaka::math::abs(acc, pt_beta), SDL::pt_betaMax);  //need to confirm the range-out value of 7 GeV

    const float alphaInAbsReg = alpaka::math::max(
        acc,
        alpaka::math::abs(acc, alpha_InLo),
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax)));
    const float alphaOutAbsReg = alpaka::math::max(
        acc,
        alpaka::math::abs(acc, alpha_OutLo),
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax)));
    // Luminous-region contributions only apply for endcap layers (code >= 11).
    const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * SDL::deltaZLum / z_InLo);
    const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * SDL::deltaZLum / z_OutLo);
    const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
    const float sinDPhi = alpaka::math::sin(acc, dPhi);

    const float dBetaRIn2 = 0;  // TODO-RH
    float dBetaROut = 0;
    if (isEC_lastLayer) {
      // Radial uncertainty of a 2S endcap hit from its strip-edge positions.
      dBetaROut =
          (alpaka::math::sqrt(acc,
                              mdsInGPU.anchorHighEdgeX[fourthMDIndex] * mdsInGPU.anchorHighEdgeX[fourthMDIndex] +
                                  mdsInGPU.anchorHighEdgeY[fourthMDIndex] * mdsInGPU.anchorHighEdgeY[fourthMDIndex]) -
           alpaka::math::sqrt(acc,
                              mdsInGPU.anchorLowEdgeX[fourthMDIndex] * mdsInGPU.anchorLowEdgeX[fourthMDIndex] +
                                  mdsInGPU.anchorLowEdgeY[fourthMDIndex] * mdsInGPU.anchorLowEdgeY[fourthMDIndex])) *
          sinDPhi / drt_tl_axis;
    }

    const float dBetaROut2 = dBetaROut * dBetaROut;

    //FIXME: need faster version
    betaOutCut =
        alpaka::math::asin(acc, alpaka::math::min(acc, drt_tl_axis * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) +
        (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls * dBetaMuls);

    //Cut #6: The real beta cut
    pass = pass and ((alpaka::math::abs(acc, betaOut) < betaOutCut));
    if (not pass)
      return pass;

    // Final delta-beta consistency: all uncertainty terms summed in quadrature.
    float dBetaRes = 0.02f / alpaka::math::min(acc, sdOut_d, drt_InSeg);
    float dBetaCut2 =
        (dBetaRes * dBetaRes * 2.0f + dBetaMuls * dBetaMuls + dBetaLum2 + dBetaRIn2 + dBetaROut2 +
         0.25f *
             (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)) *
             (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)));

    float dBeta = betaIn - betaOut;
    deltaBetaCut = alpaka::math::sqrt(acc, dBetaCut2);
    pass = pass and (dBeta * dBeta <= dBetaCut2);

    return pass;
  };

  // Quintuplet segment-pair compatibility for the mixed barrel/endcap topology
  // (name suggests barrel-barrel-endcap-endcap -- TODO confirm naming scheme).
  // Same contract as the BBBB variant: returns true when all cuts pass; the
  // float& outputs are filled progressively as a side effect.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const& acc,
                                                                   struct SDL::modules& modulesInGPU,
                                                                   struct SDL::miniDoublets& mdsInGPU,
                                                                   struct SDL::segments& segmentsInGPU,
                                                                   uint16_t& innerInnerLowerModuleIndex,
                                                                   uint16_t& innerOuterLowerModuleIndex,
                                                                   uint16_t& outerInnerLowerModuleIndex,
                                                                   uint16_t& outerOuterLowerModuleIndex,
                                                                   unsigned int& innerSegmentIndex,
                                                                   unsigned int& outerSegmentIndex,
                                                                   unsigned int& firstMDIndex,
                                                                   unsigned int& secondMDIndex,
                                                                   unsigned int& thirdMDIndex,
                                                                   unsigned int& fourthMDIndex,
                                                                   float& zOut,
                                                                   float& rtOut,
                                                                   float& deltaPhiPos,
                                                                   float& dPhi,
                                                                   float& betaIn,
                                                                   float& betaOut,
                                                                   float& pt_beta,
                                                                   float& zLo,
                                                                   float& rtLo,
                                                                   float& rtHi,
                                                                   float& sdlCut,
                                                                   float& betaInCut,
                                                                   float& betaOutCut,
                                                                   float& deltaBetaCut,
                                                                   float& kZ) {
    bool pass = true;
    bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
    bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);

    // Anchor-hit transverse radii and z positions of the first three MDs.
    float rt_InLo = mdsInGPU.anchorRt[firstMDIndex];
    float rt_InOut = mdsInGPU.anchorRt[secondMDIndex];
    float rt_OutLo = mdsInGPU.anchorRt[thirdMDIndex];

    float z_InLo = mdsInGPU.anchorZ[firstMDIndex];
    float z_InOut = mdsInGPU.anchorZ[secondMDIndex];
    float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex];

    float alpha1GeV_OutLo =
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax));

    float rtRatio_OutLoInLo = rt_OutLo / rt_InLo;  // Outer segment beginning rt divided by inner segment beginning rt;
    float dzDrtScale =
        alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo;  // The track can bend in r-z plane slightly
    float zpitch_InLo = (isPS_InLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch);
    float zpitch_OutLo = (isPS_OutLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch);
    float zGeom = zpitch_InLo + zpitch_OutLo;

    zLo = z_InLo + (z_InLo - SDL::deltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo > 0.f ? 1.f : dzDrtScale) - zGeom;

    // Cut #0: Preliminary (Only here in endcap case) -- both MDs on the same z side.
    pass = pass and (z_InLo * z_OutLo > 0);
    if (not pass)
      return pass;

    float dLum = SDL::copysignf(SDL::deltaZLum, z_InLo);
    // NOTE(review): isOutSgInnerMDPS duplicates isPS_OutLo above -- confirm intended.
    bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS;
    float rtGeom1 = isOutSgInnerMDPS ? SDL::pixelPSZpitch : SDL::strip2SZpitch;
    float zGeom1 = SDL::copysignf(zGeom, z_InLo);
    rtLo = rt_InLo * (1.f + (z_OutLo - z_InLo - zGeom1) / (z_InLo + zGeom1 + dLum) / dzDrtScale) -
           rtGeom1;  //slope correction only on the lower end
    zOut = z_OutLo;
    rtOut = rt_OutLo;

    //Cut #1: rt condition
    pass = pass and (rtOut >= rtLo);
    if (not pass)
      return pass;

    float zInForHi = z_InLo - zGeom1 - dLum;
    if (zInForHi * z_InLo < 0) {
      // Guard against the denominator crossing zero: clamp to a small same-sign value.
      zInForHi = SDL::copysignf(0.1f, z_InLo);
    }
    rtHi = rt_InLo * (1.f + (z_OutLo - z_InLo + zGeom1) / zInForHi) + rtGeom1;

    //Cut #2: rt condition
    pass = pass and ((rt_OutLo >= rtLo) && (rt_OutLo <= rtHi));
    if (not pass)
      return pass;

    float rIn = alpaka::math::sqrt(acc, z_InLo * z_InLo + rt_InLo * rt_InLo);
    const float drtSDIn = rt_InOut - rt_InLo;
    const float dzSDIn = z_InOut - z_InLo;
    const float dr3SDIn = alpaka::math::sqrt(acc, rt_InOut * rt_InOut + z_InOut * z_InOut) -
                          alpaka::math::sqrt(acc, rt_InLo * rt_InLo + z_InLo * z_InLo);

    const float coshEta = dr3SDIn / drtSDIn;  //direction estimate
    const float dzOutInAbs = alpaka::math::abs(acc, z_OutLo - z_InLo);
    const float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f);
    const float zGeom1_another = SDL::pixelPSZpitch;
    kZ = (z_OutLo - z_InLo) / dzSDIn;
    float drtErr =
        zGeom1_another * zGeom1_another * drtSDIn * drtSDIn / dzSDIn / dzSDIn * (1.f - 2.f * kZ + 2.f * kZ * kZ);
    const float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f) *
                                alpaka::math::sqrt(acc, rIn / rt_InLo);
    const float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f;  //will need a better guess than x4?
    drtErr +=
        sdlMuls * sdlMuls * multDzDr * multDzDr / 3.f * coshEta * coshEta;  //sloppy: relative muls is 1/3 of total muls
    drtErr = alpaka::math::sqrt(acc, drtErr);

    //Cut #3: rt-z pointed
    // NOTE(review): drtErr is computed above but not used in this cut (the EEEE
    // variant uses it to build rtLo_point/rtHi_point) -- looks vestigial; confirm.
    pass = pass and ((kZ >= 0) && (rtOut >= rtLo) && (rtOut <= rtHi));
    if (not pass)
      return pass;

    const float sdlPVoff = 0.1f / rt_OutLo;  // primary-vertex offset contribution
    sdlCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, sdlMuls * sdlMuls + sdlPVoff * sdlPVoff);

    deltaPhiPos = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]);

    //Cut #4: deltaPhiPos can be tighter
    pass = pass and (alpaka::math::abs(acc, deltaPhiPos) <= sdlCut);
    if (not pass)
      return pass;

    // dPhi of the first->third MD chord measured at its midpoint.
    float midPointX = 0.5f * (mdsInGPU.anchorX[firstMDIndex] + mdsInGPU.anchorX[thirdMDIndex]);
    float midPointY = 0.5f * (mdsInGPU.anchorY[firstMDIndex] + mdsInGPU.anchorY[thirdMDIndex]);
    float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex];
    float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex];

    dPhi = SDL::deltaPhi(acc, midPointX, midPointY, diffX, diffY);
    // Cut #5: deltaPhiChange
    pass = pass and (alpaka::math::abs(acc, dPhi) <= sdlCut);
    if (not pass)
      return pass;

    float sdIn_alpha = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]);
    float sdIn_alpha_min = __H2F(segmentsInGPU.dPhiChangeMins[innerSegmentIndex]);
    float sdIn_alpha_max = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]);
    float sdOut_alpha = sdIn_alpha;  //weird

    float sdOut_alphaOut = SDL::phi_mpi_pi(acc,
                                           SDL::phi(acc,
                                                    mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex],
                                                    mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) -
                                               mdsInGPU.anchorPhi[fourthMDIndex]);

    float sdOut_alphaOut_min = SDL::phi_mpi_pi(
        acc, __H2F(segmentsInGPU.dPhiChangeMins[outerSegmentIndex]) - __H2F(segmentsInGPU.dPhiMins[outerSegmentIndex]));
    float sdOut_alphaOut_max = SDL::phi_mpi_pi(
        acc, __H2F(segmentsInGPU.dPhiChangeMaxs[outerSegmentIndex]) - __H2F(segmentsInGPU.dPhiMaxs[outerSegmentIndex]));

    // Axis from the first to the fourth mini-doublet (the "tracklet" axis).
    float tl_axis_x = mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex];
    float tl_axis_y = mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex];

    betaIn = sdIn_alpha - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]);

    float betaInRHmin = betaIn;
    float betaInRHmax = betaIn;
    betaOut =
        -sdOut_alphaOut + SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]);

    float betaOutRHmin = betaOut;
    float betaOutRHmax = betaOut;

    bool isEC_secondLayer = (modulesInGPU.subdets[innerOuterLowerModuleIndex] == SDL::Endcap) and
                            (modulesInGPU.moduleType[innerOuterLowerModuleIndex] == SDL::TwoS);

    if (isEC_secondLayer) {
      // Bracket betaIn by the inner segment's dPhiChange spread (2S endcap only).
      betaInRHmin = betaIn - sdIn_alpha_min + sdIn_alpha;
      betaInRHmax = betaIn - sdIn_alpha_max + sdIn_alpha;
    }

    betaOutRHmin = betaOut - sdOut_alphaOut_min + sdOut_alphaOut;
    betaOutRHmax = betaOut - sdOut_alphaOut_max + sdOut_alphaOut;

    // Order the brackets so that RHmin holds the smaller magnitude.
    float swapTemp;
    if (alpaka::math::abs(acc, betaOutRHmin) > alpaka::math::abs(acc, betaOutRHmax)) {
      swapTemp = betaOutRHmin;
      betaOutRHmin = betaOutRHmax;
      betaOutRHmax = swapTemp;
    }

    if (alpaka::math::abs(acc, betaInRHmin) > alpaka::math::abs(acc, betaInRHmax)) {
      swapTemp = betaInRHmin;
      betaInRHmin = betaInRHmax;
      betaInRHmax = swapTemp;
    }

    float sdIn_dr = alpaka::math::sqrt(acc,
                                       (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) *
                                               (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) +
                                           (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) *
                                               (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]));
    float sdIn_d = rt_InOut - rt_InLo;

    float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y);
    const float corrF = 1.f;
    betaInCut =
        alpaka::math::asin(
            acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) +
        (0.02f / sdIn_d);

    //Cut #6: first beta cut
    pass = pass and (alpaka::math::abs(acc, betaInRHmin) < betaInCut);
    if (not pass)
      return pass;

    float betaAv = 0.5f * (betaIn + betaOut);
    pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv);

    // Layer codes fed to the beta iteration: 5 = barrel, 11 = endcap.
    float lIn = 5;
    float lOut = 11;

    float sdOut_dr = alpaka::math::sqrt(acc,
                                        (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) *
                                                (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) +
                                            (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) *
                                                (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]));
    float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex];

    // Refine betaIn/betaOut/betaAv/pt_beta in place.
    SDL::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn);

    // Rescale the min/max brackets so their mean matches the refined beta.
    const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0)
                                 ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax))
                                 : 0.;  //mean value of min,max is the old betaIn
    const float betaOutMMSF = (alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax) > 0)
                                  ? (2.f * betaOut / alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax))
                                  : 0.;
    betaInRHmin *= betaInMMSF;
    betaInRHmax *= betaInMMSF;
    betaOutRHmin *= betaOutMMSF;
    betaOutRHmax *= betaOutMMSF;

    const float dBetaMuls =
        sdlThetaMulsF * 4.f /
        alpaka::math::min(
            acc, alpaka::math::abs(acc, pt_beta), SDL::pt_betaMax);  //need to confirm the range-out value of 7 GeV

    const float alphaInAbsReg = alpaka::math::max(
        acc,
        alpaka::math::abs(acc, sdIn_alpha),
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax)));
    const float alphaOutAbsReg = alpaka::math::max(
        acc,
        alpaka::math::abs(acc, sdOut_alpha),
        alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax)));
    // Luminous-region contributions only apply for endcap layers (code >= 11).
    const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * SDL::deltaZLum / z_InLo);
    const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * SDL::deltaZLum / z_OutLo);
    const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum);
    const float sinDPhi = alpaka::math::sin(acc, dPhi);

    const float dBetaRIn2 = 0;  // TODO-RH
    float dBetaROut = 0;
    if (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::TwoS) {
      // Radial uncertainty of a 2S hit from its strip-edge positions.
      dBetaROut =
          (alpaka::math::sqrt(acc,
                              mdsInGPU.anchorHighEdgeX[fourthMDIndex] * mdsInGPU.anchorHighEdgeX[fourthMDIndex] +
                                  mdsInGPU.anchorHighEdgeY[fourthMDIndex] * mdsInGPU.anchorHighEdgeY[fourthMDIndex]) -
           alpaka::math::sqrt(acc,
                              mdsInGPU.anchorLowEdgeX[fourthMDIndex] * mdsInGPU.anchorLowEdgeX[fourthMDIndex] +
                                  mdsInGPU.anchorLowEdgeY[fourthMDIndex] * mdsInGPU.anchorLowEdgeY[fourthMDIndex])) *
          sinDPhi / dr;
    }

    const float dBetaROut2 = dBetaROut * dBetaROut;
    //FIXME: need faster version
    betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, dr * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) +
                 (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls * dBetaMuls);

    //Cut #6: The real beta cut (label duplicated in the original numbering)
    pass = pass and (alpaka::math::abs(acc, betaOut) < betaOutCut);
    if (not pass)
      return pass;

    // Final delta-beta consistency: all uncertainty terms summed in quadrature.
    float dBetaRes = 0.02f / alpaka::math::min(acc, sdOut_d, sdIn_d);
    float dBetaCut2 =
        (dBetaRes * dBetaRes * 2.0f + dBetaMuls * dBetaMuls + dBetaLum2 + dBetaRIn2 + dBetaROut2 +
         0.25f *
             (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)) *
             (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)));
    float dBeta = betaIn - betaOut;
    deltaBetaCut = alpaka::math::sqrt(acc, dBetaCut2);
    //Cut #7: Cut on dBeta
    pass = pass and (dBeta * dBeta <= dBetaCut2);

    return pass;
  };

  // Quintuplet segment-pair compatibility for the all-endcap module topology
  // (name suggests endcap-endcap-endcap-endcap -- TODO confirm naming scheme).
  // Same contract as the BBBB/BBEE variants.
  template <typename TAcc>
  ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const& acc,
                                                                   struct SDL::modules& modulesInGPU,
                                                                   struct SDL::miniDoublets& mdsInGPU,
                                                                   struct SDL::segments& segmentsInGPU,
                                                                   uint16_t& innerInnerLowerModuleIndex,
                                                                   uint16_t& innerOuterLowerModuleIndex,
                                                                   uint16_t& outerInnerLowerModuleIndex,
                                                                   uint16_t& outerOuterLowerModuleIndex,
                                                                   unsigned int& innerSegmentIndex,
                                                                   unsigned int& outerSegmentIndex,
                                                                   unsigned int& firstMDIndex,
                                                                   unsigned int& secondMDIndex,
                                                                   unsigned int& thirdMDIndex,
                                                                   unsigned int& fourthMDIndex,
                                                                   float& zOut,
                                                                   float& rtOut,
                                                                   float& deltaPhiPos,
                                                                   float& dPhi,
                                                                   float& betaIn,
                                                                   float& betaOut,
                                                                   float& pt_beta,
                                                                   float& zLo,
                                                                   float& rtLo,
                                                                   float& rtHi,
                                                                   float& sdlCut,
                                                                   float& betaInCut,
                                                                   float& betaOutCut,
                                                                   float& deltaBetaCut,
                                                                   float& kZ) {
    bool pass = true;

    bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS);
    bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS);

    // Anchor-hit transverse radii and z positions of the first three MDs.
    float rt_InLo = mdsInGPU.anchorRt[firstMDIndex];
    float rt_InOut = mdsInGPU.anchorRt[secondMDIndex];
    float rt_OutLo = mdsInGPU.anchorRt[thirdMDIndex];

    float z_InLo = mdsInGPU.anchorZ[firstMDIndex];
    float z_InOut = mdsInGPU.anchorZ[secondMDIndex];
float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; + + float alpha1GeV_OutLo = + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)); + + float rtRatio_OutLoInLo = rt_OutLo / rt_InLo; // Outer segment beginning rt divided by inner segment beginning rt; + float dzDrtScale = + alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly + float zpitch_InLo = (isPS_InLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + float zpitch_OutLo = (isPS_OutLo ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + float zGeom = zpitch_InLo + zpitch_OutLo; + + zLo = z_InLo + (z_InLo - SDL::deltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo > 0.f ? 1.f : dzDrtScale) - + zGeom; //slope-correction only on outer end + + // Cut #0: Preliminary (Only here in endcap case) + pass = pass and ((z_InLo * z_OutLo) > 0); + if (not pass) + return pass; + + float dLum = SDL::copysignf(SDL::deltaZLum, z_InLo); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == SDL::PS; + bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS; + + float rtGeom = (isInSgInnerMDPS and isOutSgInnerMDPS) ? 2.f * SDL::pixelPSZpitch + : (isInSgInnerMDPS or isOutSgInnerMDPS) ? 
SDL::pixelPSZpitch + SDL::strip2SZpitch + : 2.f * SDL::strip2SZpitch; + + float dz = z_OutLo - z_InLo; + rtLo = rt_InLo * (1.f + dz / (z_InLo + dLum) / dzDrtScale) - rtGeom; //slope correction only on the lower end + + zOut = z_OutLo; + rtOut = rt_OutLo; + + //Cut #1: rt condition + + rtHi = rt_InLo * (1.f + dz / (z_InLo - dLum)) + rtGeom; + + pass = pass and ((rtOut >= rtLo) && (rtOut <= rtHi)); + if (not pass) + return pass; + + bool isInSgOuterMDPS = modulesInGPU.moduleType[innerOuterLowerModuleIndex] == SDL::PS; + + const float drtSDIn = rt_InOut - rt_InLo; + const float dzSDIn = z_InOut - z_InLo; + const float dr3SDIn = alpaka::math::sqrt(acc, rt_InOut * rt_InOut + z_InOut * z_InOut) - + alpaka::math::sqrt(acc, rt_InLo * rt_InLo + z_InLo * z_InLo); + float coshEta = dr3SDIn / drtSDIn; //direction estimate + float dzOutInAbs = alpaka::math::abs(acc, z_OutLo - z_InLo); + float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); + + kZ = (z_OutLo - z_InLo) / dzSDIn; + float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f); + + float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f; //will need a better guess than x4? 
+ + float drtErr = alpaka::math::sqrt( + acc, + SDL::pixelPSZpitch * SDL::pixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + + sdlMuls * sdlMuls * multDzDr * multDzDr / 3.f * coshEta * coshEta); + + float drtMean = drtSDIn * dzOutInAbs / alpaka::math::abs(acc, dzSDIn); + float rtWindow = drtErr + rtGeom; + float rtLo_point = rt_InLo + drtMean / dzDrtScale - rtWindow; + float rtHi_point = rt_InLo + drtMean + rtWindow; + + // Cut #3: rt-z pointed + // https://github.com/slava77/cms-tkph2-ntuple/blob/superDoubletLinked-91X-noMock/doubletAnalysis.C#L3765 + + if (isInSgInnerMDPS and isInSgOuterMDPS) // If both PS then we can point + { + pass = pass and (kZ >= 0 and rtOut >= rtLo_point and rtOut <= rtHi_point); + if (not pass) + return pass; + } + + float sdlPVoff = 0.1f / rtOut; + sdlCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, sdlMuls * sdlMuls + sdlPVoff * sdlPVoff); + + deltaPhiPos = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); + + pass = pass and (alpaka::math::abs(acc, deltaPhiPos) <= sdlCut); + if (not pass) + return pass; + + float midPointX = 0.5f * (mdsInGPU.anchorX[firstMDIndex] + mdsInGPU.anchorX[thirdMDIndex]); + float midPointY = 0.5f * (mdsInGPU.anchorY[firstMDIndex] + mdsInGPU.anchorY[thirdMDIndex]); + float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; + float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; + + dPhi = SDL::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + + // Cut #5: deltaPhiChange + pass = pass and ((alpaka::math::abs(acc, dPhi) <= sdlCut)); + if (not pass) + return pass; + + float sdIn_alpha = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + float sdOut_alpha = sdIn_alpha; //weird + float sdOut_dPhiPos = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[thirdMDIndex]); + + float sdOut_dPhiChange = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); + float sdOut_dPhiChange_min = 
__H2F(segmentsInGPU.dPhiChangeMins[outerSegmentIndex]); + float sdOut_dPhiChange_max = __H2F(segmentsInGPU.dPhiChangeMaxs[outerSegmentIndex]); + + float sdOut_alphaOutRHmin = SDL::phi_mpi_pi(acc, sdOut_dPhiChange_min - sdOut_dPhiPos); + float sdOut_alphaOutRHmax = SDL::phi_mpi_pi(acc, sdOut_dPhiChange_max - sdOut_dPhiPos); + float sdOut_alphaOut = SDL::phi_mpi_pi(acc, sdOut_dPhiChange - sdOut_dPhiPos); + + float tl_axis_x = mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex]; + float tl_axis_y = mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex]; + + betaIn = sdIn_alpha - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + + float sdIn_alphaRHmin = __H2F(segmentsInGPU.dPhiChangeMins[innerSegmentIndex]); + float sdIn_alphaRHmax = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]); + float betaInRHmin = betaIn + sdIn_alphaRHmin - sdIn_alpha; + float betaInRHmax = betaIn + sdIn_alphaRHmax - sdIn_alpha; + + betaOut = + -sdOut_alphaOut + SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); + + float betaOutRHmin = betaOut - sdOut_alphaOutRHmin + sdOut_alphaOut; + float betaOutRHmax = betaOut - sdOut_alphaOutRHmax + sdOut_alphaOut; + + float swapTemp; + if (alpaka::math::abs(acc, betaOutRHmin) > alpaka::math::abs(acc, betaOutRHmax)) { + swapTemp = betaOutRHmin; + betaOutRHmin = betaOutRHmax; + betaOutRHmax = swapTemp; + } + + if (alpaka::math::abs(acc, betaInRHmin) > alpaka::math::abs(acc, betaInRHmax)) { + swapTemp = betaInRHmin; + betaInRHmin = betaInRHmax; + betaInRHmax = swapTemp; + } + float sdIn_dr = alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) * + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) + + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) * + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex])); + float sdIn_d = rt_InOut - rt_InLo; + + float 
dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + const float corrF = 1.f; + betaInCut = + alpaka::math::asin( + acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) + + (0.02f / sdIn_d); + + //Cut #6: first beta cut + pass = pass and (alpaka::math::abs(acc, betaInRHmin) < betaInCut); + if (not pass) + return pass; + + float betaAv = 0.5f * (betaIn + betaOut); + pt_beta = dr * SDL::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); + + int lIn = 11; //endcap + int lOut = 13; //endcap + + float sdOut_dr = alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) * + (mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex]) + + (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) * + (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex])); + float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex]; + + SDL::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn); + + const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) + ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) + : 0.; //mean value of min,max is the old betaIn + const float betaOutMMSF = (alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax) > 0) + ? 
(2.f * betaOut / alpaka::math::abs(acc, betaOutRHmin + betaOutRHmax)) + : 0.; + betaInRHmin *= betaInMMSF; + betaInRHmax *= betaInMMSF; + betaOutRHmin *= betaOutMMSF; + betaOutRHmax *= betaOutMMSF; + + const float dBetaMuls = + sdlThetaMulsF * 4.f / + alpaka::math::min( + acc, alpaka::math::abs(acc, pt_beta), SDL::pt_betaMax); //need to confirm the range-out value of 7 GeV + + const float alphaInAbsReg = alpaka::math::max( + acc, + alpaka::math::abs(acc, sdIn_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax))); + const float alphaOutAbsReg = alpaka::math::max( + acc, + alpaka::math::abs(acc, sdOut_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * SDL::k2Rinv1GeVf / 3.0f, SDL::sinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * SDL::deltaZLum / z_InLo); + const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * SDL::deltaZLum / z_OutLo); + const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); + + const float dBetaRIn2 = 0; // TODO-RH + + float dBetaROut2 = 0; //TODO-RH + //FIXME: need faster version + betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, dr * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) + + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls * dBetaMuls); + + //Cut #6: The real beta cut + pass = pass and (alpaka::math::abs(acc, betaOut) < betaOutCut); + if (not pass) + return pass; + + float dBetaRes = 0.02f / alpaka::math::min(acc, sdOut_d, sdIn_d); + float dBetaCut2 = + (dBetaRes * dBetaRes * 2.0f + dBetaMuls * dBetaMuls + dBetaLum2 + dBetaRIn2 + dBetaROut2 + + 0.25f * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax)) * + (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax))); + float dBeta = betaIn - betaOut; + //Cut #7: Cut on dBeta + 
deltaBetaCut = alpaka::math::sqrt(acc, dBetaCut2); + + pass = pass and (dBeta * dBeta <= dBetaCut2); + + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& innerOuterLowerModuleIndex, + uint16_t& outerInnerLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + unsigned int& fourthMDIndex, + float& zOut, + float& rtOut, + float& deltaPhiPos, + float& deltaPhi, + float& betaIn, + float& betaOut, + float& pt_beta, + float& zLo, + float& zHi, + float& rtLo, + float& rtHi, + float& zLoPointed, + float& zHiPointed, + float& sdlCut, + float& betaInCut, + float& betaOutCut, + float& deltaBetaCut, + float& kZ) { + bool pass = false; + + zLo = -999; + zHi = -999; + rtLo = -999; + rtHi = -999; + zLoPointed = -999; + zHiPointed = -999; + kZ = -999; + betaInCut = -999; + + short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; + short innerOuterLowerModuleSubdet = modulesInGPU.subdets[innerOuterLowerModuleIndex]; + short outerInnerLowerModuleSubdet = modulesInGPU.subdets[outerInnerLowerModuleIndex]; + short outerOuterLowerModuleSubdet = modulesInGPU.subdets[outerOuterLowerModuleIndex]; + + if (innerInnerLowerModuleSubdet == SDL::Barrel and innerOuterLowerModuleSubdet == SDL::Barrel and + outerInnerLowerModuleSubdet == SDL::Barrel and outerOuterLowerModuleSubdet == SDL::Barrel) { + return runQuintupletDefaultAlgoBBBB(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + innerOuterLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + 
thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut); + } else if (innerInnerLowerModuleSubdet == SDL::Barrel and innerOuterLowerModuleSubdet == SDL::Barrel and + outerInnerLowerModuleSubdet == SDL::Endcap and outerOuterLowerModuleSubdet == SDL::Endcap) { + return runQuintupletDefaultAlgoBBEE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + innerOuterLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + rtLo, + rtHi, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + } else if (innerInnerLowerModuleSubdet == SDL::Barrel and innerOuterLowerModuleSubdet == SDL::Barrel and + outerInnerLowerModuleSubdet == SDL::Barrel and outerOuterLowerModuleSubdet == SDL::Endcap) { + return runQuintupletDefaultAlgoBBBB(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + innerOuterLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut); + } else if (innerInnerLowerModuleSubdet == SDL::Barrel and innerOuterLowerModuleSubdet == SDL::Endcap and + outerInnerLowerModuleSubdet == SDL::Endcap and outerOuterLowerModuleSubdet == SDL::Endcap) { + return runQuintupletDefaultAlgoBBEE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + innerOuterLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + 
innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + rtLo, + rtHi, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + } else if (innerInnerLowerModuleSubdet == SDL::Endcap and innerOuterLowerModuleSubdet == SDL::Endcap and + outerInnerLowerModuleSubdet == SDL::Endcap and outerOuterLowerModuleSubdet == SDL::Endcap) { + return runQuintupletDefaultAlgoEEEE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + innerOuterLowerModuleIndex, + outerInnerLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + rtLo, + rtHi, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + } + + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::triplets& tripletsInGPU, + uint16_t& lowerModuleIndex1, + uint16_t& lowerModuleIndex2, + uint16_t& lowerModuleIndex3, + uint16_t& lowerModuleIndex4, + uint16_t& lowerModuleIndex5, + unsigned int& innerTripletIndex, + unsigned int& outerTripletIndex, + float& innerRadius, + float& outerRadius, + float& bridgeRadius, + float& regressionG, + float& regressionF, + float& regressionRadius, + float& rzChiSquared, + float& chiSquared, + float& nonAnchorChiSquared, + bool& TightCutFlag) { + bool pass = true; + unsigned int firstSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex]; + unsigned int secondSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; + unsigned int thirdSegmentIndex = tripletsInGPU.segmentIndices[2 * outerTripletIndex]; + unsigned int fourthSegmentIndex = 
tripletsInGPU.segmentIndices[2 * outerTripletIndex + 1]; + + unsigned int innerOuterOuterMiniDoubletIndex = + segmentsInGPU.mdIndices[2 * secondSegmentIndex + 1]; //inner triplet outer segment outer MD index + unsigned int outerInnerInnerMiniDoubletIndex = + segmentsInGPU.mdIndices[2 * thirdSegmentIndex]; //outer triplet inner segment inner MD index + + //this cut reduces the number of candidates by a factor of 3, i.e., 2 out of 3 warps can end right here! + if (innerOuterOuterMiniDoubletIndex != outerInnerInnerMiniDoubletIndex) + return false; + + //apply T4 criteria between segments 1 and 3 + float zOut, rtOut, deltaPhiPos, deltaPhi, betaIn, betaOut, pt_beta; //temp stuff + float zLo, zHi, rtLo, rtHi, zLoPointed, zHiPointed, sdlCut, betaInCut, betaOutCut, deltaBetaCut, kZ; + + unsigned int firstMDIndex = segmentsInGPU.mdIndices[2 * firstSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * secondSegmentIndex]; + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * secondSegmentIndex + 1]; + unsigned int fourthMDIndex = segmentsInGPU.mdIndices[2 * thirdSegmentIndex + 1]; + unsigned int fifthMDIndex = segmentsInGPU.mdIndices[2 * fourthSegmentIndex + 1]; + + pass = pass and runQuintupletAlgoSelector(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + firstSegmentIndex, + thirdSegmentIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + if (not pass) + return pass; + + pass = pass and runQuintupletAlgoSelector(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex4, + lowerModuleIndex5, + firstSegmentIndex, + fourthSegmentIndex, + firstMDIndex, + secondMDIndex, + fourthMDIndex, + fifthMDIndex, + 
zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + betaOut, + pt_beta, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + betaOutCut, + deltaBetaCut, + kZ); + if (not pass) + return pass; + + float x1 = mdsInGPU.anchorX[firstMDIndex]; + float x2 = mdsInGPU.anchorX[secondMDIndex]; + float x3 = mdsInGPU.anchorX[thirdMDIndex]; + float x4 = mdsInGPU.anchorX[fourthMDIndex]; + float x5 = mdsInGPU.anchorX[fifthMDIndex]; + + float y1 = mdsInGPU.anchorY[firstMDIndex]; + float y2 = mdsInGPU.anchorY[secondMDIndex]; + float y3 = mdsInGPU.anchorY[thirdMDIndex]; + float y4 = mdsInGPU.anchorY[fourthMDIndex]; + float y5 = mdsInGPU.anchorY[fifthMDIndex]; + + //construct the arrays + float x1Vec[] = {x1, x1, x1}; + float y1Vec[] = {y1, y1, y1}; + float x2Vec[] = {x2, x2, x2}; + float y2Vec[] = {y2, y2, y2}; + float x3Vec[] = {x3, x3, x3}; + float y3Vec[] = {y3, y3, y3}; + + if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == SDL::TwoS) { + x1Vec[1] = mdsInGPU.anchorLowEdgeX[firstMDIndex]; + x1Vec[2] = mdsInGPU.anchorHighEdgeX[firstMDIndex]; + + y1Vec[1] = mdsInGPU.anchorLowEdgeY[firstMDIndex]; + y1Vec[2] = mdsInGPU.anchorHighEdgeY[firstMDIndex]; + } + if (modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == SDL::TwoS) { + x2Vec[1] = mdsInGPU.anchorLowEdgeX[secondMDIndex]; + x2Vec[2] = mdsInGPU.anchorHighEdgeX[secondMDIndex]; + + y2Vec[1] = mdsInGPU.anchorLowEdgeY[secondMDIndex]; + y2Vec[2] = mdsInGPU.anchorHighEdgeY[secondMDIndex]; + } + if (modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == SDL::TwoS) { + x3Vec[1] = mdsInGPU.anchorLowEdgeX[thirdMDIndex]; + x3Vec[2] = mdsInGPU.anchorHighEdgeX[thirdMDIndex]; + + y3Vec[1] = mdsInGPU.anchorLowEdgeY[thirdMDIndex]; + y3Vec[2] = mdsInGPU.anchorHighEdgeY[thirdMDIndex]; + } + + float innerRadiusMin2S, innerRadiusMax2S; + 
computeErrorInRadius(acc, x1Vec, y1Vec, x2Vec, y2Vec, x3Vec, y3Vec, innerRadiusMin2S, innerRadiusMax2S); + + for (int i = 0; i < 3; i++) { + x1Vec[i] = x4; + y1Vec[i] = y4; + } + if (modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == SDL::TwoS) { + x1Vec[1] = mdsInGPU.anchorLowEdgeX[fourthMDIndex]; + x1Vec[2] = mdsInGPU.anchorHighEdgeX[fourthMDIndex]; + + y1Vec[1] = mdsInGPU.anchorLowEdgeY[fourthMDIndex]; + y1Vec[2] = mdsInGPU.anchorHighEdgeY[fourthMDIndex]; + } + + float bridgeRadiusMin2S, bridgeRadiusMax2S; + computeErrorInRadius(acc, x2Vec, y2Vec, x3Vec, y3Vec, x1Vec, y1Vec, bridgeRadiusMin2S, bridgeRadiusMax2S); + + for (int i = 0; i < 3; i++) { + x2Vec[i] = x5; + y2Vec[i] = y5; + } + if (modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == SDL::TwoS) { + x2Vec[1] = mdsInGPU.anchorLowEdgeX[fifthMDIndex]; + x2Vec[2] = mdsInGPU.anchorHighEdgeX[fifthMDIndex]; + + y2Vec[1] = mdsInGPU.anchorLowEdgeY[fifthMDIndex]; + y2Vec[2] = mdsInGPU.anchorHighEdgeY[fifthMDIndex]; + } + + float outerRadiusMin2S, outerRadiusMax2S; + computeErrorInRadius(acc, x3Vec, y3Vec, x1Vec, y1Vec, x2Vec, y2Vec, outerRadiusMin2S, outerRadiusMax2S); + + float g, f; + outerRadius = tripletsInGPU.circleRadius[outerTripletIndex]; + bridgeRadius = computeRadiusFromThreeAnchorHits(acc, x2, y2, x3, y3, x4, y4, g, f); + innerRadius = tripletsInGPU.circleRadius[innerTripletIndex]; + g = tripletsInGPU.circleCenterX[innerTripletIndex]; + f = tripletsInGPU.circleCenterY[innerTripletIndex]; + +#ifdef USE_RZCHI2 + float inner_pt = 2 * k2Rinv1GeVf * innerRadius; + + bool passRZChi2 = passT5RZConstraint(acc, + modulesInGPU, + mdsInGPU, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + fourthMDIndex, + fifthMDIndex, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + lowerModuleIndex5, + rzChiSquared, + inner_pt, + innerRadius, + g, + f, + TightCutFlag); + pass = pass 
and passRZChi2; + if (not pass) + return pass; +#else + rzChiSquared = -1; +#endif + pass = pass && (innerRadius >= 0.95f * ptCut / (2.f * k2Rinv1GeVf)); + + float innerInvRadiusMin, innerInvRadiusMax, bridgeInvRadiusMin, bridgeInvRadiusMax, outerInvRadiusMin, + outerInvRadiusMax; + + //split by category + bool tempPass; + if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex5] == SDL::Barrel) { + tempPass = matchRadiiBBBBB(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) { + tempPass = matchRadiiBBBBE(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) { + if (modulesInGPU.layers[lowerModuleIndex1] == 1) { + tempPass = matchRadiiBBBEE12378(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, 
+ bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else if (modulesInGPU.layers[lowerModuleIndex1] == 2) { + tempPass = matchRadiiBBBEE23478(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else { + tempPass = matchRadiiBBBEE34578(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } + } + + else if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) { + tempPass = matchRadiiBBEEE(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else if (modulesInGPU.subdets[lowerModuleIndex1] == SDL::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex3] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex4] == SDL::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == SDL::Endcap) { + tempPass = matchRadiiBEEEE(acc, + innerRadius, + bridgeRadius, + outerRadius, + 
innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } else { + tempPass = matchRadiiEEEEE(acc, + innerRadius, + bridgeRadius, + outerRadius, + innerRadiusMin2S, + innerRadiusMax2S, + bridgeRadiusMin2S, + bridgeRadiusMax2S, + outerRadiusMin2S, + outerRadiusMax2S, + innerInvRadiusMin, + innerInvRadiusMax, + bridgeInvRadiusMin, + bridgeInvRadiusMax, + outerInvRadiusMin, + outerInvRadiusMax); + } + + //compute regression radius right here - this computation is expensive!!! + pass = pass and tempPass; + if (not pass) + return pass; + + float xVec[] = {x1, x2, x3, x4, x5}; + float yVec[] = {y1, y2, y3, y4, y5}; + const uint16_t lowerModuleIndices[] = { + lowerModuleIndex1, lowerModuleIndex2, lowerModuleIndex3, lowerModuleIndex4, lowerModuleIndex5}; + + // 5 categories for sigmas + float sigmas[5], delta1[5], delta2[5], slopes[5]; + bool isFlat[5]; + + computeSigmasForRegression(acc, modulesInGPU, lowerModuleIndices, delta1, delta2, slopes, isFlat); + regressionRadius = computeRadiusUsingRegression( + acc, 5, xVec, yVec, delta1, delta2, slopes, isFlat, regressionG, regressionF, sigmas, chiSquared); + +#ifdef USE_T5_DNN + unsigned int mdIndices[] = {firstMDIndex, secondMDIndex, thirdMDIndex, fourthMDIndex, fifthMDIndex}; + float inference = T5DNN::runInference(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + xVec, + yVec, + mdIndices, + lowerModuleIndices, + innerTripletIndex, + outerTripletIndex, + innerRadius, + outerRadius, + bridgeRadius); + pass = pass and (inference > T5DNN::LSTWP2); // T5-building cut + TightCutFlag = TightCutFlag and (inference > T5DNN::LSTWP2); // T5-in-TC cut + if (not pass) + return pass; +#endif + +#ifdef USE_RPHICHI2 + // extra chi squared cuts! 
+ if (regressionRadius < 5.0f / (2.f * k2Rinv1GeVf)) { + pass = pass and passChiSquaredConstraint(modulesInGPU, + lowerModuleIndex1, + lowerModuleIndex2, + lowerModuleIndex3, + lowerModuleIndex4, + lowerModuleIndex5, + chiSquared); + if (not pass) + return pass; + } +#endif + + //compute the other chisquared + //non anchor is always shifted for tilted and endcap! + float nonAnchorDelta1[5], nonAnchorDelta2[5], nonAnchorSlopes[5]; + float nonAnchorxs[] = {mdsInGPU.outerX[firstMDIndex], + mdsInGPU.outerX[secondMDIndex], + mdsInGPU.outerX[thirdMDIndex], + mdsInGPU.outerX[fourthMDIndex], + mdsInGPU.outerX[fifthMDIndex]}; + float nonAnchorys[] = {mdsInGPU.outerY[firstMDIndex], + mdsInGPU.outerY[secondMDIndex], + mdsInGPU.outerY[thirdMDIndex], + mdsInGPU.outerY[fourthMDIndex], + mdsInGPU.outerY[fifthMDIndex]}; + + computeSigmasForRegression( + acc, modulesInGPU, lowerModuleIndices, nonAnchorDelta1, nonAnchorDelta2, nonAnchorSlopes, isFlat, 5, false); + nonAnchorChiSquared = computeChiSquared(acc, + 5, + nonAnchorxs, + nonAnchorys, + nonAnchorDelta1, + nonAnchorDelta2, + nonAnchorSlopes, + isFlat, + regressionG, + regressionF, + regressionRadius); + return pass; + }; + + struct createQuintupletsInGPUv2 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::objectRanges rangesInGPU, + uint16_t nEligibleT5Modules) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (int iter = globalThreadIdx[0]; iter < nEligibleT5Modules; iter += gridThreadExtent[0]) { + uint16_t lowerModule1 = rangesInGPU.indicesOfEligibleT5Modules[iter]; + short layer2_adjustment; + int layer = modulesInGPU.layers[lowerModule1]; + if (layer == 1) { + layer2_adjustment = 1; + } // get upper segment to be in second layer + 
else if (layer == 2) { + layer2_adjustment = 0; + } // get lower segment to be in second layer + else { + continue; + } + unsigned int nInnerTriplets = tripletsInGPU.nTriplets[lowerModule1]; + for (unsigned int innerTripletArrayIndex = globalThreadIdx[1]; innerTripletArrayIndex < nInnerTriplets; + innerTripletArrayIndex += gridThreadExtent[1]) { + unsigned int innerTripletIndex = rangesInGPU.tripletModuleIndices[lowerModule1] + innerTripletArrayIndex; + uint16_t lowerModule2 = tripletsInGPU.lowerModuleIndices[3 * innerTripletIndex + 1]; + uint16_t lowerModule3 = tripletsInGPU.lowerModuleIndices[3 * innerTripletIndex + 2]; + unsigned int nOuterTriplets = tripletsInGPU.nTriplets[lowerModule3]; + for (unsigned int outerTripletArrayIndex = globalThreadIdx[2]; outerTripletArrayIndex < nOuterTriplets; + outerTripletArrayIndex += gridThreadExtent[2]) { + unsigned int outerTripletIndex = rangesInGPU.tripletModuleIndices[lowerModule3] + outerTripletArrayIndex; + uint16_t lowerModule4 = tripletsInGPU.lowerModuleIndices[3 * outerTripletIndex + 1]; + uint16_t lowerModule5 = tripletsInGPU.lowerModuleIndices[3 * outerTripletIndex + 2]; + + float innerRadius, outerRadius, bridgeRadius, regressionG, regressionF, regressionRadius, rzChiSquared, + chiSquared, nonAnchorChiSquared; //required for making distributions + + bool TightCutFlag = false; + bool success = runQuintupletDefaultAlgo(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + lowerModule5, + innerTripletIndex, + outerTripletIndex, + innerRadius, + outerRadius, + bridgeRadius, + regressionG, + regressionF, + regressionRadius, + rzChiSquared, + chiSquared, + nonAnchorChiSquared, + TightCutFlag); + + if (success) { + int totOccupancyQuintuplets = + alpaka::atomicOp(acc, &quintupletsInGPU.totOccupancyQuintuplets[lowerModule1], 1u); + if (totOccupancyQuintuplets >= rangesInGPU.quintupletModuleOccupancy[lowerModule1]) { +#ifdef Warnings + 
printf("Quintuplet excess alert! Module index = %d\n", lowerModule1); +#endif + } else { + int quintupletModuleIndex = + alpaka::atomicOp(acc, &quintupletsInGPU.nQuintuplets[lowerModule1], 1u); + //this if statement should never get executed! + if (rangesInGPU.quintupletModuleIndices[lowerModule1] == -1) { +#ifdef Warnings + printf("Quintuplets : no memory for module at module index = %d\n", lowerModule1); +#endif + } else { + unsigned int quintupletIndex = + rangesInGPU.quintupletModuleIndices[lowerModule1] + quintupletModuleIndex; + float phi = + mdsInGPU.anchorPhi[segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + + layer2_adjustment]]]; + float eta = + mdsInGPU.anchorEta[segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + + layer2_adjustment]]]; + float pt = (innerRadius + outerRadius) * 3.8f * 1.602f / (2 * 100 * 5.39f); + float scores = chiSquared + nonAnchorChiSquared; + addQuintupletToMemory(tripletsInGPU, + quintupletsInGPU, + innerTripletIndex, + outerTripletIndex, + lowerModule1, + lowerModule2, + lowerModule3, + lowerModule4, + lowerModule5, + innerRadius, + bridgeRadius, + outerRadius, + regressionG, + regressionF, + regressionRadius, + rzChiSquared, + chiSquared, + nonAnchorChiSquared, + pt, + eta, + phi, + scores, + layer, + quintupletIndex, + TightCutFlag); + + tripletsInGPU.partOfT5[quintupletsInGPU.tripletIndices[2 * quintupletIndex]] = true; + tripletsInGPU.partOfT5[quintupletsInGPU.tripletIndices[2 * quintupletIndex + 1]] = true; + } + } + } + } + } + } + } + }; + + struct createEligibleModulesListForQuintupletsGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + // Initialize variables in shared memory and set to 0 + int& nEligibleT5Modulesx = 
alpaka::declareSharedVar(acc); + int& nTotalQuintupletsx = alpaka::declareSharedVar(acc); + nTotalQuintupletsx = 0; + nEligibleT5Modulesx = 0; + alpaka::syncBlockThreads(acc); + + // Initialize variables outside of the for loop. + int occupancy, category_number, eta_number; + + for (int i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + // Condition for a quintuple to exist for a module + // TCs don't exist for layers 5 and 6 barrel, and layers 2,3,4,5 endcap + short module_rings = modulesInGPU.rings[i]; + short module_layers = modulesInGPU.layers[i]; + short module_subdets = modulesInGPU.subdets[i]; + float module_eta = alpaka::math::abs(acc, modulesInGPU.eta[i]); + + if (tripletsInGPU.nTriplets[i] == 0) + continue; + if (module_subdets == SDL::Barrel and module_layers >= 3) + continue; + if (module_subdets == SDL::Endcap and module_layers > 1) + continue; + + int nEligibleT5Modules = alpaka::atomicOp(acc, &nEligibleT5Modulesx, 1); + + if (module_layers <= 3 && module_subdets == 5) + category_number = 0; + else if (module_layers >= 4 && module_subdets == 5) + category_number = 1; + else if (module_layers <= 2 && module_subdets == 4 && module_rings >= 11) + category_number = 2; + else if (module_layers >= 3 && module_subdets == 4 && module_rings >= 8) + category_number = 2; + else if (module_layers <= 2 && module_subdets == 4 && module_rings <= 10) + category_number = 3; + else if (module_layers >= 3 && module_subdets == 4 && module_rings <= 7) + category_number = 3; + else + category_number = -1; + + if (module_eta < 0.75) + eta_number = 0; + else if (module_eta > 0.75 && module_eta < 1.5) + eta_number = 1; + else if (module_eta > 1.5 && module_eta < 2.25) + eta_number = 2; + else if (module_eta > 2.25 && module_eta < 3) + eta_number = 3; + else + eta_number = -1; + + if (category_number == 0 && eta_number == 0) + occupancy = 336; + else if (category_number == 0 && eta_number == 1) + occupancy = 414; + else if (category_number 
== 0 && eta_number == 2) + occupancy = 231; + else if (category_number == 0 && eta_number == 3) + occupancy = 146; + else if (category_number == 3 && eta_number == 1) + occupancy = 0; + else if (category_number == 3 && eta_number == 2) + occupancy = 191; + else if (category_number == 3 && eta_number == 3) + occupancy = 106; + else { + occupancy = 0; +#ifdef Warnings + printf("Unhandled case in createEligibleModulesListForQuintupletsGPU! Module index = %i\n", i); +#endif + } + + int nTotQ = alpaka::atomicOp(acc, &nTotalQuintupletsx, occupancy); + rangesInGPU.quintupletModuleIndices[i] = nTotQ; + rangesInGPU.indicesOfEligibleT5Modules[nEligibleT5Modules] = i; + rangesInGPU.quintupletModuleOccupancy[i] = occupancy; + } + + // Wait for all threads to finish before reporting final values + alpaka::syncBlockThreads(acc); + if (globalThreadIdx[2] == 0) { + *rangesInGPU.nEligibleT5Modules = static_cast(nEligibleT5Modulesx); + *rangesInGPU.device_nTotalQuints = static_cast(nTotalQuintupletsx); + } + } + }; + + struct addQuintupletRangesToEventExplicit { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (quintupletsInGPU.nQuintuplets[i] == 0 or rangesInGPU.quintupletModuleIndices[i] == -1) { + rangesInGPU.quintupletRanges[i * 2] = -1; + rangesInGPU.quintupletRanges[i * 2 + 1] = -1; + } else { + rangesInGPU.quintupletRanges[i * 2] = rangesInGPU.quintupletModuleIndices[i]; + rangesInGPU.quintupletRanges[i * 2 + 1] = + rangesInGPU.quintupletModuleIndices[i] + quintupletsInGPU.nQuintuplets[i] - 1; + } + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h 
b/RecoTracker/LSTCore/src/alpaka/Segment.h new file mode 100644 index 0000000000000..61e91eef18a47 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Segment.h @@ -0,0 +1,1120 @@ +#ifndef Segment_cuh +#define Segment_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "EndcapGeometry.h" +#include "MiniDoublet.h" +#include "Hit.h" + +namespace SDL { + struct segments { + FPX* dPhis; + FPX* dPhiMins; + FPX* dPhiMaxs; + FPX* dPhiChanges; + FPX* dPhiChangeMins; + FPX* dPhiChangeMaxs; + uint16_t* innerLowerModuleIndices; + uint16_t* outerLowerModuleIndices; + unsigned int* seedIdx; + unsigned int* mdIndices; + unsigned int* nMemoryLocations; + unsigned int* innerMiniDoubletAnchorHitIndices; + unsigned int* outerMiniDoubletAnchorHitIndices; + int* charge; + int* superbin; + unsigned int* nSegments; //number of segments per inner lower module + unsigned int* totOccupancySegments; //number of segments per inner lower module + uint4* pLSHitsIdxs; + int8_t* pixelType; + char* isQuad; + char* isDup; + bool* partOfPT5; + float* ptIn; + float* ptErr; + float* px; + float* py; + float* pz; + float* etaErr; + float* eta; + float* phi; + float* score; + float* circleCenterX; + float* circleCenterY; + float* circleRadius; + + template + void setData(TBuff& segmentsbuf) { + dPhis = alpaka::getPtrNative(segmentsbuf.dPhis_buf); + dPhiMins = alpaka::getPtrNative(segmentsbuf.dPhiMins_buf); + dPhiMaxs = alpaka::getPtrNative(segmentsbuf.dPhiMaxs_buf); + dPhiChanges = alpaka::getPtrNative(segmentsbuf.dPhiChanges_buf); + dPhiChangeMins = alpaka::getPtrNative(segmentsbuf.dPhiChangeMins_buf); + dPhiChangeMaxs = alpaka::getPtrNative(segmentsbuf.dPhiChangeMaxs_buf); + innerLowerModuleIndices = alpaka::getPtrNative(segmentsbuf.innerLowerModuleIndices_buf); + outerLowerModuleIndices = 
alpaka::getPtrNative(segmentsbuf.outerLowerModuleIndices_buf); + seedIdx = alpaka::getPtrNative(segmentsbuf.seedIdx_buf); + mdIndices = alpaka::getPtrNative(segmentsbuf.mdIndices_buf); + nMemoryLocations = alpaka::getPtrNative(segmentsbuf.nMemoryLocations_buf); + innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.innerMiniDoubletAnchorHitIndices_buf); + outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(segmentsbuf.outerMiniDoubletAnchorHitIndices_buf); + charge = alpaka::getPtrNative(segmentsbuf.charge_buf); + superbin = alpaka::getPtrNative(segmentsbuf.superbin_buf); + nSegments = alpaka::getPtrNative(segmentsbuf.nSegments_buf); + totOccupancySegments = alpaka::getPtrNative(segmentsbuf.totOccupancySegments_buf); + pLSHitsIdxs = alpaka::getPtrNative(segmentsbuf.pLSHitsIdxs_buf); + pixelType = alpaka::getPtrNative(segmentsbuf.pixelType_buf); + isQuad = alpaka::getPtrNative(segmentsbuf.isQuad_buf); + isDup = alpaka::getPtrNative(segmentsbuf.isDup_buf); + partOfPT5 = alpaka::getPtrNative(segmentsbuf.partOfPT5_buf); + ptIn = alpaka::getPtrNative(segmentsbuf.ptIn_buf); + ptErr = alpaka::getPtrNative(segmentsbuf.ptErr_buf); + px = alpaka::getPtrNative(segmentsbuf.px_buf); + py = alpaka::getPtrNative(segmentsbuf.py_buf); + pz = alpaka::getPtrNative(segmentsbuf.pz_buf); + etaErr = alpaka::getPtrNative(segmentsbuf.etaErr_buf); + eta = alpaka::getPtrNative(segmentsbuf.eta_buf); + phi = alpaka::getPtrNative(segmentsbuf.phi_buf); + score = alpaka::getPtrNative(segmentsbuf.score_buf); + circleCenterX = alpaka::getPtrNative(segmentsbuf.circleCenterX_buf); + circleCenterY = alpaka::getPtrNative(segmentsbuf.circleCenterY_buf); + circleRadius = alpaka::getPtrNative(segmentsbuf.circleRadius_buf); + } + }; + + template + struct segmentsBuffer : segments { + Buf dPhis_buf; + Buf dPhiMins_buf; + Buf dPhiMaxs_buf; + Buf dPhiChanges_buf; + Buf dPhiChangeMins_buf; + Buf dPhiChangeMaxs_buf; + Buf innerLowerModuleIndices_buf; + Buf outerLowerModuleIndices_buf; + Buf 
seedIdx_buf; + Buf mdIndices_buf; + Buf nMemoryLocations_buf; + Buf innerMiniDoubletAnchorHitIndices_buf; + Buf outerMiniDoubletAnchorHitIndices_buf; + Buf charge_buf; + Buf superbin_buf; + Buf nSegments_buf; + Buf totOccupancySegments_buf; + Buf pLSHitsIdxs_buf; + Buf pixelType_buf; + Buf isQuad_buf; + Buf isDup_buf; + Buf partOfPT5_buf; + Buf ptIn_buf; + Buf ptErr_buf; + Buf px_buf; + Buf py_buf; + Buf pz_buf; + Buf etaErr_buf; + Buf eta_buf; + Buf phi_buf; + Buf score_buf; + Buf circleCenterX_buf; + Buf circleCenterY_buf; + Buf circleRadius_buf; + + template + segmentsBuffer(unsigned int nMemoryLocationsIn, + uint16_t nLowerModules, + unsigned int maxPixelSegments, + TDevAcc const& devAccIn, + TQueue& queue) + : dPhis_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChanges_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChangeMins_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + dPhiChangeMaxs_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + innerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + outerLowerModuleIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + seedIdx_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + mdIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn * 2, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + innerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + outerMiniDoubletAnchorHitIndices_buf(allocBufWrapper(devAccIn, nMemoryLocationsIn, queue)), + charge_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + superbin_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + nSegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1, queue)), + totOccupancySegments_buf(allocBufWrapper(devAccIn, nLowerModules + 1, 
queue)), + pLSHitsIdxs_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + pixelType_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + isQuad_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + isDup_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + ptIn_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + ptErr_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + px_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + py_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + pz_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + etaErr_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + eta_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + phi_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + score_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleCenterX_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleCenterY_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)), + circleRadius_buf(allocBufWrapper(devAccIn, maxPixelSegments, queue)) { + alpaka::memset(queue, nSegments_buf, 0u); + alpaka::memset(queue, totOccupancySegments_buf, 0u); + alpaka::memset(queue, partOfPT5_buf, false); + alpaka::memset(queue, pLSHitsIdxs_buf, 0u); + alpaka::wait(queue); + } + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(struct SDL::modules& modulesInGPU, + unsigned int moduleIndex) { + // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing + // This is the same as what was previously considered as"isNormalTiltedModules" + // See Figure 9.1 of https://cds.cern.ch/record/2272264/files/CMS-TDR-014.pdf + short subdet = modulesInGPU.subdets[moduleIndex]; + short layer = modulesInGPU.layers[moduleIndex]; + short side = modulesInGPU.sides[moduleIndex]; + short rod = modulesInGPU.rods[moduleIndex]; + + return (subdet == Barrel) && (((side != Center) 
&& (layer == 3)) || + ((side == NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || + ((side == PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(short subdet, short layer, short side, short rod) { + // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing + // This is the same as what was previously considered as"isNormalTiltedModules" + // See Figure 9.1 of https://cds.cern.ch/record/2272264/files/CMS-TDR-014.pdf + return (subdet == Barrel) && (((side != Center) && (layer == 3)) || + ((side == NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || + ((side == PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize_seg(short layer, short ring, short subdet, short side, short rod) { + static constexpr float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; + static constexpr float miniDeltaFlat[6] = {0.26f, 0.16f, 0.16f, 0.18f, 0.18f, 0.18f}; + static constexpr float miniDeltaLooseTilted[3] = {0.4f, 0.4f, 0.4f}; + static constexpr float miniDeltaEndcap[5][15] = { + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}}; + + unsigned int iL = layer - 1; + unsigned int iR = ring - 1; + + float moduleSeparation = 0; + + if (subdet == Barrel and side == Center) { + moduleSeparation = miniDeltaFlat[iL]; + } else if (isTighterTiltedModules_seg(subdet, layer, side, 
rod)) { + moduleSeparation = miniDeltaTilted[iL]; + } else if (subdet == Endcap) { + moduleSeparation = miniDeltaEndcap[iL][iR]; + } else //Loose tilted modules + { + moduleSeparation = miniDeltaLooseTilted[iL]; + } + + return moduleSeparation; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize_seg(struct SDL::modules& modulesInGPU, unsigned int moduleIndex) { + static constexpr float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; + static constexpr float miniDeltaFlat[6] = {0.26f, 0.16f, 0.16f, 0.18f, 0.18f, 0.18f}; + static constexpr float miniDeltaLooseTilted[3] = {0.4f, 0.4f, 0.4f}; + static constexpr float miniDeltaEndcap[5][15] = { + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}, + {0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.4f, 0.18f, /*10*/ 0.18f, 0.18f, 0.18f, 0.18f, 0.18f}}; + + unsigned int iL = modulesInGPU.layers[moduleIndex] - 1; + unsigned int iR = modulesInGPU.rings[moduleIndex] - 1; + short subdet = modulesInGPU.subdets[moduleIndex]; + short side = modulesInGPU.sides[moduleIndex]; + + float moduleSeparation = 0; + + if (subdet == Barrel and side == Center) { + moduleSeparation = miniDeltaFlat[iL]; + } else if (isTighterTiltedModules_seg(modulesInGPU, moduleIndex)) { + moduleSeparation = miniDeltaTilted[iL]; + } else if (subdet == Endcap) { + moduleSeparation = miniDeltaEndcap[iL][iR]; + } else //Loose tilted modules + { + moduleSeparation = miniDeltaLooseTilted[iL]; + } + + return moduleSeparation; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void dAlphaThreshold(TAcc const& acc, + float* dAlphaThresholdValues, + struct SDL::modules& modulesInGPU, + struct 
SDL::miniDoublets& mdsInGPU, + float& xIn, + float& yIn, + float& zIn, + float& rtIn, + float& xOut, + float& yOut, + float& zOut, + float& rtOut, + uint16_t& innerLowerModuleIndex, + uint16_t& outerLowerModuleIndex, + unsigned int& innerMDIndex, + unsigned int& outerMDIndex) { + float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel) + ? miniMulsPtScaleBarrel[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut + : miniMulsPtScaleEndcap[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut; + + //more accurate then outer rt - inner rt + float segmentDr = alpaka::math::sqrt(acc, (yOut - yIn) * (yOut - yIn) + (xOut - xIn) * (xOut - xIn)); + + const float dAlpha_Bfield = + alpaka::math::asin(acc, alpaka::math::min(acc, segmentDr * k2Rinv1GeVf / ptCut, sinAlphaMax)); + + bool isInnerTilted = modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel and + modulesInGPU.sides[innerLowerModuleIndex] != SDL::Center; + bool isOuterTilted = modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Barrel and + modulesInGPU.sides[outerLowerModuleIndex] != SDL::Center; + + const float& drdzInner = modulesInGPU.drdzs[innerLowerModuleIndex]; + const float& drdzOuter = modulesInGPU.drdzs[outerLowerModuleIndex]; + float innerModuleGapSize = SDL::moduleGapSize_seg(modulesInGPU, innerLowerModuleIndex); + float outerModuleGapSize = SDL::moduleGapSize_seg(modulesInGPU, outerLowerModuleIndex); + const float innerminiTilt = isInnerTilted + ? (0.5f * pixelPSZpitch * drdzInner / + alpaka::math::sqrt(acc, 1.f + drdzInner * drdzInner) / innerModuleGapSize) + : 0; + + const float outerminiTilt = isOuterTilted + ? 
(0.5f * pixelPSZpitch * drdzOuter / + alpaka::math::sqrt(acc, 1.f + drdzOuter * drdzOuter) / outerModuleGapSize) + : 0; + + float miniDelta = innerModuleGapSize; + + float sdLumForInnerMini; + float sdLumForOuterMini; + + if (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel) { + sdLumForInnerMini = innerminiTilt * dAlpha_Bfield; + } else { + sdLumForInnerMini = mdsInGPU.dphis[innerMDIndex] * 15.0f / mdsInGPU.dzs[innerMDIndex]; + } + + if (modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Barrel) { + sdLumForOuterMini = outerminiTilt * dAlpha_Bfield; + } else { + sdLumForOuterMini = mdsInGPU.dphis[outerMDIndex] * 15.0f / mdsInGPU.dzs[outerMDIndex]; + } + + // Unique stuff for the segment dudes alone + float dAlpha_res_inner = + 0.02f / miniDelta * + (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel ? 1.0f : alpaka::math::abs(acc, zIn) / rtIn); + float dAlpha_res_outer = + 0.02f / miniDelta * + (modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Barrel ? 1.0f : alpaka::math::abs(acc, zOut) / rtOut); + + float dAlpha_res = dAlpha_res_inner + dAlpha_res_outer; + + if (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel and + modulesInGPU.sides[innerLowerModuleIndex] == SDL::Center) { + dAlphaThresholdValues[0] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); + } else { + dAlphaThresholdValues[0] = + dAlpha_Bfield + + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls + sdLumForInnerMini * sdLumForInnerMini); + } + + if (modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Barrel and + modulesInGPU.sides[outerLowerModuleIndex] == SDL::Center) { + dAlphaThresholdValues[1] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); + } else { + dAlphaThresholdValues[1] = + dAlpha_Bfield + + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls + sdLumForOuterMini * sdLumForOuterMini); + } + + //Inner to outer + dAlphaThresholdValues[2] = dAlpha_Bfield + 
alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(struct SDL::segments& segmentsInGPU, + unsigned int lowerMDIndex, + unsigned int upperMDIndex, + uint16_t innerLowerModuleIndex, + uint16_t outerLowerModuleIndex, + unsigned int innerMDAnchorHitIndex, + unsigned int outerMDAnchorHitIndex, + float& dPhi, + float& dPhiMin, + float& dPhiMax, + float& dPhiChange, + float& dPhiChangeMin, + float& dPhiChangeMax, + unsigned int idx) { + //idx will be computed in the kernel, which is the index into which the + //segment will be written + //nSegments will be incremented in the kernel + //printf("seg: %u %u %u %u\n",lowerMDIndex, upperMDIndex,innerLowerModuleIndex,outerLowerModuleIndex); + segmentsInGPU.mdIndices[idx * 2] = lowerMDIndex; + segmentsInGPU.mdIndices[idx * 2 + 1] = upperMDIndex; + segmentsInGPU.innerLowerModuleIndices[idx] = innerLowerModuleIndex; + segmentsInGPU.outerLowerModuleIndices[idx] = outerLowerModuleIndex; + segmentsInGPU.innerMiniDoubletAnchorHitIndices[idx] = innerMDAnchorHitIndex; + segmentsInGPU.outerMiniDoubletAnchorHitIndices[idx] = outerMDAnchorHitIndex; + + segmentsInGPU.dPhis[idx] = __F2H(dPhi); + segmentsInGPU.dPhiMins[idx] = __F2H(dPhiMin); + segmentsInGPU.dPhiMaxs[idx] = __F2H(dPhiMax); + segmentsInGPU.dPhiChanges[idx] = __F2H(dPhiChange); + segmentsInGPU.dPhiChangeMins[idx] = __F2H(dPhiChangeMin); + segmentsInGPU.dPhiChangeMaxs[idx] = __F2H(dPhiChangeMax); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const& acc, + struct SDL::segments& segmentsInGPU, + struct SDL::miniDoublets& mdsInGPU, + unsigned int innerMDIndex, + unsigned int outerMDIndex, + uint16_t pixelModuleIndex, + unsigned int hitIdxs[4], + unsigned int innerAnchorHitIndex, + unsigned int outerAnchorHitIndex, + float dPhiChange, + unsigned int idx, + unsigned int pixelSegmentArrayIndex, + float score) { + segmentsInGPU.mdIndices[idx * 2] = innerMDIndex; + 
segmentsInGPU.mdIndices[idx * 2 + 1] = outerMDIndex; + segmentsInGPU.innerLowerModuleIndices[idx] = pixelModuleIndex; + segmentsInGPU.outerLowerModuleIndices[idx] = pixelModuleIndex; + segmentsInGPU.innerMiniDoubletAnchorHitIndices[idx] = innerAnchorHitIndex; + segmentsInGPU.outerMiniDoubletAnchorHitIndices[idx] = outerAnchorHitIndex; + segmentsInGPU.dPhiChanges[idx] = __F2H(dPhiChange); + segmentsInGPU.isDup[pixelSegmentArrayIndex] = false; + segmentsInGPU.score[pixelSegmentArrayIndex] = score; + + segmentsInGPU.pLSHitsIdxs[pixelSegmentArrayIndex].x = hitIdxs[0]; + segmentsInGPU.pLSHitsIdxs[pixelSegmentArrayIndex].y = hitIdxs[1]; + segmentsInGPU.pLSHitsIdxs[pixelSegmentArrayIndex].z = hitIdxs[2]; + segmentsInGPU.pLSHitsIdxs[pixelSegmentArrayIndex].w = hitIdxs[3]; + + //computing circle parameters + /* + The two anchor hits are r3PCA and r3LH. p3PCA pt, eta, phi is hitIndex1 x, y, z + */ + float circleRadius = mdsInGPU.outerX[innerMDIndex] / (2 * k2Rinv1GeVf); + float circlePhi = mdsInGPU.outerZ[innerMDIndex]; + float candidateCenterXs[] = {mdsInGPU.anchorX[innerMDIndex] + circleRadius * alpaka::math::sin(acc, circlePhi), + mdsInGPU.anchorX[innerMDIndex] - circleRadius * alpaka::math::sin(acc, circlePhi)}; + float candidateCenterYs[] = {mdsInGPU.anchorY[innerMDIndex] - circleRadius * alpaka::math::cos(acc, circlePhi), + mdsInGPU.anchorY[innerMDIndex] + circleRadius * alpaka::math::cos(acc, circlePhi)}; + + //check which of the circles can accommodate r3LH better (we won't get perfect agreement) + float bestChiSquared = SDL::SDL_INF; + float chiSquared; + size_t bestIndex; + for (size_t i = 0; i < 2; i++) { + chiSquared = + alpaka::math::abs(acc, + alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[outerMDIndex] - candidateCenterXs[i]) * + (mdsInGPU.anchorX[outerMDIndex] - candidateCenterXs[i]) + + (mdsInGPU.anchorY[outerMDIndex] - candidateCenterYs[i]) * + (mdsInGPU.anchorY[outerMDIndex] - candidateCenterYs[i])) - + circleRadius); + if (chiSquared < bestChiSquared) { + 
bestChiSquared = chiSquared; + bestIndex = i; + } + } + segmentsInGPU.circleCenterX[pixelSegmentArrayIndex] = candidateCenterXs[bestIndex]; + segmentsInGPU.circleCenterY[pixelSegmentArrayIndex] = candidateCenterYs[bestIndex]; + segmentsInGPU.circleRadius[pixelSegmentArrayIndex] = circleRadius; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgoBarrel(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + uint16_t& innerLowerModuleIndex, + uint16_t& outerLowerModuleIndex, + unsigned int& innerMDIndex, + unsigned int& outerMDIndex, + float& zIn, + float& zOut, + float& rtIn, + float& rtOut, + float& dPhi, + float& dPhiMin, + float& dPhiMax, + float& dPhiChange, + float& dPhiChangeMin, + float& dPhiChangeMax, + float& dAlphaInnerMDSegment, + float& dAlphaOuterMDSegment, + float& dAlphaInnerMDOuterMD, + float& zLo, + float& zHi, + float& sdCut, + float& dAlphaInnerMDSegmentThreshold, + float& dAlphaOuterMDSegmentThreshold, + float& dAlphaInnerMDOuterMDThreshold) { + bool pass = true; + + float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel) + ? miniMulsPtScaleBarrel[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut + : miniMulsPtScaleEndcap[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut; + + float xIn, yIn, xOut, yOut; + + xIn = mdsInGPU.anchorX[innerMDIndex]; + yIn = mdsInGPU.anchorY[innerMDIndex]; + zIn = mdsInGPU.anchorZ[innerMDIndex]; + rtIn = mdsInGPU.anchorRt[innerMDIndex]; + + xOut = mdsInGPU.anchorX[outerMDIndex]; + yOut = mdsInGPU.anchorY[outerMDIndex]; + zOut = mdsInGPU.anchorZ[outerMDIndex]; + rtOut = mdsInGPU.anchorRt[outerMDIndex]; + + float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, sinAlphaMax)); + float sdPVoff = 0.1f / rtOut; + float dzDrtScale = alpaka::math::tan(acc, sdSlope) / sdSlope; //FIXME: need appropriate value + + const float zGeom = modulesInGPU.layers[innerLowerModuleIndex] <= 2 ? 
2.f * pixelPSZpitch : 2.f * strip2SZpitch; + + zLo = zIn + (zIn - deltaZLum) * (rtOut / rtIn - 1.f) * (zIn > 0.f ? 1.f : dzDrtScale) - + zGeom; //slope-correction only on outer end + zHi = zIn + (zIn + deltaZLum) * (rtOut / rtIn - 1.f) * (zIn < 0.f ? 1.f : dzDrtScale) + zGeom; + + pass = pass and ((zOut >= zLo) && (zOut <= zHi)); + if (not pass) + return pass; + + sdCut = sdSlope + alpaka::math::sqrt(acc, sdMuls * sdMuls + sdPVoff * sdPVoff); + + dPhi = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + + pass = pass and (alpaka::math::abs(acc, dPhi) <= sdCut); + if (not pass) + return pass; + + dPhiChange = SDL::phi_mpi_pi(acc, SDL::phi(acc, xOut - xIn, yOut - yIn) - mdsInGPU.anchorPhi[innerMDIndex]); + + pass = pass and (alpaka::math::abs(acc, dPhiChange) <= sdCut); + if (not pass) + return pass; + + float dAlphaThresholdValues[3]; + dAlphaThreshold(acc, + dAlphaThresholdValues, + modulesInGPU, + mdsInGPU, + xIn, + yIn, + zIn, + rtIn, + xOut, + yOut, + zOut, + rtOut, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex); + + float innerMDAlpha = mdsInGPU.dphichanges[innerMDIndex]; + float outerMDAlpha = mdsInGPU.dphichanges[outerMDIndex]; + dAlphaInnerMDSegment = innerMDAlpha - dPhiChange; + dAlphaOuterMDSegment = outerMDAlpha - dPhiChange; + dAlphaInnerMDOuterMD = innerMDAlpha - outerMDAlpha; + + dAlphaInnerMDSegmentThreshold = dAlphaThresholdValues[0]; + dAlphaOuterMDSegmentThreshold = dAlphaThresholdValues[1]; + dAlphaInnerMDOuterMDThreshold = dAlphaThresholdValues[2]; + + pass = pass and (alpaka::math::abs(acc, dAlphaInnerMDSegment) < dAlphaInnerMDSegmentThreshold); + if (not pass) + return pass; + pass = pass and (alpaka::math::abs(acc, dAlphaOuterMDSegment) < dAlphaOuterMDSegmentThreshold); + if (not pass) + return pass; + pass = pass and (alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaInnerMDOuterMDThreshold); + + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool 
runSegmentDefaultAlgoEndcap(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + uint16_t& innerLowerModuleIndex, + uint16_t& outerLowerModuleIndex, + unsigned int& innerMDIndex, + unsigned int& outerMDIndex, + float& zIn, + float& zOut, + float& rtIn, + float& rtOut, + float& dPhi, + float& dPhiMin, + float& dPhiMax, + float& dPhiChange, + float& dPhiChangeMin, + float& dPhiChangeMax, + float& dAlphaInnerMDSegment, + float& dAlphaOuterMDSegment, + float& rtLo, + float& rtHi, + float& sdCut, + float& dAlphaInnerMDSegmentThreshold, + float& dAlphaOuterMDSegmentThreshold, + float& dAlphaInnerMDOuterMDThreshold, + float& dAlphaInnerMDOuterMD) { + bool pass = true; + + float xIn, yIn; + float xOut, yOut; + + xIn = mdsInGPU.anchorX[innerMDIndex]; + yIn = mdsInGPU.anchorY[innerMDIndex]; + zIn = mdsInGPU.anchorZ[innerMDIndex]; + rtIn = mdsInGPU.anchorRt[innerMDIndex]; + + xOut = mdsInGPU.anchorX[outerMDIndex]; + yOut = mdsInGPU.anchorY[outerMDIndex]; + zOut = mdsInGPU.anchorZ[outerMDIndex]; + rtOut = mdsInGPU.anchorRt[outerMDIndex]; + + bool outerLayerEndcapTwoS = (modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Endcap) && + (modulesInGPU.moduleType[outerLowerModuleIndex] == SDL::TwoS); + + float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, sinAlphaMax)); + float disks2SMinRadius = 60.f; + + float rtGeom = ((rtIn < disks2SMinRadius && rtOut < disks2SMinRadius) + ? (2.f * pixelPSZpitch) + : ((rtIn < disks2SMinRadius || rtOut < disks2SMinRadius) ? 
(pixelPSZpitch + strip2SZpitch) + : (2.f * strip2SZpitch))); + + //cut 0 - z compatibility + pass = pass and (zIn * zOut >= 0); + if (not pass) + return pass; + + float dz = zOut - zIn; + // Alpaka: Needs to be moved over + float dLum = SDL::copysignf(deltaZLum, zIn); + float drtDzScale = sdSlope / alpaka::math::tan(acc, sdSlope); + + rtLo = alpaka::math::max( + acc, rtIn * (1.f + dz / (zIn + dLum) * drtDzScale) - rtGeom, rtIn - 0.5f * rtGeom); //rt should increase + rtHi = rtIn * (zOut - dLum) / (zIn - dLum) + + rtGeom; //dLum for luminous; rGeom for measurement size; no tanTheta_loc(pt) correction + + // Completeness + pass = pass and ((rtOut >= rtLo) && (rtOut <= rtHi)); + if (not pass) + return pass; + + dPhi = SDL::phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + + sdCut = sdSlope; + if (outerLayerEndcapTwoS) { + float dPhiPos_high = + SDL::phi_mpi_pi(acc, mdsInGPU.anchorHighEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + float dPhiPos_low = + SDL::phi_mpi_pi(acc, mdsInGPU.anchorLowEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + + dPhiMax = alpaka::math::abs(acc, dPhiPos_high) > alpaka::math::abs(acc, dPhiPos_low) ? dPhiPos_high : dPhiPos_low; + dPhiMin = alpaka::math::abs(acc, dPhiPos_high) > alpaka::math::abs(acc, dPhiPos_low) ? 
dPhiPos_low : dPhiPos_high; + } else { + dPhiMax = dPhi; + dPhiMin = dPhi; + } + pass = pass and (alpaka::math::abs(acc, dPhi) <= sdCut); + if (not pass) + return pass; + + float dzFrac = dz / zIn; + dPhiChange = dPhi / dzFrac * (1.f + dzFrac); + dPhiChangeMin = dPhiMin / dzFrac * (1.f + dzFrac); + dPhiChangeMax = dPhiMax / dzFrac * (1.f + dzFrac); + + pass = pass and (alpaka::math::abs(acc, dPhiChange) <= sdCut); + if (not pass) + return pass; + + float dAlphaThresholdValues[3]; + dAlphaThreshold(acc, + dAlphaThresholdValues, + modulesInGPU, + mdsInGPU, + xIn, + yIn, + zIn, + rtIn, + xOut, + yOut, + zOut, + rtOut, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex); + + dAlphaInnerMDSegmentThreshold = dAlphaThresholdValues[0]; + dAlphaOuterMDSegmentThreshold = dAlphaThresholdValues[1]; + dAlphaInnerMDOuterMDThreshold = dAlphaThresholdValues[2]; + + float innerMDAlpha = mdsInGPU.dphichanges[innerMDIndex]; + float outerMDAlpha = mdsInGPU.dphichanges[outerMDIndex]; + dAlphaInnerMDSegment = innerMDAlpha - dPhiChange; + dAlphaOuterMDSegment = outerMDAlpha - dPhiChange; + dAlphaInnerMDOuterMD = innerMDAlpha - outerMDAlpha; + + pass = pass and (alpaka::math::abs(acc, dAlphaInnerMDSegment) < dAlphaThresholdValues[0]); + if (not pass) + return pass; + pass = pass and (alpaka::math::abs(acc, dAlphaOuterMDSegment) < dAlphaThresholdValues[1]); + if (not pass) + return pass; + pass = pass and (alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaThresholdValues[2]); + + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgo(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + uint16_t& innerLowerModuleIndex, + uint16_t& outerLowerModuleIndex, + unsigned int& innerMDIndex, + unsigned int& outerMDIndex, + float& zIn, + float& zOut, + float& rtIn, + float& rtOut, + float& dPhi, + float& dPhiMin, + float& dPhiMax, + float& dPhiChange, + float& dPhiChangeMin, + float& 
dPhiChangeMax, + float& dAlphaInnerMDSegment, + float& dAlphaOuterMDSegment, + float& dAlphaInnerMDOuterMD, + float& zLo, + float& zHi, + float& rtLo, + float& rtHi, + float& sdCut, + float& dAlphaInnerMDSegmentThreshold, + float& dAlphaOuterMDSegmentThreshold, + float& dAlphaInnerMDOuterMDThreshold) { + zLo = -999.f; + zHi = -999.f; + rtLo = -999.f; + rtHi = -999.f; + + if (modulesInGPU.subdets[innerLowerModuleIndex] == SDL::Barrel and + modulesInGPU.subdets[outerLowerModuleIndex] == SDL::Barrel) { + return runSegmentDefaultAlgoBarrel(acc, + modulesInGPU, + mdsInGPU, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex, + zIn, + zOut, + rtIn, + rtOut, + dPhi, + dPhiMin, + dPhiMax, + dPhiChange, + dPhiChangeMin, + dPhiChangeMax, + dAlphaInnerMDSegment, + dAlphaOuterMDSegment, + dAlphaInnerMDOuterMD, + zLo, + zHi, + sdCut, + dAlphaInnerMDSegmentThreshold, + dAlphaOuterMDSegmentThreshold, + dAlphaInnerMDOuterMDThreshold); + } else { + return runSegmentDefaultAlgoEndcap(acc, + modulesInGPU, + mdsInGPU, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex, + zIn, + zOut, + rtIn, + rtOut, + dPhi, + dPhiMin, + dPhiMax, + dPhiChange, + dPhiChangeMin, + dPhiChangeMax, + dAlphaInnerMDSegment, + dAlphaOuterMDSegment, + dAlphaInnerMDOuterMD, + rtLo, + rtHi, + sdCut, + dAlphaInnerMDSegmentThreshold, + dAlphaOuterMDSegmentThreshold, + dAlphaInnerMDOuterMDThreshold); + } + }; + + struct createSegmentsInGPUv2 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalBlockIdx = alpaka::getIdx(acc); + auto const blockThreadIdx = alpaka::getIdx(acc); + auto const gridBlockExtent = alpaka::getWorkDiv(acc); + auto const blockThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t innerLowerModuleIndex = globalBlockIdx[2]; innerLowerModuleIndex < 
(*modulesInGPU.nLowerModules); + innerLowerModuleIndex += gridBlockExtent[2]) { + unsigned int nInnerMDs = mdsInGPU.nMDs[innerLowerModuleIndex]; + if (nInnerMDs == 0) + continue; + + unsigned int nConnectedModules = modulesInGPU.nConnectedModules[innerLowerModuleIndex]; + + for (uint16_t outerLowerModuleArrayIdx = blockThreadIdx[1]; outerLowerModuleArrayIdx < nConnectedModules; + outerLowerModuleArrayIdx += blockThreadExtent[1]) { + uint16_t outerLowerModuleIndex = + modulesInGPU.moduleMap[innerLowerModuleIndex * MAX_CONNECTED_MODULES + outerLowerModuleArrayIdx]; + + unsigned int nOuterMDs = mdsInGPU.nMDs[outerLowerModuleIndex]; + + unsigned int limit = nInnerMDs * nOuterMDs; + + if (limit == 0) + continue; + for (unsigned int hitIndex = blockThreadIdx[2]; hitIndex < limit; hitIndex += blockThreadExtent[2]) { + unsigned int innerMDArrayIdx = hitIndex / nOuterMDs; + unsigned int outerMDArrayIdx = hitIndex % nOuterMDs; + if (outerMDArrayIdx >= nOuterMDs) + continue; + + unsigned int innerMDIndex = rangesInGPU.mdRanges[innerLowerModuleIndex * 2] + innerMDArrayIdx; + unsigned int outerMDIndex = rangesInGPU.mdRanges[outerLowerModuleIndex * 2] + outerMDArrayIdx; + + float zIn, zOut, rtIn, rtOut, dPhi, dPhiMin, dPhiMax, dPhiChange, dPhiChangeMin, dPhiChangeMax, + dAlphaInnerMDSegment, dAlphaOuterMDSegment, dAlphaInnerMDOuterMD; + + unsigned int innerMiniDoubletAnchorHitIndex = mdsInGPU.anchorHitIndices[innerMDIndex]; + unsigned int outerMiniDoubletAnchorHitIndex = mdsInGPU.anchorHitIndices[outerMDIndex]; + dPhiMin = 0; + dPhiMax = 0; + dPhiChangeMin = 0; + dPhiChangeMax = 0; + float zLo, zHi, rtLo, rtHi, sdCut, dAlphaInnerMDSegmentThreshold, dAlphaOuterMDSegmentThreshold, + dAlphaInnerMDOuterMDThreshold; + bool pass = runSegmentDefaultAlgo(acc, + modulesInGPU, + mdsInGPU, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMDIndex, + outerMDIndex, + zIn, + zOut, + rtIn, + rtOut, + dPhi, + dPhiMin, + dPhiMax, + dPhiChange, + dPhiChangeMin, + dPhiChangeMax, + 
dAlphaInnerMDSegment, + dAlphaOuterMDSegment, + dAlphaInnerMDOuterMD, + zLo, + zHi, + rtLo, + rtHi, + sdCut, + dAlphaInnerMDSegmentThreshold, + dAlphaOuterMDSegmentThreshold, + dAlphaInnerMDOuterMDThreshold); + + if (pass) { + unsigned int totOccupancySegments = alpaka::atomicOp( + acc, &segmentsInGPU.totOccupancySegments[innerLowerModuleIndex], 1u); + if (static_cast(totOccupancySegments) >= rangesInGPU.segmentModuleOccupancy[innerLowerModuleIndex]) { +#ifdef Warnings + printf("Segment excess alert! Module index = %d\n", innerLowerModuleIndex); +#endif + } else { + unsigned int segmentModuleIdx = + alpaka::atomicOp(acc, &segmentsInGPU.nSegments[innerLowerModuleIndex], 1u); + unsigned int segmentIdx = rangesInGPU.segmentModuleIndices[innerLowerModuleIndex] + segmentModuleIdx; + + addSegmentToMemory(segmentsInGPU, + innerMDIndex, + outerMDIndex, + innerLowerModuleIndex, + outerLowerModuleIndex, + innerMiniDoubletAnchorHitIndex, + outerMiniDoubletAnchorHitIndex, + dPhi, + dPhiMin, + dPhiMax, + dPhiChange, + dPhiChangeMin, + dPhiChangeMax, + segmentIdx); + } + } + } + } + } + } + }; + + struct createSegmentArrayRanges { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::miniDoublets mdsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + // Initialize variables in shared memory and set to 0 + int& nTotalSegments = alpaka::declareSharedVar(acc); + nTotalSegments = 0; + alpaka::syncBlockThreads(acc); + + // Initialize variables outside of the for loop. 
+ int occupancy, category_number, eta_number; + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (modulesInGPU.nConnectedModules[i] == 0) { + rangesInGPU.segmentModuleIndices[i] = nTotalSegments; + rangesInGPU.segmentModuleOccupancy[i] = 0; + continue; + } + + short module_rings = modulesInGPU.rings[i]; + short module_layers = modulesInGPU.layers[i]; + short module_subdets = modulesInGPU.subdets[i]; + float module_eta = alpaka::math::abs(acc, modulesInGPU.eta[i]); + + if (module_layers <= 3 && module_subdets == 5) + category_number = 0; + else if (module_layers >= 4 && module_subdets == 5) + category_number = 1; + else if (module_layers <= 2 && module_subdets == 4 && module_rings >= 11) + category_number = 2; + else if (module_layers >= 3 && module_subdets == 4 && module_rings >= 8) + category_number = 2; + else if (module_layers <= 2 && module_subdets == 4 && module_rings <= 10) + category_number = 3; + else if (module_layers >= 3 && module_subdets == 4 && module_rings <= 7) + category_number = 3; + else + category_number = -1; + + if (module_eta < 0.75) + eta_number = 0; + else if (module_eta > 0.75 && module_eta < 1.5) + eta_number = 1; + else if (module_eta > 1.5 && module_eta < 2.25) + eta_number = 2; + else if (module_eta > 2.25 && module_eta < 3) + eta_number = 3; + else + eta_number = -1; + + if (category_number == 0 && eta_number == 0) + occupancy = 572; + else if (category_number == 0 && eta_number == 1) + occupancy = 300; + else if (category_number == 0 && eta_number == 2) + occupancy = 183; + else if (category_number == 0 && eta_number == 3) + occupancy = 62; + else if (category_number == 1 && eta_number == 0) + occupancy = 191; + else if (category_number == 1 && eta_number == 1) + occupancy = 128; + else if (category_number == 2 && eta_number == 1) + occupancy = 107; + else if (category_number == 2 && eta_number == 2) + occupancy = 102; + else if (category_number == 3 && eta_number == 1) + 
occupancy = 64; + else if (category_number == 3 && eta_number == 2) + occupancy = 79; + else if (category_number == 3 && eta_number == 3) + occupancy = 85; + else { + occupancy = 0; +#ifdef Warnings + printf("Unhandled case in createSegmentArrayRanges! Module index = %i\n", i); +#endif + } + + int nTotSegs = alpaka::atomicOp(acc, &nTotalSegments, occupancy); + rangesInGPU.segmentModuleIndices[i] = nTotSegs; + rangesInGPU.segmentModuleOccupancy[i] = occupancy; + } + + // Wait for all threads to finish before reporting final values + alpaka::syncBlockThreads(acc); + if (globalThreadIdx[2] == 0) { + rangesInGPU.segmentModuleIndices[*modulesInGPU.nLowerModules] = nTotalSegments; + *rangesInGPU.device_nTotalSegs = nTotalSegments; + } + } + }; + + struct addSegmentRangesToEventExplicit { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (segmentsInGPU.nSegments[i] == 0) { + rangesInGPU.segmentRanges[i * 2] = -1; + rangesInGPU.segmentRanges[i * 2 + 1] = -1; + } else { + rangesInGPU.segmentRanges[i * 2] = rangesInGPU.segmentModuleIndices[i]; + rangesInGPU.segmentRanges[i * 2 + 1] = rangesInGPU.segmentModuleIndices[i] + segmentsInGPU.nSegments[i] - 1; + } + } + } + }; + + struct addPixelSegmentToEventKernel { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::hits hitsInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + unsigned int* hitIndices0, + unsigned int* hitIndices1, + unsigned int* hitIndices2, + unsigned int* hitIndices3, + float* dPhiChange, + uint16_t pixelModuleIndex, + const int size) const 
{ + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (int tid = globalThreadIdx[2]; tid < size; tid += gridThreadExtent[2]) { + unsigned int innerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2 * (tid); + unsigned int outerMDIndex = rangesInGPU.miniDoubletModuleIndices[pixelModuleIndex] + 2 * (tid) + 1; + unsigned int pixelSegmentIndex = rangesInGPU.segmentModuleIndices[pixelModuleIndex] + tid; + + addMDToMemory(acc, + mdsInGPU, + hitsInGPU, + modulesInGPU, + hitIndices0[tid], + hitIndices1[tid], + pixelModuleIndex, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + innerMDIndex); + addMDToMemory(acc, + mdsInGPU, + hitsInGPU, + modulesInGPU, + hitIndices2[tid], + hitIndices3[tid], + pixelModuleIndex, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + outerMDIndex); + + //in outer hits - pt, eta, phi + float slope = alpaka::math::sinh(acc, hitsInGPU.ys[mdsInGPU.outerHitIndices[innerMDIndex]]); + float intercept = hitsInGPU.zs[mdsInGPU.anchorHitIndices[innerMDIndex]] - + slope * hitsInGPU.rts[mdsInGPU.anchorHitIndices[innerMDIndex]]; + float score_lsq = (hitsInGPU.rts[mdsInGPU.anchorHitIndices[outerMDIndex]] * slope + intercept) - + (hitsInGPU.zs[mdsInGPU.anchorHitIndices[outerMDIndex]]); + score_lsq = score_lsq * score_lsq; + + unsigned int hits1[4]; + hits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[innerMDIndex]]; + hits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[outerMDIndex]]; + hits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[innerMDIndex]]; + hits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[outerMDIndex]]; + addPixelSegmentToMemory(acc, + segmentsInGPU, + mdsInGPU, + innerMDIndex, + outerMDIndex, + pixelModuleIndex, + hits1, + hitIndices0[tid], + hitIndices2[tid], + dPhiChange[tid], + pixelSegmentIndex, + tid, + score_lsq); + } + } + }; +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.dev.cc 
b/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.dev.cc new file mode 100644 index 0000000000000..862155abeba65 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.dev.cc @@ -0,0 +1,49 @@ +#include "TiltedGeometry.h" + +SDL::TiltedGeometry::TiltedGeometry(std::string filename) { load(filename); } + +void SDL::TiltedGeometry::load(std::string filename) { + drdzs_.clear(); + dxdys_.clear(); + + std::ifstream ifile(filename, std::ios::binary); + if (!ifile.is_open()) { + throw std::runtime_error("Unable to open file: " + filename); + } + + while (!ifile.eof()) { + unsigned int detid; + float drdz, dxdy; + + // Read the detid, drdz, and dxdy from binary file + ifile.read(reinterpret_cast(&detid), sizeof(detid)); + ifile.read(reinterpret_cast(&drdz), sizeof(drdz)); + ifile.read(reinterpret_cast(&dxdy), sizeof(dxdy)); + + if (ifile) { + drdzs_[detid] = drdz; + dxdys_[detid] = dxdy; + } else { + // End of file or read failed + if (!ifile.eof()) { + throw std::runtime_error("Failed to read Tilted Geometry binary data."); + } + } + } +} + +float SDL::TiltedGeometry::getDrDz(unsigned int detid) const { + if (drdzs_.find(detid) != drdzs_.end()) { + return drdzs_.at(detid); + } else { + return 0; + } +} + +float SDL::TiltedGeometry::getDxDy(unsigned int detid) const { + if (dxdys_.find(detid) != dxdys_.end()) { + return dxdys_.at(detid); + } else { + return 0; + } +} diff --git a/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.h b/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.h new file mode 100644 index 0000000000000..51481762c5184 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/TiltedGeometry.h @@ -0,0 +1,40 @@ +#ifndef TiltedGeometry_h +#define TiltedGeometry_h + +#include +#include +#include +#include +#include +#include +#include + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#else +#include "Constants.h" +#endif + +namespace SDL { + template + class TiltedGeometry; + template <> + class TiltedGeometry { + 
private: + std::map drdzs_; // dr/dz slope + std::map dxdys_; // dx/dy slope + + public: + TiltedGeometry() = default; + TiltedGeometry(std::string filename); + ~TiltedGeometry() = default; + + void load(std::string); + + float getDrDz(unsigned int detid) const; + float getDxDy(unsigned int detid) const; + }; + +} // namespace SDL + +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h new file mode 100644 index 0000000000000..d65464944ab52 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -0,0 +1,583 @@ +#ifndef TrackCandidate_cuh +#define TrackCandidate_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "Triplet.h" +#include "Segment.h" +#include "MiniDoublet.h" +#include "PixelTriplet.h" +#include "Quintuplet.h" +#include "Hit.h" + +namespace SDL { + struct trackCandidates { + short* trackCandidateType; // 4-T5 5-pT3 7-pT5 8-pLS + unsigned int* directObjectIndices; // Will hold direct indices to each type containers + unsigned int* objectIndices; // Will hold tracklet and triplet indices - check the type!! 
+ unsigned int* nTrackCandidates; + unsigned int* nTrackCandidatespT3; + unsigned int* nTrackCandidatespT5; + unsigned int* nTrackCandidatespLS; + unsigned int* nTrackCandidatesT5; + + uint8_t* logicalLayers; + unsigned int* hitIndices; + int* pixelSeedIndex; + uint16_t* lowerModuleIndices; + + FPX* centerX; + FPX* centerY; + FPX* radius; + + template + void setData(TBuff& trackCandidatesbuf) { + trackCandidateType = alpaka::getPtrNative(trackCandidatesbuf.trackCandidateType_buf); + directObjectIndices = alpaka::getPtrNative(trackCandidatesbuf.directObjectIndices_buf); + objectIndices = alpaka::getPtrNative(trackCandidatesbuf.objectIndices_buf); + nTrackCandidates = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidates_buf); + nTrackCandidatespT3 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT3_buf); + nTrackCandidatespT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespT5_buf); + nTrackCandidatespLS = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatespLS_buf); + nTrackCandidatesT5 = alpaka::getPtrNative(trackCandidatesbuf.nTrackCandidatesT5_buf); + + logicalLayers = alpaka::getPtrNative(trackCandidatesbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(trackCandidatesbuf.hitIndices_buf); + pixelSeedIndex = alpaka::getPtrNative(trackCandidatesbuf.pixelSeedIndex_buf); + lowerModuleIndices = alpaka::getPtrNative(trackCandidatesbuf.lowerModuleIndices_buf); + + centerX = alpaka::getPtrNative(trackCandidatesbuf.centerX_buf); + centerY = alpaka::getPtrNative(trackCandidatesbuf.centerY_buf); + radius = alpaka::getPtrNative(trackCandidatesbuf.radius_buf); + } + }; + + template + struct trackCandidatesBuffer : trackCandidates { + Buf trackCandidateType_buf; + Buf directObjectIndices_buf; + Buf objectIndices_buf; + Buf nTrackCandidates_buf; + Buf nTrackCandidatespT3_buf; + Buf nTrackCandidatespT5_buf; + Buf nTrackCandidatespLS_buf; + Buf nTrackCandidatesT5_buf; + + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf pixelSeedIndex_buf; 
+ Buf lowerModuleIndices_buf; + + Buf centerX_buf; + Buf centerY_buf; + Buf radius_buf; + + template + trackCandidatesBuffer(unsigned int maxTrackCandidates, TDevAcc const& devAccIn, TQueue& queue) + : trackCandidateType_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + directObjectIndices_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + objectIndices_buf(allocBufWrapper(devAccIn, 2 * maxTrackCandidates, queue)), + nTrackCandidates_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespT3_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespT5_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatespLS_buf(allocBufWrapper(devAccIn, 1, queue)), + nTrackCandidatesT5_buf(allocBufWrapper(devAccIn, 1, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, 14 * maxTrackCandidates, queue)), + pixelSeedIndex_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 7 * maxTrackCandidates, queue)), + centerX_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + centerY_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)), + radius_buf(allocBufWrapper(devAccIn, maxTrackCandidates, queue)) { + alpaka::memset(queue, nTrackCandidates_buf, 0u); + alpaka::memset(queue, nTrackCandidatesT5_buf, 0u); + alpaka::memset(queue, nTrackCandidatespT3_buf, 0u); + alpaka::memset(queue, nTrackCandidatespT5_buf, 0u); + alpaka::memset(queue, nTrackCandidatespLS_buf, 0u); + alpaka::memset(queue, logicalLayers_buf, 0u); + alpaka::memset(queue, lowerModuleIndices_buf, 0u); + alpaka::memset(queue, hitIndices_buf, 0u); + alpaka::memset(queue, pixelSeedIndex_buf, 0); + alpaka::wait(queue); + } + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addpLSTrackCandidateToMemory(struct SDL::trackCandidates& trackCandidatesInGPU, + unsigned int trackletIndex, + unsigned int trackCandidateIndex, + uint4 hitIndices, + int pixelSeedIndex) 
{ + trackCandidatesInGPU.trackCandidateType[trackCandidateIndex] = 8; + trackCandidatesInGPU.directObjectIndices[trackCandidateIndex] = trackletIndex; + trackCandidatesInGPU.pixelSeedIndex[trackCandidateIndex] = pixelSeedIndex; + + trackCandidatesInGPU.objectIndices[2 * trackCandidateIndex] = trackletIndex; + trackCandidatesInGPU.objectIndices[2 * trackCandidateIndex + 1] = trackletIndex; + + trackCandidatesInGPU.hitIndices[14 * trackCandidateIndex + 0] = + hitIndices.x; // Order explanation in https://github.com/SegmentLinking/TrackLooper/issues/267 + trackCandidatesInGPU.hitIndices[14 * trackCandidateIndex + 1] = hitIndices.z; + trackCandidatesInGPU.hitIndices[14 * trackCandidateIndex + 2] = hitIndices.y; + trackCandidatesInGPU.hitIndices[14 * trackCandidateIndex + 3] = hitIndices.w; + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTrackCandidateToMemory(struct SDL::trackCandidates& trackCandidatesInGPU, + short trackCandidateType, + unsigned int innerTrackletIndex, + unsigned int outerTrackletIndex, + uint8_t* logicalLayerIndices, + uint16_t* lowerModuleIndices, + unsigned int* hitIndices, + int pixelSeedIndex, + float centerX, + float centerY, + float radius, + unsigned int trackCandidateIndex, + unsigned int directObjectIndex) { + trackCandidatesInGPU.trackCandidateType[trackCandidateIndex] = trackCandidateType; + trackCandidatesInGPU.directObjectIndices[trackCandidateIndex] = directObjectIndex; + trackCandidatesInGPU.pixelSeedIndex[trackCandidateIndex] = pixelSeedIndex; + + trackCandidatesInGPU.objectIndices[2 * trackCandidateIndex] = innerTrackletIndex; + trackCandidatesInGPU.objectIndices[2 * trackCandidateIndex + 1] = outerTrackletIndex; + + size_t limits = trackCandidateType == 7 ? 
7 : 5; + + //send the starting pointer to the logicalLayer and hitIndices + for (size_t i = 0; i < limits; i++) { + trackCandidatesInGPU.logicalLayers[7 * trackCandidateIndex + i] = logicalLayerIndices[i]; + trackCandidatesInGPU.lowerModuleIndices[7 * trackCandidateIndex + i] = lowerModuleIndices[i]; + } + for (size_t i = 0; i < 2 * limits; i++) { + trackCandidatesInGPU.hitIndices[14 * trackCandidateIndex + i] = hitIndices[i]; + } + trackCandidatesInGPU.centerX[trackCandidateIndex] = __F2H(centerX); + trackCandidatesInGPU.centerY[trackCandidateIndex] = __F2H(centerY); + trackCandidatesInGPU.radius[trackCandidateIndex] = __F2H(radius); + }; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, + unsigned int jx, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::hits& hitsInGPU) { + int phits1[4] = {-1, -1, -1, -1}; + int phits2[4] = {-1, -1, -1, -1}; + phits1[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[segmentsInGPU.mdIndices[2 * ix]]]; + phits1[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[segmentsInGPU.mdIndices[2 * ix + 1]]]; + phits1[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[segmentsInGPU.mdIndices[2 * ix]]]; + phits1[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[segmentsInGPU.mdIndices[2 * ix + 1]]]; + + phits2[0] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[segmentsInGPU.mdIndices[2 * jx]]]; + phits2[1] = hitsInGPU.idxs[mdsInGPU.anchorHitIndices[segmentsInGPU.mdIndices[2 * jx + 1]]]; + phits2[2] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[segmentsInGPU.mdIndices[2 * jx]]]; + phits2[3] = hitsInGPU.idxs[mdsInGPU.outerHitIndices[segmentsInGPU.mdIndices[2 * jx + 1]]]; + + int npMatched = 0; + + for (int i = 0; i < 4; i++) { + bool pmatched = false; + if (phits1[i] == -1) + continue; + + for (int j = 0; j < 4; j++) { + if (phits2[j] == -1) + continue; + + if (phits1[i] == phits2[j]) { + pmatched = true; + break; + } + } + if (pmatched) + npMatched++; + } + return npMatched; + }; + + struct crossCleanpT3 { + 
template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::pixelTriplets pixelTripletsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::pixelQuintuplets pixelQuintupletsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; + for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets; + pixelTripletIndex += gridThreadExtent[2]) { + if (pixelTripletsInGPU.isDup[pixelTripletIndex]) + continue; + + // Cross cleaning step + float eta1 = __H2F(pixelTripletsInGPU.eta_pix[pixelTripletIndex]); + float phi1 = __H2F(pixelTripletsInGPU.phi_pix[pixelTripletIndex]); + + int pixelModuleIndex = *modulesInGPU.nLowerModules; + unsigned int prefix = rangesInGPU.segmentModuleIndices[pixelModuleIndex]; + + unsigned int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets; + for (unsigned int pixelQuintupletIndex = globalThreadIdx[1]; pixelQuintupletIndex < nPixelQuintuplets; + pixelQuintupletIndex += gridThreadExtent[1]) { + unsigned int pLS_jx = pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex]; + float eta2 = segmentsInGPU.eta[pLS_jx - prefix]; + float phi2 = segmentsInGPU.phi[pLS_jx - prefix]; + float dEta = alpaka::math::abs(acc, (eta1 - eta2)); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if (dR2 < 1e-5f) + pixelTripletsInGPU.isDup[pixelTripletIndex] = true; + } + } + } + }; + + struct crossCleanT5 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::pixelQuintuplets pixelQuintupletsInGPU, + struct SDL::pixelTriplets pixelTripletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const 
gridThreadExtent = alpaka::getWorkDiv(acc); + + for (int innerInnerInnerLowerModuleArrayIndex = globalThreadIdx[0]; + innerInnerInnerLowerModuleArrayIndex < *(modulesInGPU.nLowerModules); + innerInnerInnerLowerModuleArrayIndex += gridThreadExtent[0]) { + if (rangesInGPU.quintupletModuleIndices[innerInnerInnerLowerModuleArrayIndex] == -1) + continue; + + unsigned int nQuints = quintupletsInGPU.nQuintuplets[innerInnerInnerLowerModuleArrayIndex]; + for (unsigned int innerObjectArrayIndex = globalThreadIdx[1]; innerObjectArrayIndex < nQuints; + innerObjectArrayIndex += gridThreadExtent[1]) { + unsigned int quintupletIndex = + rangesInGPU.quintupletModuleIndices[innerInnerInnerLowerModuleArrayIndex] + innerObjectArrayIndex; + + // Don't add duplicate T5s or T5s that are accounted in pT5s + if (quintupletsInGPU.isDup[quintupletIndex] or quintupletsInGPU.partOfPT5[quintupletIndex]) + continue; +#ifdef Crossclean_T5 + unsigned int loop_bound = *pixelQuintupletsInGPU.nPixelQuintuplets + *pixelTripletsInGPU.nPixelTriplets; + // Cross cleaning step + float eta1 = __H2F(quintupletsInGPU.eta[quintupletIndex]); + float phi1 = __H2F(quintupletsInGPU.phi[quintupletIndex]); + + for (unsigned int jx = globalThreadIdx[2]; jx < loop_bound; jx += gridThreadExtent[2]) { + float eta2, phi2; + if (jx < *pixelQuintupletsInGPU.nPixelQuintuplets) { + eta2 = __H2F(pixelQuintupletsInGPU.eta[jx]); + phi2 = __H2F(pixelQuintupletsInGPU.phi[jx]); + } else { + eta2 = __H2F(pixelTripletsInGPU.eta[jx - *pixelQuintupletsInGPU.nPixelQuintuplets]); + phi2 = __H2F(pixelTripletsInGPU.phi[jx - *pixelQuintupletsInGPU.nPixelQuintuplets]); + } + + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if (dR2 < 1e-3f) + quintupletsInGPU.isDup[quintupletIndex] = true; + } +#endif + } + } + } + }; + + // Using Matt's block for the outer loop and thread for inner loop trick here! 
+ // This will eliminate the need for another kernel just for adding the pLS, because we can __syncthreads() + struct crossCleanpLS { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::pixelTriplets pixelTripletsInGPU, + struct SDL::trackCandidates trackCandidatesInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::hits hitsInGPU, + struct SDL::quintuplets quintupletsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + int pixelModuleIndex = *modulesInGPU.nLowerModules; + unsigned int nPixels = segmentsInGPU.nSegments[pixelModuleIndex]; + for (unsigned int pixelArrayIndex = globalThreadIdx[2]; pixelArrayIndex < nPixels; + pixelArrayIndex += gridThreadExtent[2]) { + if (!segmentsInGPU.isQuad[pixelArrayIndex] || segmentsInGPU.isDup[pixelArrayIndex]) + continue; + + float eta1 = segmentsInGPU.eta[pixelArrayIndex]; + float phi1 = segmentsInGPU.phi[pixelArrayIndex]; + unsigned int prefix = rangesInGPU.segmentModuleIndices[pixelModuleIndex]; + + unsigned int nTrackCandidates = *(trackCandidatesInGPU.nTrackCandidates); + for (unsigned int trackCandidateIndex = globalThreadIdx[1]; trackCandidateIndex < nTrackCandidates; + trackCandidateIndex += gridThreadExtent[1]) { + short type = trackCandidatesInGPU.trackCandidateType[trackCandidateIndex]; + unsigned int innerTrackletIdx = trackCandidatesInGPU.objectIndices[2 * trackCandidateIndex]; + if (type == 4) // T5 + { + unsigned int quintupletIndex = innerTrackletIdx; // T5 index + float eta2 = __H2F(quintupletsInGPU.eta[quintupletIndex]); + float phi2 = __H2F(quintupletsInGPU.phi[quintupletIndex]); + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if (dR2 < 1e-3f) + segmentsInGPU.isDup[pixelArrayIndex] = true; + } + if 
(type == 5) // pT3 + { + int pLSIndex = pixelTripletsInGPU.pixelSegmentIndices[innerTrackletIdx]; + int npMatched = checkPixelHits(prefix + pixelArrayIndex, pLSIndex, mdsInGPU, segmentsInGPU, hitsInGPU); + if (npMatched > 0) + segmentsInGPU.isDup[pixelArrayIndex] = true; + + int pT3Index = innerTrackletIdx; + float eta2 = __H2F(pixelTripletsInGPU.eta_pix[pT3Index]); + float phi2 = __H2F(pixelTripletsInGPU.phi_pix[pT3Index]); + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if (dR2 < 0.000001f) + segmentsInGPU.isDup[pixelArrayIndex] = true; + } + if (type == 7) // pT5 + { + unsigned int pLSIndex = innerTrackletIdx; + int npMatched = checkPixelHits(prefix + pixelArrayIndex, pLSIndex, mdsInGPU, segmentsInGPU, hitsInGPU); + if (npMatched > 0) { + segmentsInGPU.isDup[pixelArrayIndex] = true; + } + + float eta2 = segmentsInGPU.eta[pLSIndex - prefix]; + float phi2 = segmentsInGPU.phi[pLSIndex - prefix]; + float dEta = alpaka::math::abs(acc, eta1 - eta2); + float dPhi = SDL::calculate_dPhi(phi1, phi2); + + float dR2 = dEta * dEta + dPhi * dPhi; + if (dR2 < 0.000001f) + segmentsInGPU.isDup[pixelArrayIndex] = true; + } + } + } + } + }; + + struct addpT3asTrackCandidatesInGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + uint16_t nLowerModules, + struct SDL::pixelTriplets pixelTripletsInGPU, + struct SDL::trackCandidates trackCandidatesInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; + unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; + for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets; + pixelTripletIndex += gridThreadExtent[2]) { + if 
((pixelTripletsInGPU.isDup[pixelTripletIndex])) + continue; + + unsigned int trackCandidateIdx = + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + if (trackCandidateIdx >= N_MAX_PIXEL_TRACK_CANDIDATES) // This is done before any non-pixel TCs are added + { +#ifdef Warnings + printf("Track Candidate excess alert! Type = pT3"); +#endif + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + break; + + } else { + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespT3, 1u); + + float radius = 0.5f * (__H2F(pixelTripletsInGPU.pixelRadius[pixelTripletIndex]) + + __H2F(pixelTripletsInGPU.tripletRadius[pixelTripletIndex])); + unsigned int pT3PixelIndex = pixelTripletsInGPU.pixelSegmentIndices[pixelTripletIndex]; + addTrackCandidateToMemory(trackCandidatesInGPU, + 5 /*track candidate type pT3=5*/, + pixelTripletIndex, + pixelTripletIndex, + &pixelTripletsInGPU.logicalLayers[5 * pixelTripletIndex], + &pixelTripletsInGPU.lowerModuleIndices[5 * pixelTripletIndex], + &pixelTripletsInGPU.hitIndices[10 * pixelTripletIndex], + segmentsInGPU.seedIdx[pT3PixelIndex - pLS_offset], + __H2F(pixelTripletsInGPU.centerX[pixelTripletIndex]), + __H2F(pixelTripletsInGPU.centerY[pixelTripletIndex]), + radius, + trackCandidateIdx, + pixelTripletIndex); + } + } + } + }; + + struct addT5asTrackCandidateInGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + uint16_t nLowerModules, + struct SDL::quintuplets quintupletsInGPU, + struct SDL::trackCandidates trackCandidatesInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (int idx = globalThreadIdx[1]; idx < nLowerModules; idx += gridThreadExtent[1]) { + if (rangesInGPU.quintupletModuleIndices[idx] == -1) + continue; + + unsigned int nQuints = quintupletsInGPU.nQuintuplets[idx]; + for (unsigned int jdx = globalThreadIdx[2]; jdx < nQuints; jdx += gridThreadExtent[2]) { + 
unsigned int quintupletIndex = rangesInGPU.quintupletModuleIndices[idx] + jdx; + if (quintupletsInGPU.isDup[quintupletIndex] or quintupletsInGPU.partOfPT5[quintupletIndex]) + continue; + if (!(quintupletsInGPU.TightCutFlag[quintupletIndex])) + continue; + + unsigned int trackCandidateIdx = + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + if (trackCandidateIdx - *trackCandidatesInGPU.nTrackCandidatespT5 - + *trackCandidatesInGPU.nTrackCandidatespT3 >= + N_MAX_NONPIXEL_TRACK_CANDIDATES) // pT5 and pT3 TCs have been added, but not pLS TCs + { +#ifdef Warnings + printf("Track Candidate excess alert! Type = T5"); +#endif + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + break; + } else { + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatesT5, 1u); + addTrackCandidateToMemory(trackCandidatesInGPU, + 4 /*track candidate type T5=4*/, + quintupletIndex, + quintupletIndex, + &quintupletsInGPU.logicalLayers[5 * quintupletIndex], + &quintupletsInGPU.lowerModuleIndices[5 * quintupletIndex], + &quintupletsInGPU.hitIndices[10 * quintupletIndex], + -1 /*no pixel seed index for T5s*/, + quintupletsInGPU.regressionG[quintupletIndex], + quintupletsInGPU.regressionF[quintupletIndex], + quintupletsInGPU.regressionRadius[quintupletIndex], + trackCandidateIdx, + quintupletIndex); + } + } + } + } + }; + + struct addpLSasTrackCandidateInGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + uint16_t nLowerModules, + struct SDL::trackCandidates trackCandidatesInGPU, + struct SDL::segments segmentsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + unsigned int nPixels = segmentsInGPU.nSegments[nLowerModules]; + for (unsigned int pixelArrayIndex = globalThreadIdx[2]; pixelArrayIndex < nPixels; + pixelArrayIndex += gridThreadExtent[2]) { +#ifdef TC_PLS_TRIPLETS + if (segmentsInGPU.isDup[pixelArrayIndex]) +#else + if 
((!segmentsInGPU.isQuad[pixelArrayIndex]) || (segmentsInGPU.isDup[pixelArrayIndex])) +#endif + continue; + + unsigned int trackCandidateIdx = + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + if (trackCandidateIdx - *trackCandidatesInGPU.nTrackCandidatesT5 >= + N_MAX_PIXEL_TRACK_CANDIDATES) // T5 TCs have already been added + { +#ifdef Warnings + printf("Track Candidate excess alert! Type = pLS"); +#endif + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + break; + + } else { + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespLS, 1u); + addpLSTrackCandidateToMemory(trackCandidatesInGPU, + pixelArrayIndex, + trackCandidateIdx, + segmentsInGPU.pLSHitsIdxs[pixelArrayIndex], + segmentsInGPU.seedIdx[pixelArrayIndex]); + } + } + } + }; + + struct addpT5asTrackCandidateInGPU { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + uint16_t nLowerModules, + struct SDL::pixelQuintuplets pixelQuintupletsInGPU, + struct SDL::trackCandidates trackCandidatesInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets; + unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; + for (int pixelQuintupletIndex = globalThreadIdx[2]; pixelQuintupletIndex < nPixelQuintuplets; + pixelQuintupletIndex += gridThreadExtent[2]) { + if (pixelQuintupletsInGPU.isDup[pixelQuintupletIndex]) + continue; + + unsigned int trackCandidateIdx = + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + if (trackCandidateIdx >= N_MAX_PIXEL_TRACK_CANDIDATES) // No other TCs have been added yet + { +#ifdef Warnings + printf("Track Candidate excess alert! 
Type = pT5"); +#endif + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + break; + + } else { + alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespT5, 1u); + + float radius = 0.5f * (__H2F(pixelQuintupletsInGPU.pixelRadius[pixelQuintupletIndex]) + + __H2F(pixelQuintupletsInGPU.quintupletRadius[pixelQuintupletIndex])); + unsigned int pT5PixelIndex = pixelQuintupletsInGPU.pixelIndices[pixelQuintupletIndex]; + addTrackCandidateToMemory(trackCandidatesInGPU, + 7 /*track candidate type pT5=7*/, + pT5PixelIndex, + pixelQuintupletsInGPU.T5Indices[pixelQuintupletIndex], + &pixelQuintupletsInGPU.logicalLayers[7 * pixelQuintupletIndex], + &pixelQuintupletsInGPU.lowerModuleIndices[7 * pixelQuintupletIndex], + &pixelQuintupletsInGPU.hitIndices[14 * pixelQuintupletIndex], + segmentsInGPU.seedIdx[pT5PixelIndex - pLS_offset], + __H2F(pixelQuintupletsInGPU.centerX[pixelQuintupletIndex]), + __H2F(pixelQuintupletsInGPU.centerY[pixelQuintupletIndex]), + radius, + trackCandidateIdx, + pixelQuintupletIndex); + } + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h b/RecoTracker/LSTCore/src/alpaka/Triplet.h new file mode 100644 index 0000000000000..1d21815ef2717 --- /dev/null +++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h @@ -0,0 +1,1123 @@ +#ifndef Triplet_cuh +#define Triplet_cuh + +#ifdef LST_IS_CMSSW_PACKAGE +#include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/Module.h" +#else +#include "Constants.h" +#include "Module.h" +#endif + +#include "Segment.h" +#include "MiniDoublet.h" +#include "Hit.h" + +namespace SDL { + struct triplets { + unsigned int* segmentIndices; + uint16_t* lowerModuleIndices; //3 of them now + unsigned int* nTriplets; + unsigned int* totOccupancyTriplets; + unsigned int* nMemoryLocations; + uint8_t* logicalLayers; + unsigned int* hitIndices; + FPX* betaIn; + float* circleRadius; + float* circleCenterX; + float* circleCenterY; + bool* 
partOfPT5; + bool* partOfT5; + bool* partOfPT3; + +#ifdef CUT_VALUE_DEBUG + //debug variables + float* zOut; + float* rtOut; + float* deltaPhiPos; + float* deltaPhi; + float* zLo; + float* zHi; + float* zLoPointed; + float* zHiPointed; + float* sdlCut; + float* betaInCut; + float* rtLo; + float* rtHi; +#endif + template + void setData(TBuff& tripletsbuf) { + segmentIndices = alpaka::getPtrNative(tripletsbuf.segmentIndices_buf); + lowerModuleIndices = alpaka::getPtrNative(tripletsbuf.lowerModuleIndices_buf); + nTriplets = alpaka::getPtrNative(tripletsbuf.nTriplets_buf); + totOccupancyTriplets = alpaka::getPtrNative(tripletsbuf.totOccupancyTriplets_buf); + nMemoryLocations = alpaka::getPtrNative(tripletsbuf.nMemoryLocations_buf); + logicalLayers = alpaka::getPtrNative(tripletsbuf.logicalLayers_buf); + hitIndices = alpaka::getPtrNative(tripletsbuf.hitIndices_buf); + betaIn = alpaka::getPtrNative(tripletsbuf.betaIn_buf); + circleRadius = alpaka::getPtrNative(tripletsbuf.circleRadius_buf); + circleCenterX = alpaka::getPtrNative(tripletsbuf.circleCenterX_buf); + circleCenterY = alpaka::getPtrNative(tripletsbuf.circleCenterY_buf); + partOfPT5 = alpaka::getPtrNative(tripletsbuf.partOfPT5_buf); + partOfT5 = alpaka::getPtrNative(tripletsbuf.partOfT5_buf); + partOfPT3 = alpaka::getPtrNative(tripletsbuf.partOfPT3_buf); +#ifdef CUT_VALUE_DEBUG + zOut = alpaka::getPtrNative(tripletsbuf.zOut_buf); + rtOut = alpaka::getPtrNative(tripletsbuf.rtOut_buf); + deltaPhiPos = alpaka::getPtrNative(tripletsbuf.deltaPhiPos_buf); + deltaPhi = alpaka::getPtrNative(tripletsbuf.deltaPhi_buf); + zLo = alpaka::getPtrNative(tripletsbuf.zLo_buf); + zHi = alpaka::getPtrNative(tripletsbuf.zHi_buf); + zLoPointed = alpaka::getPtrNative(tripletsbuf.zLoPointed_buf); + zHiPointed = alpaka::getPtrNative(tripletsbuf.zHiPointed_buf); + sdlCut = alpaka::getPtrNative(tripletsbuf.sdlCut_buf); + betaInCut = alpaka::getPtrNative(tripletsbuf.betaInCut_buf); + rtLo = alpaka::getPtrNative(tripletsbuf.rtLo_buf); + 
rtHi = alpaka::getPtrNative(tripletsbuf.rtHi_buf); +#endif + } + }; + + template + struct tripletsBuffer : triplets { + Buf segmentIndices_buf; + Buf lowerModuleIndices_buf; + Buf nTriplets_buf; + Buf totOccupancyTriplets_buf; + Buf nMemoryLocations_buf; + Buf logicalLayers_buf; + Buf hitIndices_buf; + Buf betaIn_buf; + Buf circleRadius_buf; + Buf circleCenterX_buf; + Buf circleCenterY_buf; + Buf partOfPT5_buf; + Buf partOfT5_buf; + Buf partOfPT3_buf; + +#ifdef CUT_VALUE_DEBUG + Buf zOut_buf; + Buf rtOut_buf; + Buf deltaPhiPos_buf; + Buf deltaPhi_buf; + Buf zLo_buf; + Buf zHi_buf; + Buf zLoPointed_buf; + Buf zHiPointed_buf; + Buf sdlCut_buf; + Buf betaInCut_buf; + Buf rtLo_buf; + Buf rtHi_buf; +#endif + + template + tripletsBuffer(unsigned int maxTriplets, unsigned int nLowerModules, TDevAcc const& devAccIn, TQueue& queue) + : segmentIndices_buf(allocBufWrapper(devAccIn, 2 * maxTriplets, queue)), + lowerModuleIndices_buf(allocBufWrapper(devAccIn, 3 * maxTriplets, queue)), + nTriplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + totOccupancyTriplets_buf(allocBufWrapper(devAccIn, nLowerModules, queue)), + nMemoryLocations_buf(allocBufWrapper(devAccIn, 1, queue)), + logicalLayers_buf(allocBufWrapper(devAccIn, maxTriplets * 3, queue)), + hitIndices_buf(allocBufWrapper(devAccIn, maxTriplets * 6, queue)), + betaIn_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + circleRadius_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + circleCenterX_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + circleCenterY_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfPT5_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfT5_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + partOfPT3_buf(allocBufWrapper(devAccIn, maxTriplets, queue)) +#ifdef CUT_VALUE_DEBUG + , + zOut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtOut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + deltaPhiPos_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), 
+ deltaPhi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zLo_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zHi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zLoPointed_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + zHiPointed_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + sdlCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + betaInCut_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtLo_buf(allocBufWrapper(devAccIn, maxTriplets, queue)), + rtHi_buf(allocBufWrapper(devAccIn, maxTriplets, queue)) +#endif + { + alpaka::memset(queue, nTriplets_buf, 0u); + alpaka::memset(queue, totOccupancyTriplets_buf, 0u); + alpaka::memset(queue, partOfPT5_buf, false); + alpaka::memset(queue, partOfT5_buf, false); + alpaka::memset(queue, partOfPT3_buf, false); + } + }; + +#ifdef CUT_VALUE_DEBUG + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::triplets& tripletsInGPU, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + float& zOut, + float& rtOut, + float& deltaPhiPos, + float& deltaPhi, + float& betaIn, + float& circleRadius, + float& circleCenterX, + float& circleCenterY, + float& zLo, + float& zHi, + float& rtLo, + float& rtHi, + float& zLoPointed, + float& zHiPointed, + float& sdlCut, + float& betaInCut, + unsigned int& tripletIndex) +#else + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + struct SDL::triplets& tripletsInGPU, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + float& betaIn, + float& circleRadius, 
+ float& circleCenterX, + float& circleCenterY, + unsigned int& tripletIndex) +#endif + { + tripletsInGPU.segmentIndices[tripletIndex * 2] = innerSegmentIndex; + tripletsInGPU.segmentIndices[tripletIndex * 2 + 1] = outerSegmentIndex; + tripletsInGPU.lowerModuleIndices[tripletIndex * 3] = innerInnerLowerModuleIndex; + tripletsInGPU.lowerModuleIndices[tripletIndex * 3 + 1] = middleLowerModuleIndex; + tripletsInGPU.lowerModuleIndices[tripletIndex * 3 + 2] = outerOuterLowerModuleIndex; + + tripletsInGPU.betaIn[tripletIndex] = __F2H(betaIn); + tripletsInGPU.circleRadius[tripletIndex] = circleRadius; + tripletsInGPU.circleCenterX[tripletIndex] = circleCenterX; + tripletsInGPU.circleCenterY[tripletIndex] = circleCenterY; + tripletsInGPU.logicalLayers[tripletIndex * 3] = + modulesInGPU.layers[innerInnerLowerModuleIndex] + (modulesInGPU.subdets[innerInnerLowerModuleIndex] == 4) * 6; + tripletsInGPU.logicalLayers[tripletIndex * 3 + 1] = + modulesInGPU.layers[middleLowerModuleIndex] + (modulesInGPU.subdets[middleLowerModuleIndex] == 4) * 6; + tripletsInGPU.logicalLayers[tripletIndex * 3 + 2] = + modulesInGPU.layers[outerOuterLowerModuleIndex] + (modulesInGPU.subdets[outerOuterLowerModuleIndex] == 4) * 6; + //get the hits + unsigned int firstMDIndex = segmentsInGPU.mdIndices[2 * innerSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * innerSegmentIndex + 1]; + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * outerSegmentIndex + 1]; + + tripletsInGPU.hitIndices[tripletIndex * 6] = mdsInGPU.anchorHitIndices[firstMDIndex]; + tripletsInGPU.hitIndices[tripletIndex * 6 + 1] = mdsInGPU.outerHitIndices[firstMDIndex]; + tripletsInGPU.hitIndices[tripletIndex * 6 + 2] = mdsInGPU.anchorHitIndices[secondMDIndex]; + tripletsInGPU.hitIndices[tripletIndex * 6 + 3] = mdsInGPU.outerHitIndices[secondMDIndex]; + tripletsInGPU.hitIndices[tripletIndex * 6 + 4] = mdsInGPU.anchorHitIndices[thirdMDIndex]; + tripletsInGPU.hitIndices[tripletIndex * 6 + 5] = 
mdsInGPU.outerHitIndices[thirdMDIndex]; +#ifdef CUT_VALUE_DEBUG + tripletsInGPU.zOut[tripletIndex] = zOut; + tripletsInGPU.rtOut[tripletIndex] = rtOut; + tripletsInGPU.deltaPhiPos[tripletIndex] = deltaPhiPos; + tripletsInGPU.deltaPhi[tripletIndex] = deltaPhi; + tripletsInGPU.zLo[tripletIndex] = zLo; + tripletsInGPU.zHi[tripletIndex] = zHi; + tripletsInGPU.rtLo[tripletIndex] = rtLo; + tripletsInGPU.rtHi[tripletIndex] = rtHi; + tripletsInGPU.zLoPointed[tripletIndex] = zLoPointed; + tripletsInGPU.zHiPointed[tripletIndex] = zHiPointed; + tripletsInGPU.sdlCut[tripletIndex] = sdlCut; + tripletsInGPU.betaInCut[tripletIndex] = betaInCut; +#endif + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex) { + //get the rt and z + const float& r1 = mdsInGPU.anchorRt[firstMDIndex]; + const float& r2 = mdsInGPU.anchorRt[secondMDIndex]; + const float& r3 = mdsInGPU.anchorRt[thirdMDIndex]; + + const float& z1 = mdsInGPU.anchorZ[firstMDIndex]; + const float& z2 = mdsInGPU.anchorZ[secondMDIndex]; + const float& z3 = mdsInGPU.anchorZ[thirdMDIndex]; + + //following Philip's layer number prescription + const int layer1 = modulesInGPU.sdlLayers[innerInnerLowerModuleIndex]; + const int layer2 = modulesInGPU.sdlLayers[middleLowerModuleIndex]; + const int layer3 = modulesInGPU.sdlLayers[outerOuterLowerModuleIndex]; + + const float residual = z2 - ((z3 - z1) / (r3 - r1) * (r2 - r1) + z1); + + if (layer1 == 12 and layer2 == 13 and layer3 == 14) { + return false; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 3) { + return alpaka::math::abs(acc, residual) < 0.53f; + } else if (layer1 == 1 and layer2 == 2 and layer3 == 7) { + return 
alpaka::math::abs(acc, residual) < 1; + } else if (layer1 == 13 and layer2 == 14 and layer3 == 15) { + return false; + } else if (layer1 == 14 and layer2 == 15 and layer3 == 16) { + return false; + } else if (layer1 == 1 and layer2 == 7 and layer3 == 8) { + return alpaka::math::abs(acc, residual) < 1; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 4) { + return alpaka::math::abs(acc, residual) < 1.21f; + } else if (layer1 == 2 and layer2 == 3 and layer3 == 7) { + return alpaka::math::abs(acc, residual) < 1.f; + } else if (layer1 == 2 and layer2 == 7 and layer3 == 8) { + return alpaka::math::abs(acc, residual) < 1.f; + } else if (layer1 == 3 and layer2 == 4 and layer3 == 5) { + return alpaka::math::abs(acc, residual) < 2.7f; + } else if (layer1 == 4 and layer2 == 5 and layer3 == 6) { + return alpaka::math::abs(acc, residual) < 3.06f; + } else if (layer1 == 7 and layer2 == 8 and layer3 == 9) { + return alpaka::math::abs(acc, residual) < 1; + } else if (layer1 == 8 and layer2 == 9 and layer3 == 10) { + return alpaka::math::abs(acc, residual) < 1; + } else if (layer1 == 9 and layer2 == 10 and layer3 == 11) { + return alpaka::math::abs(acc, residual) < 1; + } else { + return alpaka::math::abs(acc, residual) < 5; + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + float& zOut, + float& rtOut, + unsigned int& innerSegmentIndex, + float& betaIn, + float& betaInCut) { + bool pass = true; + bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); + bool isPSOut = (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::PS); + + float rtIn = mdsInGPU.anchorRt[firstMDIndex]; 
+ float rtMid = mdsInGPU.anchorRt[secondMDIndex]; + rtOut = mdsInGPU.anchorRt[thirdMDIndex]; + + float zIn = mdsInGPU.anchorZ[firstMDIndex]; + float zMid = mdsInGPU.anchorZ[secondMDIndex]; + zOut = mdsInGPU.anchorZ[thirdMDIndex]; + + float alpha1GeVOut = + alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)); + + float rtRatio_OutIn = rtOut / rtIn; // Outer segment beginning rt divided by inner segment beginning rt; + float dzDrtScale = alpaka::math::tan(acc, alpha1GeVOut) / alpha1GeVOut; // The track can bend in r-z plane slightly + float zpitchIn = (isPSIn ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + float zpitchOut = (isPSOut ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + + const float zHi = + zIn + (zIn + SDL::deltaZLum) * (rtRatio_OutIn - 1.f) * (zIn < 0.f ? 1.f : dzDrtScale) + (zpitchIn + zpitchOut); + const float zLo = zIn + (zIn - SDL::deltaZLum) * (rtRatio_OutIn - 1.f) * (zIn > 0.f ? 1.f : dzDrtScale) - + (zpitchIn + zpitchOut); //slope-correction only on outer end + + //Cut 1 - z compatibility + pass = pass and ((zOut >= zLo) && (zOut <= zHi)); + if (not pass) + return pass; + + float drt_OutIn = (rtOut - rtIn); + + float r3In = alpaka::math::sqrt(acc, zIn * zIn + rtIn * rtIn); + float drt_InSeg = rtMid - rtIn; + float dz_InSeg = zMid - zIn; + float dr3_InSeg = + alpaka::math::sqrt(acc, rtMid * rtMid + zMid * zMid) - alpaka::math::sqrt(acc, rtIn * rtIn + zIn + zIn); + + float coshEta = dr3_InSeg / drt_InSeg; + float dzErr = (zpitchIn + zpitchOut) * (zpitchIn + zpitchOut) * 2.f; + + float sdlThetaMulsF = + 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rtOut - rtIn) / 50.f) * alpaka::math::sqrt(acc, r3In / rtIn); + float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f; // will need a better guess than x4? 
+ dzErr += sdlMuls * sdlMuls * drt_OutIn * drt_OutIn / 3.f * coshEta * coshEta; //sloppy + dzErr = alpaka::math::sqrt(acc, dzErr); + + // Constructing upper and lower bound + const float dzMean = dz_InSeg / drt_InSeg * drt_OutIn; + const float zWindow = + dzErr / drt_InSeg * drt_OutIn + + (zpitchIn + zpitchOut); //FIXME for SDL::ptCut lower than ~0.8 need to add curv path correction + const float zLoPointed = zIn + dzMean * (zIn > 0.f ? 1.f : dzDrtScale) - zWindow; + const float zHiPointed = zIn + dzMean * (zIn < 0.f ? 1.f : dzDrtScale) + zWindow; + + // Constructing upper and lower bound + + // Cut #2: Pointed Z (Inner segment two MD points to outer segment inner MD) + pass = pass and ((zOut >= zLoPointed) && (zOut <= zHiPointed)); + + // raw betaIn value without any correction, based on the mini-doublet hit positions + float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; + float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; + betaIn = alpha_InLo - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + + //beta computation + float drt_tl_axis = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + + //innerOuterAnchor - innerInnerAnchor + const float rt_InSeg = + alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) * + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) + + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) * + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex])); + betaInCut = + alpaka::math::asin( + acc, alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) + + (0.02f / drt_InSeg); + + //Cut #3: first beta cut + pass = pass and (alpaka::math::abs(acc, betaIn) < betaInCut); + + return pass; + }; + + template + ALPAKA_FN_ACC 
ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + float& zOut, + float& rtOut, + uint16_t& innerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + float& betaIn, + float& betaInCut) { + bool pass = true; + + bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS); + bool isPSOut = (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::PS); + + float rtIn = mdsInGPU.anchorRt[firstMDIndex]; + float rtMid = mdsInGPU.anchorRt[secondMDIndex]; + rtOut = mdsInGPU.anchorRt[thirdMDIndex]; + + float zIn = mdsInGPU.anchorZ[firstMDIndex]; + float zMid = mdsInGPU.anchorZ[secondMDIndex]; + zOut = mdsInGPU.anchorZ[thirdMDIndex]; + + float alpha1GeV_OutLo = + alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)); + + float dzDrtScale = + alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly + float zpitchIn = (isPSIn ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + float zpitchOut = (isPSOut ? SDL::pixelPSZpitch : SDL::strip2SZpitch); + float zGeom = zpitchIn + zpitchOut; + + // Cut #0: Preliminary (Only here in endcap case) + pass = pass and (zIn * zOut > 0); + if (not pass) + return pass; + + float dLum = SDL::copysignf(SDL::deltaZLum, zIn); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::PS; + float rtGeom1 = isOutSgInnerMDPS ? 
SDL::pixelPSZpitch : SDL::strip2SZpitch; + float zGeom1 = SDL::copysignf(zGeom, zIn); + float rtLo = rtIn * (1.f + (zOut - zIn - zGeom1) / (zIn + zGeom1 + dLum) / dzDrtScale) - + rtGeom1; //slope correction only on the lower end + + //Cut #1: rt condition + float zInForHi = zIn - zGeom1 - dLum; + if (zInForHi * zIn < 0) { + zInForHi = SDL::copysignf(0.1f, zIn); + } + float rtHi = rtIn * (1.f + (zOut - zIn + zGeom1) / zInForHi) + rtGeom1; + + //Cut #2: rt condition + pass = pass and ((rtOut >= rtLo) && (rtOut <= rtHi)); + if (not pass) + return pass; + + float rIn = alpaka::math::sqrt(acc, zIn * zIn + rtIn * rtIn); + + const float drtSDIn = rtMid - rtIn; + const float dzSDIn = zMid - zIn; + const float dr3SDIn = + alpaka::math::sqrt(acc, rtMid * rtMid + zMid * zMid) - alpaka::math::sqrt(acc, rtIn * rtIn + zIn * zIn); + + const float coshEta = dr3SDIn / drtSDIn; //direction estimate + const float dzOutInAbs = alpaka::math::abs(acc, zOut - zIn); + const float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); + const float zGeom1_another = SDL::pixelPSZpitch; + const float kZ = (zOut - zIn) / dzSDIn; + float drtErr = + zGeom1_another * zGeom1_another * drtSDIn * drtSDIn / dzSDIn / dzSDIn * (1.f - 2.f * kZ + 2.f * kZ * kZ); + const float sdlThetaMulsF = + 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2 * (rtOut - rtIn) / 50.f) * alpaka::math::sqrt(acc, rIn / rtIn); + const float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f; //will need a better guess than x4? 
+ drtErr += + sdlMuls * sdlMuls * multDzDr * multDzDr / 3.f * coshEta * coshEta; //sloppy: relative muls is 1/3 of total muls + drtErr = alpaka::math::sqrt(acc, drtErr); + + //Cut #3: rt-z pointed + + pass = pass and (kZ >= 0) && (rtOut >= rtLo) && (rtOut <= rtHi); + + float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; + float rt_InOut = mdsInGPU.anchorRt[secondMDIndex]; + + float sdIn_alpha = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + + float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; + float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; + + betaIn = sdIn_alpha - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + + float betaInRHmin = betaIn; + float betaInRHmax = betaIn; + + float swapTemp; + + if (alpaka::math::abs(acc, betaInRHmin) > alpaka::math::abs(acc, betaInRHmax)) { + swapTemp = betaInRHmin; + betaInRHmin = betaInRHmax; + betaInRHmax = swapTemp; + } + + float sdIn_dr = alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) * + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) + + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) * + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex])); + float sdIn_d = rt_InOut - rt_InLo; + + float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + betaInCut = alpaka::math::asin( + acc, alpaka::math::min(acc, (-sdIn_dr + dr) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) + + (0.02f / sdIn_d); + + //Cut #4: first beta cut + pass = pass and (alpaka::math::abs(acc, betaInRHmin) < betaInCut); + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, 
+ uint16_t& outerOuterLowerModuleIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + float& zOut, + float& rtOut, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + float& betaIn, + float& betaInCut) { + bool pass = true; + + float rtIn = mdsInGPU.anchorRt[firstMDIndex]; + float rtMid = mdsInGPU.anchorRt[secondMDIndex]; + rtOut = mdsInGPU.anchorRt[thirdMDIndex]; + + float zIn = mdsInGPU.anchorZ[firstMDIndex]; + float zMid = mdsInGPU.anchorZ[secondMDIndex]; + zOut = mdsInGPU.anchorZ[thirdMDIndex]; + + float alpha1GeV_Out = + alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)); + + float dzDrtScale = + alpaka::math::tan(acc, alpha1GeV_Out) / alpha1GeV_Out; // The track can bend in r-z plane slightly + + // Cut #0: Preliminary (Only here in endcap case) + pass = pass and (zIn * zOut > 0); + if (not pass) + return pass; + + float dLum = SDL::copysignf(SDL::deltaZLum, zIn); + bool isOutSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::PS; + bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == SDL::PS; + + float rtGeom = (isInSgInnerMDPS and isOutSgOuterMDPS) ? 2.f * SDL::pixelPSZpitch + : (isInSgInnerMDPS or isOutSgOuterMDPS) ? 
SDL::pixelPSZpitch + SDL::strip2SZpitch + : 2.f * SDL::strip2SZpitch; + + float dz = zOut - zIn; + const float rtLo = rtIn * (1.f + dz / (zIn + dLum) / dzDrtScale) - rtGeom; //slope correction only on the lower end + const float rtHi = rtIn * (1.f + dz / (zIn - dLum)) + rtGeom; + + //Cut #1: rt condition + pass = pass and ((rtOut >= rtLo) && (rtOut <= rtHi)); + if (not pass) + return pass; + + bool isInSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == SDL::PS; + + float drtSDIn = rtMid - rtIn; + float dzSDIn = zMid - zIn; + float dr3SDIn = + alpaka::math::sqrt(acc, rtMid * rtMid + zMid * zMid) - alpaka::math::sqrt(acc, rtIn * rtIn + zIn * zIn); + + float coshEta = dr3SDIn / drtSDIn; //direction estimate + float dzOutInAbs = alpaka::math::abs(acc, zOut - zIn); + float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); + + float kZ = (zOut - zIn) / dzSDIn; + float sdlThetaMulsF = 0.015f * alpaka::math::sqrt(acc, 0.1f + 0.2f * (rtOut - rtIn) / 50.f); + + float sdlMuls = sdlThetaMulsF * 3.f / SDL::ptCut * 4.f; //will need a better guess than x4? 
+ + float drtErr = alpaka::math::sqrt( + acc, + SDL::pixelPSZpitch * SDL::pixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + + sdlMuls * sdlMuls * multDzDr * multDzDr / 3.f * coshEta * coshEta); + + float drtMean = drtSDIn * dzOutInAbs / alpaka::math::abs(acc, dzSDIn); + float rtWindow = drtErr + rtGeom; + float rtLo_point = rtIn + drtMean / dzDrtScale - rtWindow; + float rtHi_point = rtIn + drtMean + rtWindow; + + // Cut #3: rt-z pointed + // https://github.com/slava77/cms-tkph2-ntuple/blob/superDoubletLinked-91X-noMock/doubletAnalysis.C#L3765 + + if (isInSgInnerMDPS and isInSgOuterMDPS) // If both PS then we can point + { + pass = pass and ((kZ >= 0) && (rtOut >= rtLo_point) && (rtOut <= rtHi_point)); + } + + float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; + float rt_InOut = mdsInGPU.anchorRt[secondMDIndex]; + float sdIn_alpha = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); + + float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; + float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; + + betaIn = sdIn_alpha - SDL::phi_mpi_pi(acc, SDL::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + + float sdIn_alphaRHmin = __H2F(segmentsInGPU.dPhiChangeMins[innerSegmentIndex]); + float sdIn_alphaRHmax = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]); + float betaInRHmin = betaIn + sdIn_alphaRHmin - sdIn_alpha; + float betaInRHmax = betaIn + sdIn_alphaRHmax - sdIn_alpha; + + float swapTemp; + + if (alpaka::math::abs(acc, betaInRHmin) > alpaka::math::abs(acc, betaInRHmax)) { + swapTemp = betaInRHmin; + betaInRHmin = betaInRHmax; + betaInRHmax = swapTemp; + } + float sdIn_dr = alpaka::math::sqrt(acc, + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) * + (mdsInGPU.anchorX[secondMDIndex] - mdsInGPU.anchorX[firstMDIndex]) + + (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) * + (mdsInGPU.anchorY[secondMDIndex] - 
mdsInGPU.anchorY[firstMDIndex])); + float sdIn_d = rt_InOut - rt_InLo; + + float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); + betaInCut = alpaka::math::asin( + acc, alpaka::math::min(acc, (-sdIn_dr + dr) * SDL::k2Rinv1GeVf / SDL::ptCut, SDL::sinAlphaMax)) + + (0.02f / sdIn_d); + + //Cut #4: first beta cut + pass = pass and (alpaka::math::abs(acc, betaInRHmin) < betaInCut); + return pass; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& firstMDIndex, + unsigned int& secondMDIndex, + unsigned int& thirdMDIndex, + float& zOut, + float& rtOut, + uint16_t& innerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + float& betaIn, + float& betaInCut) { + short innerInnerLowerModuleSubdet = modulesInGPU.subdets[innerInnerLowerModuleIndex]; + short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; + short outerOuterLowerModuleSubdet = modulesInGPU.subdets[outerOuterLowerModuleIndex]; + + if (innerInnerLowerModuleSubdet == SDL::Barrel and middleLowerModuleSubdet == SDL::Barrel and + outerOuterLowerModuleSubdet == SDL::Barrel) { + return passPointingConstraintBBB(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + zOut, + rtOut, + innerSegmentIndex, + betaIn, + betaInCut); + } else if (innerInnerLowerModuleSubdet == SDL::Barrel and middleLowerModuleSubdet == SDL::Barrel and + outerOuterLowerModuleSubdet == SDL::Endcap) { + return passPointingConstraintBBE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + 
outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + zOut, + rtOut, + innerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut); + } else if (innerInnerLowerModuleSubdet == SDL::Barrel and middleLowerModuleSubdet == SDL::Endcap and + outerOuterLowerModuleSubdet == SDL::Endcap) { + return passPointingConstraintBBE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + zOut, + rtOut, + innerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut); + + } + + else if (innerInnerLowerModuleSubdet == SDL::Endcap and middleLowerModuleSubdet == SDL::Endcap and + outerOuterLowerModuleSubdet == SDL::Endcap) { + return passPointingConstraintEEE(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + zOut, + rtOut, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut); + } + return false; // failsafe + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeRadiusFromThreeAnchorHits( + TAcc const& acc, float x1, float y1, float x2, float y2, float x3, float y3, float& g, float& f) { + float radius = 0.f; + + //writing manual code for computing radius, which obviously sucks + //TODO:Use fancy inbuilt libraries like cuBLAS or cuSOLVE for this! 
+ //(g,f) -> center + //first anchor hit - (x1,y1), second anchor hit - (x2,y2), third anchor hit - (x3, y3) + + float denomInv = 1.0f / ((y1 - y3) * (x2 - x3) - (x1 - x3) * (y2 - y3)); + + float xy1sqr = x1 * x1 + y1 * y1; + + float xy2sqr = x2 * x2 + y2 * y2; + + float xy3sqr = x3 * x3 + y3 * y3; + + g = 0.5f * ((y3 - y2) * xy1sqr + (y1 - y3) * xy2sqr + (y2 - y1) * xy3sqr) * denomInv; + + f = 0.5f * ((x2 - x3) * xy1sqr + (x3 - x1) * xy2sqr + (x1 - x2) * xy3sqr) * denomInv; + + float c = ((x2 * y3 - x3 * y2) * xy1sqr + (x3 * y1 - x1 * y3) * xy2sqr + (x1 * y2 - x2 * y1) * xy3sqr) * denomInv; + + if (((y1 - y3) * (x2 - x3) - (x1 - x3) * (y2 - y3) == 0) || (g * g + f * f - c < 0)) { +#ifdef Warnings + printf("three collinear points or FATAL! r^2 < 0!\n"); +#endif + radius = -1.f; + } else + radius = alpaka::math::sqrt(acc, g * g + f * f - c); + + return radius; + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const& acc, + struct SDL::modules& modulesInGPU, + struct SDL::miniDoublets& mdsInGPU, + struct SDL::segments& segmentsInGPU, + uint16_t& innerInnerLowerModuleIndex, + uint16_t& middleLowerModuleIndex, + uint16_t& outerOuterLowerModuleIndex, + unsigned int& innerSegmentIndex, + unsigned int& outerSegmentIndex, + float& zOut, + float& rtOut, + float& deltaPhiPos, + float& deltaPhi, + float& betaIn, + float& circleRadius, + float& circleCenterX, + float& circleCenterY, + float& zLo, + float& zHi, + float& rtLo, + float& rtHi, + float& zLoPointed, + float& zHiPointed, + float& sdlCut, + float& betaInCut) { + bool pass = true; + //this cut reduces the number of candidates by a factor of 4, i.e., 3 out of 4 warps can end right here! 
+ if (segmentsInGPU.mdIndices[2 * innerSegmentIndex + 1] != segmentsInGPU.mdIndices[2 * outerSegmentIndex]) + return false; + + unsigned int firstMDIndex = segmentsInGPU.mdIndices[2 * innerSegmentIndex]; + unsigned int secondMDIndex = segmentsInGPU.mdIndices[2 * outerSegmentIndex]; + unsigned int thirdMDIndex = segmentsInGPU.mdIndices[2 * outerSegmentIndex + 1]; + + pass = pass and (passRZConstraint(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex)); + if (not pass) + return pass; + pass = pass and (passPointingConstraint(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + firstMDIndex, + secondMDIndex, + thirdMDIndex, + zOut, + rtOut, + middleLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + betaIn, + betaInCut)); + if (not pass) + return pass; + + float x1 = mdsInGPU.anchorX[firstMDIndex]; + float x2 = mdsInGPU.anchorX[secondMDIndex]; + float x3 = mdsInGPU.anchorX[thirdMDIndex]; + float y1 = mdsInGPU.anchorY[firstMDIndex]; + float y2 = mdsInGPU.anchorY[secondMDIndex]; + float y3 = mdsInGPU.anchorY[thirdMDIndex]; + + circleRadius = computeRadiusFromThreeAnchorHits(acc, x1, y1, x2, y2, x3, y3, circleCenterX, circleCenterY); + return pass; + }; + + struct createTripletsInGPUv2 { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::miniDoublets mdsInGPU, + struct SDL::segments segmentsInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::objectRanges rangesInGPU, + uint16_t* index_gpu, + uint16_t nonZeroModules) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t innerLowerModuleArrayIdx = globalThreadIdx[0]; innerLowerModuleArrayIdx < nonZeroModules; + innerLowerModuleArrayIdx += 
gridThreadExtent[0]) { + uint16_t innerInnerLowerModuleIndex = index_gpu[innerLowerModuleArrayIdx]; + if (innerInnerLowerModuleIndex >= *modulesInGPU.nLowerModules) + continue; + + uint16_t nConnectedModules = modulesInGPU.nConnectedModules[innerInnerLowerModuleIndex]; + if (nConnectedModules == 0) + continue; + + unsigned int nInnerSegments = segmentsInGPU.nSegments[innerInnerLowerModuleIndex]; + for (unsigned int innerSegmentArrayIndex = globalThreadIdx[1]; innerSegmentArrayIndex < nInnerSegments; + innerSegmentArrayIndex += gridThreadExtent[1]) { + unsigned int innerSegmentIndex = + rangesInGPU.segmentRanges[innerInnerLowerModuleIndex * 2] + innerSegmentArrayIndex; + + // middle lower module - outer lower module of inner segment + uint16_t middleLowerModuleIndex = segmentsInGPU.outerLowerModuleIndices[innerSegmentIndex]; + + unsigned int nOuterSegments = segmentsInGPU.nSegments[middleLowerModuleIndex]; + for (unsigned int outerSegmentArrayIndex = globalThreadIdx[2]; outerSegmentArrayIndex < nOuterSegments; + outerSegmentArrayIndex += gridThreadExtent[2]) { + unsigned int outerSegmentIndex = + rangesInGPU.segmentRanges[2 * middleLowerModuleIndex] + outerSegmentArrayIndex; + + uint16_t outerOuterLowerModuleIndex = segmentsInGPU.outerLowerModuleIndices[outerSegmentIndex]; + + float zOut, rtOut, deltaPhiPos, deltaPhi, betaIn, circleRadius, circleCenterX, circleCenterY; + float zLo, zHi, rtLo, rtHi, zLoPointed, zHiPointed, sdlCut, betaInCut; + + bool success = runTripletConstraintsAndAlgo(acc, + modulesInGPU, + mdsInGPU, + segmentsInGPU, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + innerSegmentIndex, + outerSegmentIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + circleRadius, + circleCenterX, + circleCenterY, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut); + + if (success) { + unsigned int totOccupancyTriplets = alpaka::atomicOp( + acc, 
&tripletsInGPU.totOccupancyTriplets[innerInnerLowerModuleIndex], 1u); + if (static_cast(totOccupancyTriplets) >= + rangesInGPU.tripletModuleOccupancy[innerInnerLowerModuleIndex]) { +#ifdef Warnings + printf("Triplet excess alert! Module index = %d\n", innerInnerLowerModuleIndex); +#endif + } else { + unsigned int tripletModuleIndex = + alpaka::atomicOp(acc, &tripletsInGPU.nTriplets[innerInnerLowerModuleIndex], 1u); + unsigned int tripletIndex = + rangesInGPU.tripletModuleIndices[innerInnerLowerModuleIndex] + tripletModuleIndex; +#ifdef CUT_VALUE_DEBUG + addTripletToMemory(modulesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + innerSegmentIndex, + outerSegmentIndex, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + zOut, + rtOut, + deltaPhiPos, + deltaPhi, + betaIn, + circleRadius, + circleCenterX, + circleCenterY, + zLo, + zHi, + rtLo, + rtHi, + zLoPointed, + zHiPointed, + sdlCut, + betaInCut, + tripletIndex); +#else + addTripletToMemory(modulesInGPU, + mdsInGPU, + segmentsInGPU, + tripletsInGPU, + innerSegmentIndex, + outerSegmentIndex, + innerInnerLowerModuleIndex, + middleLowerModuleIndex, + outerOuterLowerModuleIndex, + betaIn, + circleRadius, + circleCenterX, + circleCenterY, + tripletIndex); +#endif + } + } + } + } + } + } + }; + + struct createTripletArrayRanges { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::objectRanges rangesInGPU, + struct SDL::segments segmentsInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + // Initialize variables in shared memory and set to 0 + int& nTotalTriplets = alpaka::declareSharedVar(acc); + nTotalTriplets = 0; + alpaka::syncBlockThreads(acc); + + // Initialize variables outside of the for loop. 
+ int occupancy, category_number, eta_number; + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (segmentsInGPU.nSegments[i] == 0) { + rangesInGPU.tripletModuleIndices[i] = nTotalTriplets; + rangesInGPU.tripletModuleOccupancy[i] = 0; + continue; + } + + short module_rings = modulesInGPU.rings[i]; + short module_layers = modulesInGPU.layers[i]; + short module_subdets = modulesInGPU.subdets[i]; + float module_eta = alpaka::math::abs(acc, modulesInGPU.eta[i]); + + if (module_layers <= 3 && module_subdets == 5) + category_number = 0; + else if (module_layers >= 4 && module_subdets == 5) + category_number = 1; + else if (module_layers <= 2 && module_subdets == 4 && module_rings >= 11) + category_number = 2; + else if (module_layers >= 3 && module_subdets == 4 && module_rings >= 8) + category_number = 2; + else if (module_layers <= 2 && module_subdets == 4 && module_rings <= 10) + category_number = 3; + else if (module_layers >= 3 && module_subdets == 4 && module_rings <= 7) + category_number = 3; + else + category_number = -1; + + if (module_eta < 0.75) + eta_number = 0; + else if (module_eta > 0.75 && module_eta < 1.5) + eta_number = 1; + else if (module_eta > 1.5 && module_eta < 2.25) + eta_number = 2; + else if (module_eta > 2.25 && module_eta < 3) + eta_number = 3; + else + eta_number = -1; + + if (category_number == 0 && eta_number == 0) + occupancy = 543; + else if (category_number == 0 && eta_number == 1) + occupancy = 235; + else if (category_number == 0 && eta_number == 2) + occupancy = 88; + else if (category_number == 0 && eta_number == 3) + occupancy = 46; + else if (category_number == 1 && eta_number == 0) + occupancy = 755; + else if (category_number == 1 && eta_number == 1) + occupancy = 347; + else if (category_number == 2 && eta_number == 1) + occupancy = 0; + else if (category_number == 2 && eta_number == 2) + occupancy = 0; + else if (category_number == 3 && eta_number == 1) + occupancy = 38; + 
else if (category_number == 3 && eta_number == 2) + occupancy = 46; + else if (category_number == 3 && eta_number == 3) + occupancy = 39; + else { + occupancy = 0; +#ifdef Warnings + printf("Unhandled case in createTripletArrayRanges! Module index = %i\n", i); +#endif + } + + rangesInGPU.tripletModuleOccupancy[i] = occupancy; + unsigned int nTotT = alpaka::atomicOp(acc, &nTotalTriplets, occupancy); + rangesInGPU.tripletModuleIndices[i] = nTotT; + } + + // Wait for all threads to finish before reporting final values + alpaka::syncBlockThreads(acc); + if (globalThreadIdx[2] == 0) { + *rangesInGPU.device_nTotalTrips = nTotalTriplets; + } + } + }; + + struct addTripletRangesToEventExplicit { + template + ALPAKA_FN_ACC void operator()(TAcc const& acc, + struct SDL::modules modulesInGPU, + struct SDL::triplets tripletsInGPU, + struct SDL::objectRanges rangesInGPU) const { + auto const globalThreadIdx = alpaka::getIdx(acc); + auto const gridThreadExtent = alpaka::getWorkDiv(acc); + + for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + if (tripletsInGPU.nTriplets[i] == 0) { + rangesInGPU.tripletRanges[i * 2] = -1; + rangesInGPU.tripletRanges[i * 2 + 1] = -1; + } else { + rangesInGPU.tripletRanges[i * 2] = rangesInGPU.tripletModuleIndices[i]; + rangesInGPU.tripletRanges[i * 2 + 1] = rangesInGPU.tripletModuleIndices[i] + tripletsInGPU.nTriplets[i] - 1; + } + } + } + }; +} // namespace SDL +#endif diff --git a/RecoTracker/LSTCore/standalone/.gitignore b/RecoTracker/LSTCore/standalone/.gitignore new file mode 100644 index 0000000000000..aae6f2984e8cc --- /dev/null +++ b/RecoTracker/LSTCore/standalone/.gitignore @@ -0,0 +1,43 @@ +mtv +*~ +results/ +*.o +debug.root +*.pdf +plots/ +plots_*/ +scripts/moduleconnection*.txt +*.root +.make.log* +bin/doAnalysis +bin/sdl +bin/sdl_cuda +bin/sdl_cpu +bin/sdl_rocm +code/rooutil/librooutil.so +code/rooutil/rooutil.so +.gitversion.txt +efficiency/doAnalysis +.jobs.txt +efficiency/results* 
+efficiencies/ +efficiency/bin/createEffNumDenPlots +efficiency/bin/createPerfNumDenHists +efficiency/compare +efficiency/summary +*.txt +*.pyc +output* +movetoweb.sh +*.nvvp +*.ipynb +*.log +*.nsys-rep +*.sqlite +*.ncu-rep +*.swp + +*.nfs* +.directoryhash +performance/ +notebooks/ diff --git a/RecoTracker/LSTCore/standalone/Makefile b/RecoTracker/LSTCore/standalone/Makefile new file mode 100644 index 0000000000000..7e925dbf64f2d --- /dev/null +++ b/RecoTracker/LSTCore/standalone/Makefile @@ -0,0 +1,78 @@ +# Simple makefile + +EXES := bin/sdl_cpu bin/sdl_cuda + +SOURCES=$(wildcard code/core/*.cc) +OBJECTS_CPU=$(SOURCES:.cc=_cpu.o) +OBJECTS_CUDA=$(SOURCES:.cc=_cuda.o) +OBJECTS_ROCM=$(SOURCES:.cc=_rocm.o) +OBJECTS=$(OBJECTS_CPU) $(OBJECTS_CUDA) $(OBJECTS_ROCM) + +CXX = g++ +CXXFLAGS = -g -O2 -Wall -fPIC -Wshadow -Woverloaded-virtual -Wno-unused-function -fno-var-tracking -std=c++17 -DLST_IS_CMSSW_PACKAGE +INCLUDEFLAGS= -ISDL -I$(shell pwd) -Icode -Icode/core -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include $(shell rooutil-config --include) -I$(shell root-config --incdir) -I${CMSSW_BASE}/src -I../interface/alpaka/ -I../src/alpaka/ +ifdef CMSSW_RELEASE_BASE +INCLUDEFLAGS:= ${INCLUDEFLAGS} -I${CMSSW_RELEASE_BASE}/src +endif +LDFLAGS = -g -O2 $(SDLLIB) -L${TRACKLOOPERDIR}/SDL $(shell rooutil-config --libs) $(shell root-config --libs) +LDFLAGS_CUDA= -L${CUDA_HOME}/lib64 -lcudart +LDFLAGS_ROCM= -L${ROCM_ROOT}/lib -lamdhip64 +ALPAKAFLAGS = -DALPAKA_DEBUG=0 +CUDAINCLUDE = -I${CUDA_HOME}/include +ROCMINCLUDE = -I${ROCM_ROOT}/include +ALPAKA_CPU = -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +ALPAKA_CUDA = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_HOST_ONLY +ALPAKA_ROCM = -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_HOST_ONLY -DALPAKA_DISABLE_VENDOR_RNG -D__HIP_PLATFORM_HCC__ -D__HIP_PLATFORM_AMD__ +EXTRAFLAGS = -ITMultiDrawTreePlayer -Wunused-variable -lTMVA -lEG -lGenVector -lXMLIO -lMLP -lTreePlayer -fopenmp +DOQUINTUPLET = +PTCUTFLAG = +CUTVALUEFLAG = +CUTVALUEFLAG_FLAGS = 
-DCUT_VALUE_DEBUG + +PRIMITIVEFLAG = +PRIMITIVEFLAG_FLAGS = -DPRIMITIVE_STUDY + +all: rooutil efficiency $(EXES) + +cutvalue: CUTVALUEFLAG = ${CUTVALUEFLAG_FLAGS} +cutvalue: rooutil efficiency $(EXES) + +primitive: PRIMITIVEFLAG = ${PRIMITIVEFLAG_FLAGS} +primitive: rooutil efficiency $(EXES) + +cutvalue_primitive: CUTVALUEFLAG = ${CUTVALUEFLAG_FLAGS} +cutvalue_primitive: PRIMITIVEFLAG = ${PRIMITIVEFLAG_FLAGS} +cutvalue_primitive: rooutil efficiency $(EXES) + + +bin/sdl_cpu: SDLLIB=-lsdl_cpu +bin/sdl_cpu: bin/sdl_cpu.o $(OBJECTS_CPU) + $(CXX) $(LDFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $^ $(ROOTLIBS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_CPU) -o $@ +bin/sdl_cuda: SDLLIB=-lsdl_cuda +bin/sdl_cuda: bin/sdl_cuda.o $(OBJECTS_CUDA) + $(CXX) $(LDFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $^ $(ROOTLIBS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_CUDA) $(LDFLAGS_CUDA) -o $@ +bin/sdl_rocm: SDLLIB=-lsdl_rocm +bin/sdl_rocm: bin/sdl_rocm.o $(OBJECTS_ROCM) + $(CXX) $(LDFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $^ $(ROOTLIBS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_ROCM) $(LDFLAGS_ROCM) -o $@ + +%_cpu.o: %.cc rooutil + $(CXX) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_CPU) $< -c -o $@ +%_cuda.o: %.cc rooutil + $(CXX) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_CUDA) $(CUDAINCLUDE) $< -c -o $@ +%_rocm.o: %.cc rooutil + $(CXX) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDEFLAGS) $(ALPAKAFLAGS) $(PTCUTFLAG) $(CUTVALUEFLAG) $(PRIMITIVEFLAG) $(DOQUINTUPLET) $(ALPAKA_ROCM) $(ROCMINCLUDE) $< -c -o $@ + +rooutil: + $(MAKE) -C code/rooutil/ + +efficiency: rooutil + $(MAKE) -C efficiency/ + +clean: + rm -f $(OBJECTS) bin/*.o $(EXES) bin/sdl + rm -f code/rooutil/*.so code/rooutil/*.o + rm -f bin/sdl.o + rm -f 
SDL/*.o + cd efficiency/ && make clean + +.PHONY: rooutil efficiency diff --git a/RecoTracker/LSTCore/standalone/README.md b/RecoTracker/LSTCore/standalone/README.md new file mode 100644 index 0000000000000..02fbef943f697 --- /dev/null +++ b/RecoTracker/LSTCore/standalone/README.md @@ -0,0 +1,291 @@ +# TrackLooper + + +## Quick Start + + +### Setting up LSTPerformanceWeb (only for lnx7188 and lnx4555) + +For lnx7188 and lnx4555 this needs to be done once + + cd /cdat/tem/${USER}/ + git clone git@github.com:SegmentLinking/LSTPerformanceWeb.git + +### Setting up container (only for lnx7188) + +For lnx7188 this needs to be done before compiling or running the code: + + singularity shell --nv --bind /mnt/data1:/data --bind /data2/segmentlinking/ --bind /opt --bind /nfs --bind /mnt --bind /usr/local/cuda/bin/ --bind /cvmfs /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cmssw/el8:x86_64 + +### Setting up the code + + git clone git@github.com:SegmentLinking/TrackLooper.git + cd TrackLooper/ + # Source one of the commands below, depending on the site + source setup.sh # if on UCSD or Cornell + source setup_hpg.sh # if on Florida + +### Running the code + + sdl_make_tracklooper -mc + sdl_ -i PU200 -o LSTNtuple.root + createPerfNumDenHists -i LSTNtuple.root -o LSTNumDen.root + lst_plot_performance.py LSTNumDen.root -t "myTag" + # python3 efficiency/python/lst_plot_performance.py LSTNumDen.root -t "myTag" # if you are on cgpu-1 or Cornell + +The above can be even simplified + + sdl_run -f -mc -s PU200 -n -1 -t myTag + +The `-f` flag can be omitted when the code has already been compiled. If multiple backends were compiled, then the `-b` flag can be used to specify a backend. For example + + sdl_run -b cpu -s PU200 -n -1 -t myTag + +## Command explanations + +Compile the code with option flags. If none of `C,G,R,A` are used, then it defaults to compiling for CUDA and CPU. 
+
+    sdl_make_tracklooper -mc
+    -m: make clean binaries
+    -c: run with the cmssw caching allocator
+    -C: compile CPU backend
+    -G: compile CUDA backend
+    -R: compile ROCm backend
+    -A: compile all backends
+    -h: show help screen with all options
+
+Run the code
+
+    sdl_ -n -v -w -s -i -o
+
+    -i: PU200; muonGun, etc
+    -n: number of events; default: all
+    -v: 0-no printout; 1- timing printout only; 2- multiplicity printout; default: 0
+    -s: number of streams/events in flight; default: 1
+    -w: 0- no writeout; 1- minimum writeout; default: 1
+    -o: provide an output root file name (e.g. LSTNtuple.root); default: debug.root
+    -l: add lower level object (pT3, pT5, T5, etc.) branches to the output
+
+Plotting numerators and denominators of performance plots
+
+    createPerfNumDenHists -i -o [-g -n ]
+
+    -i: Path to LSTNtuple.root
+    -o: provide an output root file name (e.g. num_den_hist.root)
+    -n: (optional) number of events
+    -g: (optional) comma separated pdgids to add more efficiency plots with different sim particle slices
+
+Plotting performance plots
+
+    lst_plot_performance.py num_den_hist.root -t "mywork"
+
+There are several options you can provide to restrict the number of plots being produced.
+And by default, it creates a certain set of objects.
+One can specify the type, range, metric, etc.
+To see the full information type
+
+    lst_plot_performance.py --help
+
+To give an example of plotting efficiency, object type of lower level T5, for |eta| < 2.5 only.
+
+    lst_plot_performance.py num_den_hist.root -t "mywork" -m eff -o T5_lower -s loweta
+
+NOTE: in order to plot lower level object, ```-l``` option must have been used during ```sdl``` step!
+
+When running on ```cgpu-1``` remember to specify python3 as there is no python.
+The shebang on the ```lst_plot_performance.py``` is not updated as ```lnx7188``` works with python2...
+ + python3 efficiency/python/lst_plot_performance.py num_den_hist.root -t "mywork" # If running on cgpu-1 + +Comparing two different runs + + lst_plot_performance.py \ + num_den_hist_1.root \ # Reference + num_den_hist_2.root \ # New work + -L BaseLine,MyNewWork \ # Labeling + -t "mywork" \ + --compare + +## CMSSW Integration +This is the a complete set of instruction on how the TrackLooper code +can be linked as an external tool in CMSSW: + +### Build TrackLooper +```bash +git clone git@github.com:SegmentLinking/TrackLooper.git +cd TrackLooper/ +# Source one of the commands below, depending on the site +source setup.sh # if on UCSD or Cornell +source setup_hpg.sh # if on Florida +sdl_make_tracklooper -mc +cd .. +``` + +### Set up `TrackLooper` as an external +```bash +mkdir workingFolder # Create the folder you will be working in +cd workingFolder +cmsrel CMSSW_14_1_0_pre3 +cd CMSSW_14_1_0_pre3/src +cmsenv +git cms-init +git remote add SegLink git@github.com:SegmentLinking/cmssw.git +git fetch SegLink CMSSW_14_1_0_pre3_LST_X +git cms-addpkg RecoTracker Configuration +git checkout CMSSW_14_1_0_pre3_LST_X +#To include both the CPU library and GPU library into CMSSW, create 3 xml files (headers file has no library). +#Before writing the following xml file, check that libsdl_cpu.so and libsdl_gpu.so can be found under the ../../../TrackLooper/SDL/ folder. +cat <lst_headers.xml + + + + + + + +EOF +cat <lst_cpu.xml + + + + + + + + + +EOF +cat <lst_cuda.xml + + + + + + + + + +EOF +scram setup lst_headers.xml +scram setup lst_cpu.xml +scram setup lst_cuda.xml +cmsenv +git cms-checkdeps -a -A +scram b -j 12 +``` + +### Run the LST reconstruction in CMSSW +A simple test configuration of the LST reconstruction can be run with the command: +```bash +cmsRun RecoTracker/LST/test/LSTAlpakaTester.py +``` + +For a more complete workflow, one can run a modified version of the 21034.1 workflow. 
+To get the commands of this workflow, one can run:
+```bash
+runTheMatrix.py -w upgrade -n -e -l 21034.1
+```
+
+For convenience, the workflow has been run for 100 events and the output is stored here:
+```bash
+/data2/segmentlinking/CMSSW_14_1_0_pre0/step2_21034.1_100Events.root
+```
+
+For enabling the LST reconstruction in the CMSSW tracking workflow, a modified step3 needs to be run.
+This is based on the step3 command of the 21034.1 workflow with the following changes:
+ - Remove the `--pileup_input` and `--pileup` flags.
+ - The number of threads and streams for the job can be optionally controlled by the `--nThreads` and `--nStreams` command line options respectively (`1` ends up being the actual default value for both, and more info can be found by running `cmsDriver.py --help`).
+ - Add at the end of the command: `--procModifiers gpu,trackingLST,trackingIters01 --no_exec`
+
+Run the command and modify the output configuration file with the following:
+ - If you want to run a CPU version, remove the ```gpu``` in the line defining the `process` object:
+ ```python
+ process = cms.Process('RECO',...,gpu,...)
+ ```
+ - Add the following lines below the part where the import of the standard configurations happens:
+ ```python
+ process.load('Configuration.StandardSequences.Accelerators_cff')
+ process.load("HeterogeneousCore.AlpakaCore.ProcessAcceleratorAlpaka_cfi")
+ ```
+ - Modify the input and output file names accordingly, as well as the number of events.
+
+Then, run the configuration file with `cmsRun`.
+
+To get the DQM files, one would have to run step4 of the 21034.1 workflow with the following modifications:
+ - Add `--no_exec` to the end of the command and then run it.
+ - Modify the output configuration file by changing the input file (the one containing `inDQM` from the previous step) and number of events accordingly.
+
+Running the configuration file with `cmsRun`, the output file will have a name starting with `DQM`.
The name is the same every time this step runs, +so it is good practice to rename the file, e.g. to `tracking_Iters01LST.root`. +The MTV plots can be produced with the command: +```bash +makeTrackValidationPlots.py --extended tracking_Iters01LST.root +``` +Comparison plots can be made by including multiple ROOT files as arguments. + +**Note:** In case one wants to run step2 as well, similar modifications as in step4 (`--no_exec` flag and input file/number of events) need to be applied. Moreover, the PU files have better be modified to point to local ones. This can be done by inserting a dummy file when running the command (set the argument of the `--pileup_input` flag to `file:file.root`), and then change the PU input files in the configuration to the following line (by means of replacing the corresponding line in the configuration): +```python +process.mix.input.fileNames = cms.untracked.vstring(['file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/066fc95d-1cef-4469-9e08-3913973cd4ce.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/07928a25-231b-450d-9d17-e20e751323a1.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/26bd8fb0-575e-4201-b657-94cdcb633045.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/4206a9c5-44c2-45a5-aab2-1a8a6043a08a.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/55a372bf-a234-4111-8ce0-ead6157a1810.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/59ad346c-f405-4288-96d7-795f81c43fe8.root', 
'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/7280f5ec-b71d-4579-a730-7ce2de0ff906.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/b93adc85-715f-477a-afc9-65f3241933ee.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/c7a0aa46-f55c-4b01-977f-34a397b71fba.root', 'file:/data2/segmentlinking/PUSamplesForCMSSW1263/CMSSW_12_3_0_pre5/RelValMinBias_14TeV/GEN-SIM/123X_mcRun4_realistic_v4_2026D88noPU-v1/e77fa467-97cb-4943-884f-6965b4eb0390.root']) +``` + +### Inclusion of LST in other CMSSW packages +Including the line +``` + +``` +in the relevant package `BuildFile.xml` allows for +including our headers in the code of that package. + +## Running LST in a CVMFS-less setup + +The setup scripts included in this repository assume that the [CernVM File System (CVMFS)](https://cernvm.cern.ch/fs/) is installed. This provides a convenient way to fetch the required dependencies, but it is not necessary to run LST in standalone mode. Here, we briefly describe how to build and run it when CVMFS is not available. + +The necessary dependencies are CUDA, ROOT, the Boost libraries, Alpaka, and some CMSSW headers. CUDA, ROOT, and Boost, are fairly standard libraries and are available from multiple package managers. For the remaining necessary headers you will need to clone the [Alpaka](https://github.com/alpaka-group/alpaka) and [CMSSW](https://github.com/cms-sw/cmssw) repositories. The Alpaka repository is reasonably sized, but the CMSSW one extremely large, especially considering that we only need a tiny fraction of its files to build LST. We can get only the Alpaka interface headers from CMSSW by running the following commands. 
+ +``` bash +git clone --filter=blob:none --no-checkout --depth 1 --sparse --branch CMSSW_14_1_X https://github.com/cms-sw/cmssw.git +cd cmssw +git sparse-checkout add HeterogeneousCore/AlpakaInterface +git checkout +``` + +Then all that is left to do is set some environment variables. We give an example of how to do this in lnx7188/cgpu-1. + +```bash +# These two lines are only needed to set the right version of gcc and nvcc. They are not needed for standard installations. +export PATH=/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/gcc/12.3.1-40d504be6370b5a30e3947a6e575ca28/bin:/cvmfs/cms.cern.ch/el8_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre3/external/el8_amd64_gcc12/bin:$PATH +export LD_LIBRARY_PATH=/cvmfs/cms.cern.ch/el8_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre3/biglib/el8_amd64_gcc12:/cvmfs/cms.cern.ch/el8_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre3/lib/el8_amd64_gcc12:/cvmfs/cms.cern.ch/el8_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre3/external/el8_amd64_gcc12/lib:/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/gcc/12.3.1-40d504be6370b5a30e3947a6e575ca28/lib64:/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/gcc/12.3.1-40d504be6370b5a30e3947a6e575ca28/lib:$LD_LIBRARY_PATH + +# These are the lines that you need to manually change for a CVMFS-less setup. +# In this example we use cvmfs paths since that is where the dependencies are in lnx7188/cgpu1, but they can point to local directories. 
+export BOOST_ROOT=/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/boost/1.80.0-60a217837b5db1cff00c7d88ec42f53a
+export ALPAKA_ROOT=/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/alpaka/1.1.0-7d0324257db47fde2d27987e7ff98fb4
+export CUDA_HOME=/cvmfs/cms.cern.ch/el8_amd64_gcc12/external/cuda/12.4.1-06cde0cd9f95a73a1ea05c8535f60bde
+export ROOT_ROOT=/cvmfs/cms.cern.ch/el8_amd64_gcc12/lcg/root/6.30.07-21947a33e64ceb827a089697ad72e468
+export CMSSW_BASE=/cvmfs/cms.cern.ch/el8_amd64_gcc12/cms/cmssw/CMSSW_14_1_0_pre3
+
+# These lines are needed to account for some extra environment variables that are exported in the setup script.
+export LD_LIBRARY_PATH=$PWD/SDL/cuda:$PWD/SDL/cpu:$PWD:$LD_LIBRARY_PATH
+export PATH=$PWD/bin:$PATH
+export PATH=$PWD/efficiency/bin:$PATH
+export PATH=$PWD/efficiency/python:$PATH
+export TRACKLOOPERDIR=$PWD
+export TRACKINGNTUPLEDIR=/data2/segmentlinking/CMSSW_12_2_0_pre2/
+export LSTOUTPUTDIR=.
+source $PWD/code/rooutil/thisrooutil.sh
+
+# After this, you can compile and run LST as usual.
+sdl_run -f -mc -s PU200 -n -1 -t myTag
+```
+
+## Code formatting and checking
+
+The makefile in the `SDL` directory includes phony targets to run `clang-format` and `clang-tidy` on the code using the formatting and checks used in CMSSW. The following are the available commands.
+
+- `make format`
+  Formats the code in the `SDL` directory using `clang-format` following the rules specified in `.clang-format`.
+- `make check`
+  Runs `clang-tidy` on the code in the `SDL` directory to perform the checks specified in `.clang-tidy`.
+- `make check-fix`
+  Same as `make check`, but fixes the issues that it knows how to fix.
+ \ No newline at end of file diff --git a/RecoTracker/LSTCore/standalone/SDL/.gitignore b/RecoTracker/LSTCore/standalone/SDL/.gitignore new file mode 100644 index 0000000000000..32429d8358fb5 --- /dev/null +++ b/RecoTracker/LSTCore/standalone/SDL/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +.vscode/ diff --git a/RecoTracker/LSTCore/standalone/SDL/Makefile b/RecoTracker/LSTCore/standalone/SDL/Makefile new file mode 100644 index 0000000000000..c50eaca859160 --- /dev/null +++ b/RecoTracker/LSTCore/standalone/SDL/Makefile @@ -0,0 +1,140 @@ +# +# stuff to make +# + +CCSOURCES=$(filter-out ../../src.alpaka/LST.dev.cc, $(wildcard ../../src/alpaka/*.dev.cc)) +CCOBJECTS_CPU=$(patsubst ../../src/alpaka/%.dev.cc, %_cpu.o, $(CCSOURCES)) +CCOBJECTS_CUDA=$(patsubst ../../src/alpaka/%.dev.cc, %_cuda.o, $(CCSOURCES)) +CCOBJECTS_ROCM=$(patsubst ../../src/alpaka/%.dev.cc, %_rocm.o, $(CCSOURCES)) + +LSTSOURCES=../../src/alpaka/LST.dev.cc +LSTOBJECTS_CPU=$(patsubst ../../src/alpaka/%.dev.cc, %_cpu.o, $(LSTSOURCES)) +LSTOBJECTS_CUDA=$(patsubst ../../src/alpaka/%.dev.cc, %_cuda.o, $(LSTSOURCES)) +LSTOBJECTS_ROCM=$(patsubst ../../src/alpaka/%.dev.cc, %_rocm.o, $(LSTSOURCES)) + +# Default to CPU and CUDA backends +ifeq ($(BACKEND),) + LIB_CPU=libsdl_cpu.so + LIB_CUDA=libsdl_cuda.so +endif + +ifneq ($(findstring cpu,$(BACKEND)),) + LIB_CPU=libsdl_cpu.so +endif +ifneq ($(findstring cuda,$(BACKEND)),) + LIB_CUDA=libsdl_cuda.so +endif +ifneq ($(findstring rocm,$(BACKEND)),) + LIB_ROCM=libsdl_rocm.so +endif +ifneq ($(findstring all,$(BACKEND)),) + LIB_CPU=libsdl_cpu.so + LIB_CUDA=libsdl_cuda.so + LIB_ROCM=libsdl_rocm.so +endif + +LIBS=$(LIB_CPU) $(LIB_CUDA) $(LIB_ROCM) + +# +# flags to keep track of +# + +# Different architectures to optimize for +GENCODE_CUDA := -gencode arch=compute_70,code=[sm_70,compute_70] -gencode arch=compute_89,code=[sm_89,compute_89] + +CXX = g++ +CXXFLAGS_CPU = -march=native -mtune=native -Ofast -fno-reciprocal-math -fopenmp-simd -g -Wall -Wshadow -Woverloaded-virtual 
-fPIC -fopenmp -I.. +CXXFLAGS_CUDA = -O3 -g --compiler-options -Wall --compiler-options -Wshadow --compiler-options -Woverloaded-virtual --compiler-options -fPIC --compiler-options -fopenmp -dc -lineinfo --ptxas-options=-v --cudart shared $(GENCODE_CUDA) --use_fast_math --default-stream per-thread -I.. +CXXFLAGS_ROCM = -O3 -g -Wall -Wshadow -Woverloaded-virtual -fPIC -I${ROCM_ROOT}/include -I.. +CMSSWINCLUDE := -I${CMSSW_BASE}/src -DLST_IS_CMSSW_PACKAGE +ifdef CMSSW_RELEASE_BASE +CMSSWINCLUDE := ${CMSSWINCLUDE} -I${CMSSW_RELEASE_BASE}/src +endif +ALPAKAINCLUDE = -I${ALPAKA_ROOT}/include -I/${BOOST_ROOT}/include -std=c++17 ${CMSSWINCLUDE} +ALPAKASERIAL = -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED +ALPAKACUDA = -DALPAKA_ACC_GPU_CUDA_ENABLED -DALPAKA_ACC_GPU_CUDA_ONLY --expt-relaxed-constexpr +ALPAKAROCM = -DALPAKA_ACC_GPU_HIP_ENABLED -DALPAKA_ACC_GPU_HIP_ONLY -DALPAKA_DISABLE_VENDOR_RNG +ROOTINCLUDE = -I$(ROOT_ROOT)/include +ROOTCFLAGS = -pthread -m64 $(ROOTINCLUDE) +PRINTFLAG = -DT4FromT3 +DUPLICATES = -DDUP_pLS -DDUP_T5 -DDUP_pT5 -DDUP_pT3 -DCrossclean_T5 -DCrossclean_pT3 #-DFP16_Base +CACHEFLAG = +PTCUTFLAG = +LSTWARNINGSFLAG = +CMSSW_WERRORS_CPU = -Werror=pointer-arith -Werror=overlength-strings -Werror=return-type -Werror=missing-braces -Werror=unused-value -Werror=unused-label \ + -Werror=address -Werror=format -Werror=sign-compare -Werror=write-strings -Werror=delete-non-virtual-dtor -Werror=strict-aliasing -Werror=narrowing \ + -Werror=unused-but-set-variable -Werror=reorder -Werror=unused-variable -Werror=conversion-null -Werror=return-local-addr -Wnon-virtual-dtor -Werror=switch \ + -Werror=main -Werror=overflow -Werror=format-contains-nul -Werror=type-limits -Wreturn-type -Wextra -Wpessimizing-move -Wclass-memaccess -Wunused \ + -Wparentheses -Wno-vla -Wno-non-template-friend -Wno-long-long -Wno-cast-function-type -Wno-unused-but-set-parameter -Wno-ignored-qualifiers \ + -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-attributes +CMSSW_WERRORS_CUDA = 
$(patsubst %,-Xcompiler %,$(CMSSW_WERRORS_CPU)) +CMSSW_WERRORS_ROCM = $(CMSSW_WERRORS_CPU) +CACHEFLAG_FLAGS = -DCACHE_ALLOC +T5CUTFLAGS = $(T5DNNFLAG) $(T5RZCHI2FLAG) $(T5RPHICHI2FLAG) + +LD_CPU = g++ +SOFLAGS_CPU = -g -shared -fPIC +ALPAKABACKEND_CPU = $(ALPAKASERIAL) +COMPILE_CMD_CPU = $(LD_CPU) -c + +LD_CUDA = nvcc +SOFLAGS_CUDA = -g -shared --compiler-options -fPIC --cudart shared $(GENCODE_CUDA) +ALPAKABACKEND_CUDA = $(ALPAKACUDA) +COMPILE_CMD_CUDA = $(LD_CUDA) -x cu + +LD_ROCM = hipcc +SOFLAGS_ROCM = -g -shared -fPIC +ALPAKABACKEND_ROCM = $(ALPAKAROCM) +COMPILE_CMD_ROCM = $(LD_ROCM) -c + +CUTVALUEFLAG = +CUTVALUEFLAG_FLAGS = -DCUT_VALUE_DEBUG + +%_cpu.o: ../../src/alpaka/%.dev.cc + $(COMPILE_CMD_CPU) $(CXXFLAGS_CPU) $(ROOTINCLUDE) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(LSTWARNINGSFLAG) $(CMSSW_WERRORS_CPU) $(T5CUTFLAGS) $(NOPLSDUPCLEANFLAG) $(TCPLSTRIPLETSFLAG) $(PTCUTFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKABACKEND_CPU) $< -o $@ + +%_cuda.o: ../../src/alpaka/%.dev.cc + $(COMPILE_CMD_CUDA) $(CXXFLAGS_CUDA) $(ROOTINCLUDE) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(LSTWARNINGSFLAG) $(CMSSW_WERRORS_CUDA) $(T5CUTFLAGS) $(NOPLSDUPCLEANFLAG) $(TCPLSTRIPLETSFLAG) $(PTCUTFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKABACKEND_CUDA) $< -o $@ + +%_rocm.o: ../../src/alpaka/%.dev.cc + $(COMPILE_CMD_ROCM) $(CXXFLAGS_ROCM) $(ROOTINCLUDE) $(PRINTFLAG) $(CACHEFLAG) $(CUTVALUEFLAG) $(LSTWARNINGSFLAG) $(CMSSW_WERRORS_ROCM) $(T5CUTFLAGS) $(NOPLSDUPCLEANFLAG) $(TCPLSTRIPLETSFLAG) $(PTCUTFLAG) $(DUPLICATES) $(ALPAKAINCLUDE) $(ALPAKABACKEND_ROCM) $< -o $@ + +$(LIB_CPU): $(CCOBJECTS_CPU) $(LSTOBJECTS_CPU) + $(LD_CPU) $(SOFLAGS_CPU) $^ -o $@ + +$(LIB_CUDA): $(CCOBJECTS_CUDA) $(LSTOBJECTS_CUDA) + $(LD_CUDA) $(SOFLAGS_CUDA) $^ -o $@ + +$(LIB_ROCM): $(CCOBJECTS_ROCM) $(LSTOBJECTS_ROCM) + $(LD_ROCM) $(SOFLAGS_ROCM) $^ -o $@ + +explicit: $(LIBS) + +explicit_cache: CACHEFLAG += $(CACHEFLAG_FLAGS) +explicit_cache: $(LIBS) + +explicit_cache_cutvalue: CUTVALUEFLAG = 
$(CUTVALUEFLAG_FLAGS) +explicit_cache_cutvalue: CACHEFLAG += $(CACHEFLAG_FLAGS) +explicit_cache_cutvalue: $(LIBS) + +clean: + rm -f *.opp + rm -f *.o + rm -f *.d + rm -f *.so + +.PHONY: clean explicit explicit_cache explicit_cache_cutvalue format check check-fix + +format: + clang-format --style=file:../.clang-format -i *.cc *.h + +# Collect all the include paths from the compiler. +# The .../gcc/x86_64-redhat-linux-gnu/*/include path is excluded since .../gcc/x86_64-redhat-linux-gnu/*/include-fixed should be used instead. +TIDYINCLUDEFLAGS := $(shell g++ -E -x c++ - -v < /dev/null 2>&1 | awk '/#include <...>/,/^End of search/{if (/^ / && !/x86_64-redhat-linux-gnu\/[0-9.]+\/include$$/) print "-I"$$1}' | tr '\n' ' ') +TIDYFLAGS := --language=c++ $(CXXFLAGS_CPU) $(ALPAKAINCLUDE) $(ALPAKASERIAL) $(ROOTCFLAGS) $(PRINTFLAG) $(DUPLICATED) $(CACHEFLAG_FLAGS) $(TIDYINCLUDEFLAGS) + +check: + clang-tidy --config-file=../.clang-tidy *.cc *.h -- $(TIDYFLAGS) + +check-fix: + clang-tidy --config-file=../.clang-tidy --format-style=file:../.clang-format --fix --fix-errors --fix-notes *.cc *.h -- $(TIDYFLAGS) diff --git a/RecoTracker/LSTCore/standalone/bin/sdl.cc b/RecoTracker/LSTCore/standalone/bin/sdl.cc new file mode 100644 index 0000000000000..3aed3e3c4d3b7 --- /dev/null +++ b/RecoTracker/LSTCore/standalone/bin/sdl.cc @@ -0,0 +1,509 @@ +#include "sdl.h" + +#include + +//___________________________________________________________________________________________________________________________________________________________________________________________ +int main(int argc, char **argv) { + //******************************************************************************** + // + // 0. 
Preliminary operations + // + //******************************************************************************** + + // Checking the TRACKLOOPERDIR is set + ana.track_looper_dir_path = gSystem->Getenv("TRACKLOOPERDIR"); + if (ana.track_looper_dir_path.IsNull()) { + RooUtil::error( + "TRACKLOOPERDIR is not set! Did you run $ source setup.sh from TrackLooper/ main repository directory?"); + } + RooUtil::print(TString::Format("TRACKLOOPERDIR=%s", ana.track_looper_dir_path.Data())); + + // Write the command line used to run it + // N.B. This needs to be before the argument parsing as it will change some values + std::vector allArgs(argv, argv + argc); + ana.full_cmd_line = ""; + for (auto &str : allArgs) { + ana.full_cmd_line += TString::Format(" %s", str.c_str()); + } + + //******************************************************************************** + // + // 1. Parsing options + // + //******************************************************************************** + + // cxxopts is just a tool to parse argc, and argv easily + + // Grand option setting + cxxopts::Options options("\n $ sdl", + "\n **********************\n * *\n * " + "Looper *\n * *\n **********************\n"); + + // Read the options + options.add_options()("m,mode", "Run mode (NOT DEFINED)", cxxopts::value()->default_value("5"))( + "i,input", + "Comma separated input file list OR if just a directory is provided it will glob all in the directory BUT must " + "end with '/' for the path", + cxxopts::value()->default_value("muonGun"))( + "t,tree", + "Name of the tree in the root file to open and loop over", + cxxopts::value()->default_value("trackingNtuple/tree"))( + "o,output", "Output file name", cxxopts::value())( + "N,nmatch", "N match for MTV-like matching", cxxopts::value()->default_value("9"))( + "n,nevents", "N events to loop over", cxxopts::value()->default_value("-1"))( + "x,event_index", "specific event index to process", cxxopts::value()->default_value("-1"))( + "g,pdg_id", "The simhit 
pdgId match option (default = 0)", cxxopts::value()->default_value("0"))( + "v,verbose", + "Verbose mode (0: no print, 1: only final timing, 2: object multiplitcity", + cxxopts::value()->default_value("0"))( + "w,write_ntuple", "Write Ntuple", cxxopts::value()->default_value("1"))( + "s,streams", "Set number of streams (default=1)", cxxopts::value()->default_value("1"))( + "d,debug", "Run debug job. i.e. overrides output option to 'debug.root' and 'recreate's the file.")( + "l,lower_level", "write lower level objects ntuple results")("G,gnn_ntuple", "write gnn input variable ntuple")( + "j,nsplit_jobs", "Enable splitting jobs by N blocks (--job_index must be set)", cxxopts::value())( + "I,job_index", + "job_index of split jobs (--nsplit_jobs must be set. index starts from 0. i.e. 0, 1, 2, 3, etc...)", + cxxopts::value())("h,help", "Print help"); + + auto result = options.parse(argc, argv); + + // NOTE: When an option was provided (e.g. -i or --input), then the result.count("