@@ -641,6 +641,34 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
641641 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
642642 GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
643643
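644+ // Timer tables are sized for a maximum of 4 clusterizer lanes: nnTimers holds 3 entries per lane (indexed 3*lane + {0,1,2}), the other tables 1 entry per lane. A lane index >= 4 would read past the end of these arrays.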
645+ HighResTimer* nnTimers[12] = {
646+ &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0),
647+ &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1),
648+ &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2),
649+ &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3),
650+ &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4),
651+ &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5),
652+ &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6),
653+ &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7),
654+ &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8),
655+ &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9),
656+ &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10),
657+ &getTimer<GPUTPCNNClusterizer , 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11)
658+ };
659+ HighResTimer* nnFillInputTimers[4] {
660+ &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_fillInputNNSingleElement_0_", 0),
661+ &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_fillInputNNSingleElement_1_", 1),
662+ &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_fillInputNNSingleElement_2_", 2),
663+ &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3)
664+ };
665+ HighResTimer* nnPublishingTimers[4] {
666+ &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_publish_0_", 0),
667+ &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_publish_1_", 1),
668+ &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_publish_2_", 2),
669+ &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_publish_3_", 3)
670+ };
671+
644672 if (GetProcessingSettings().nn.applyNNclusterizer) {
645673 int32_t deviceId = -1;
646674 int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
@@ -1001,7 +1029,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10011029 size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
10021030
10031031 // auto start0 = std::chrono::high_resolution_clock::now();
1032+ if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Start(); }
10041033 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data
1034+ if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Stop(); }
10051035
10061036 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
10071037 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data
@@ -1011,6 +1041,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10111041 // auto start1 = std::chrono::high_resolution_clock::now();
10121042
10131043 // NN evaluations
1044+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Start(); }
10141045 if (clustererNNShadow.mNnInferenceInputDType == 0) {
10151046 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10161047 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
@@ -1024,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10241055 (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
10251056 }
10261057 }
1058+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Stop(); }
10271059 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1060+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Start(); }
10281061 if (clustererNNShadow.mNnInferenceInputDType == 0) {
10291062 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10301063 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1038,7 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10381071 (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
10391072 }
10401073 }
1074+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Stop(); }
10411075 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1076+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Start(); }
10421077 if (clustererNNShadow.mNnInferenceInputDType == 0) {
10431078 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10441079 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1052,12 +1087,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10521087 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
10531088 }
10541089 }
1090+ if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Stop(); }
10551091 }
10561092 }
10571093
10581094 // auto stopNNs = std::chrono::high_resolution_clock::now();
10591095
10601096 // Publishing kernels
1097+ if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); }
10611098 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
10621099 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
10631100 } else {
@@ -1069,6 +1106,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10691106 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
10701107 }
10711108 }
1109+ if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); }
10721110
10731111 // for(int i = 0; i < iSize; ++i) {
10741112 // if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) {
@@ -1090,7 +1128,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10901128 }
10911129 if (clustererNNShadow.mNnClusterizerUseCfRegression) {
10921130 // auto start1 = std::chrono::high_resolution_clock::now();
1131+ if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); }
10931132 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1133+ if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); }
10941134 // auto stop1 = std::chrono::high_resolution_clock::now();
10951135 // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
10961136 }
0 commit comments