Skip to content

Commit

Permalink
intel_gpu: properly reset metrics
Browse files Browse the repository at this point in the history
Reset the appropriate fields in the context struct when
PAPI_cleanup_eventset() is called.
This corresponds to intel_gpu_update_control_state() being called with
the value 'count' equal to zero.

Also reset the internal metric counts when PAPI_start() is called, so
that the previous values do not persist after the next PAPI_start().

These changes have been tested on the Intel Ponte Vecchio architecture.
  • Loading branch information
dbarry9 committed Oct 3, 2024
1 parent 437cee0 commit 1942be9
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 62 deletions.
52 changes: 0 additions & 52 deletions src/components/intel_gpu/internal/src/GPUMetricHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1230,18 +1230,6 @@ int GPUMetricHandler::EnableTimeBasedStream(uint32_t timePeriod, uint32_t numRep
} else {
DebugPrintError("EnableTimeBasedStream: failed on device [%p], status 0x%x\n",
m_device, status);
if (m_metricStreamer) {
status = zetMetricStreamerCloseFunc(m_metricStreamer);
m_metricStreamer = nullptr;
}
if (m_event) {
status = zeEventDestroyFunc(m_event);
m_event = nullptr;
}
if (m_eventPool) {
status = zeEventPoolDestroyFunc(m_eventPool);
m_eventPool = nullptr;
}
status = zetContextActivateMetricGroupsFunc(m_context, m_device, 0, nullptr);
m_status = COLLECTION_INIT;
ret = 1;
Expand Down Expand Up @@ -1331,22 +1319,6 @@ int GPUMetricHandler::EnableEventBasedQuery()
ret = 0;
} else {
DebugPrintError("EnableEventBasedQuery: failed with status 0x%x, abort.\n", status);
if (m_tracer) {
status = zetTracerExpDestroyFunc(m_tracer);
m_tracer = nullptr;
}
if (m_event) {
status = zeEventDestroyFunc(m_event);
m_event = nullptr;
}
if (m_eventPool) {
status = zeEventPoolDestroyFunc(m_eventPool);
m_eventPool = nullptr;
}
if (m_queryPool) {
status = zetMetricQueryPoolDestroyFunc(m_queryPool);
m_queryPool = nullptr;
}
status = zetContextActivateMetricGroupsFunc(m_context, m_device, 0, nullptr);
m_status = COLLECTION_INIT;
ret = retError;
Expand Down Expand Up @@ -1375,30 +1347,6 @@ GPUMetricHandler::DisableMetricGroup()
}
m_status = COLLECTION_DISABLED;

if (m_groupType == ZET_METRIC_GROUP_SAMPLING_TYPE_FLAG_TIME_BASED) {
if (m_metricStreamer) {
zetMetricStreamerCloseFunc(m_metricStreamer);
m_metricStreamer = nullptr;
}
}
if (m_groupType == ZET_METRIC_GROUP_SAMPLING_TYPE_FLAG_EVENT_BASED) {
if (m_tracer) {
zetTracerExpDestroyFunc(m_tracer);
m_tracer = nullptr;
}
if (m_queryPool) {
zetMetricQueryPoolDestroyFunc(m_queryPool);
m_queryPool = nullptr;
}
}
if (m_event) {
zeEventDestroyFunc(m_event);
m_event = nullptr;
}
if (m_eventPool) {
zeEventPoolDestroyFunc(m_eventPool);
m_eventPool = nullptr;
}
zetContextActivateMetricGroupsFunc(m_context, m_device, 0, nullptr);
m_lock.unlock();
return;
Expand Down
62 changes: 52 additions & 10 deletions src/components/intel_gpu/linux_intel_gpu_metrics.c
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,22 @@ addMetricToDevice(uint32_t code, int rootDev) {
return i;
}

/*!
 * @brief Reset metric counts to zero.
 *
 * Scans the context's active_devices index table (num_avail_devices entries)
 * and sends METRIC_RESET through GPUSetMetricControl() to each device whose
 * index falls inside the global active_devices array.
 *
 * @param mContext  Metric context whose active_devices table is scanned.
 */
void
metricReset( MetricContext *mContext )
{
    uint32_t slot = 0;

    while (slot < num_avail_devices) {
        uint32_t devIndex = mContext->active_devices[slot];
        slot++;

        /* Ignore table entries that do not refer to a valid device slot. */
        if (devIndex >= num_active_devices) {
            continue;
        }

        GPUSetMetricControl(active_devices[devIndex].handle, METRIC_RESET);
    }
}



/************************* PAPI Functions **********************************/

Expand Down Expand Up @@ -277,12 +293,40 @@ intel_gpu_update_control_state( hwd_control_state_t *ctl,
(void)ctl;

// use local maintained context,
if (!count ||!native) {
MetricContext *mContext = (MetricContext *)ctx;

/* This check accounts for calls to PAPI_cleanup_eventset(). */
if ( !count ) {
metricReset(mContext);
/* Free devices. */
for (uint32_t i=0; i<mContext->num_devices; i++) {
uint32_t dev_idx = mContext->active_devices[i];
if (dev_idx >= num_active_devices) {
return PAPI_ENOMEM;
}
DeviceContext *dev = &active_devices[dev_idx];
DEVICE_HANDLE handle = dev->handle;
if( !handle ) {
GPUFreeDevice(handle);
}
mContext->active_devices[i] = 0;
}
mContext->num_devices = 0;
/* Free metric slots. */
for (uint32_t midx=0; midx<mContext->num_metrics; midx++) {
mContext->metric_idx[midx] = 0;
mContext->metric_values[midx] = 0;
mContext->dev_ctx_idx[midx] = 0;
mContext->subdev_idx[midx] = 0;
}
mContext->num_metrics = 0;
return PAPI_OK;
}

if ( !native ) {
return PAPI_OK;
}

MetricContext *mContext = (MetricContext *)ctx;

#if defined(_DEBUG)
for (int i=0; i<count; i++) {
GPUDEBUG("\t i=%d, ni_event 0x%x, ni_papi_code 0x%x, ni_position %d, ni_owners %d \n",
Expand Down Expand Up @@ -354,6 +398,9 @@ intel_gpu_start( hwd_context_t * ctx, hwd_control_state_t * ctl )
MetricContext *mContext = (MetricContext *)ctx;

int ret = PAPI_OK;

metricReset(mContext);

if (mContext->num_metrics == 0) {
GPUDEBUG("intel_gpu_start : No metric selected, abort.\n");
return PAPI_EINVAL;
Expand Down Expand Up @@ -521,13 +568,8 @@ intel_gpu_reset( hwd_context_t *ctx, hwd_control_state_t *ctl)
GPUDEBUG("Entering intel_gpu_reset\n");
MetricContext *mContext = (MetricContext *)ctx;

for (uint32_t i=0; i<num_avail_devices; i++) {
uint32_t dev_idx = mContext->active_devices[i];
if (dev_idx < num_active_devices) {
DeviceContext *dev = &active_devices[dev_idx];
GPUSetMetricControl(dev->handle, METRIC_RESET);
}
}
metricReset(mContext);

return PAPI_OK;
}

Expand Down

0 comments on commit 1942be9

Please sign in to comment.