From fda05d6c69d9093001888365a82c39c06e5fc541 Mon Sep 17 00:00:00 2001
From: Maxim Fedorov
Date: Tue, 17 Jan 2023 17:13:12 -0800
Subject: [PATCH] erlperf: fixes and updates to 2.1.0

The major change is detecting lock contention and falling back to a busy
wait loop in the primary (benchmarking) process. This does not reduce
total throughput much: the CPU is not actually busy under lock contention,
so spinning in the primary process does not introduce much of a problem.

Changes:
 * bumped argparse to 1.2.4
 * fixed -w (--warmup) argument missing from command line
 * synchronised worker startup when adding concurrency
 * concurrent worker shutdown when reducing concurrency
 * elevated job & benchmark process priority to avoid result skew
 * implemented scheduling problem detection (e.g. lock contention),
   added a busy loop method workaround
 * backed out from erlef GitHub Actions CI to keep OTP 23 support
---
 .github/workflows/erlang.yml    |  13 ++--
 CHANGELOG.md                    |   8 +++
 README.md                       |  11 +++
 rebar.lock                      |   6 +-
 src/erlperf.app.src             |   2 +-
 src/erlperf.erl                 | 114 +++++++++++++++++++++++++-------
 src/erlperf_app.erl             |   2 +-
 src/erlperf_cli.erl             |   4 +-
 src/erlperf_cluster_monitor.erl |   2 +-
 src/erlperf_file_log.erl        |   2 +-
 src/erlperf_history.erl         |   2 +-
 src/erlperf_job.erl             |  68 ++++++++++++++-----
 src/erlperf_job_sup.erl         |   2 +-
 src/erlperf_monitor.erl         |   2 +-
 src/erlperf_sup.erl             |   2 +-
 test/erlperf_SUITE.erl          |  32 +++++++--
 test/erlperf_file_log_SUITE.erl |   2 +-
 test/erlperf_history_SUITE.erl  |   2 +-
 test/erlperf_job_SUITE.erl      |   4 +-
 test/erlperf_monitor_SUITE.erl  |   2 +-
 20 files changed, 213 insertions(+), 69 deletions(-)

diff --git a/.github/workflows/erlang.yml b/.github/workflows/erlang.yml
index 035a551..ab838e5 100644
--- a/.github/workflows/erlang.yml
+++ b/.github/workflows/erlang.yml
@@ -11,16 +11,17 @@ jobs:
   linux:
     name: Test on OTP ${{ matrix.otp_version }} and ${{ matrix.os }}
     runs-on: ${{ matrix.os }}
+
     strategy:
       matrix:
-        otp_version: ['23', '24', '25']
+        otp_version: [23, 24, 25]
         os: [ubuntu-latest]
+
+    container:
+      image: erlang:${{ matrix.otp_version }}
+
     steps:
       - uses: actions/checkout@v2
-      - uses: erlef/setup-beam@v1
-        with:
-          otp-version: ${{matrix.otp_version}}
-          rebar3-version: '3.17.0'
       - name: Run tests
         run: rebar3 ct
      - name: Generate documentation
@@ -33,4 +34,4 @@ jobs:
         run: rebar3 as prod escriptize
       - shell: bash
         name: Smoke test
-        run: ./erlperf 'timer:sleep(1).'
+        run: ./erlperf 'timer:sleep(1).'
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf77551..f4c71fd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog

+## 2.1.0
+- fixed -w (--warmup) argument missing from command line
+- synchronised worker startup when adding concurrency
+- concurrent worker shutdown when reducing concurrency
+- elevated job & benchmark process priority to avoid result skew
+- implemented scheduling problem detection (e.g. lock contention),
+  added a busy loop method workaround
+
 ## 2.0.2
 - added convenience command line options: init_all, done_all, init_runner_all

diff --git a/README.md b/README.md
index 07c52fc..faa8f98 100644
--- a/README.md
+++ b/README.md
@@ -289,6 +289,17 @@ halved:
 rand:uniform().     1   2771 Ki      72 ns
 ```
+## Benchmarking under lock contention
+ERTS cannot guarantee precise timing when there is severe lock contention
+and scheduler utilisation is at 100%. This often happens with ETS:
+```bash
+    $ ./erlperf -c 50 'ets:insert(ac_tab, {1, 2}).'
+```
+Running 50 concurrent processes trying to overwrite the very same key of an ETS
+table leads to lock contention on a shared resource (the ETS table/bucket lock).
+ + + ## Concurrency test (squeeze) Sometimes it's necessary to measure code running multiple concurrent processes, and find out when it saturates the node. It can be used to diff --git a/rebar.lock b/rebar.lock index 5eec03f..3e81148 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,8 +1,8 @@ {"1.2.0", -[{<<"argparse">>,{pkg,<<"argparse">>,<<"1.2.3">>},0}]}. +[{<<"argparse">>,{pkg,<<"argparse">>,<<"1.2.4">>},0}]}. [ {pkg_hash,[ - {<<"argparse">>, <<"20578A5D86250D7FB1BF6D116054BF707C247395E740BC8FC1384CC49DFB06F7">>}]}, + {<<"argparse">>, <<"A31E7C5D9F8814AFCD9B42C1B98C21DA6F851F93F2C8C00C107F6201668A0A7D">>}]}, {pkg_hash_ext,[ - {<<"argparse">>, <<"BFE94654811112FB86880C47B3600313D8D2281A1CCE4DADA2B09E5C3C9988D3">>}]} + {<<"argparse">>, <<"AC6FDB7183EA20ADEB7EB66E34B21F2E8C4C6925913EE0C0765D339D97009FFE">>}]} ]. diff --git a/src/erlperf.app.src b/src/erlperf.app.src index 31e079a..9ffec3b 100644 --- a/src/erlperf.app.src +++ b/src/erlperf.app.src @@ -1,6 +1,6 @@ {application, erlperf, [{description, "Erlang Performance & Benchmarking Suite"}, - {vsn, "2.0.2"}, + {vsn, "2.1.0"}, {registered, [ erlperf_sup, erlperf_job_sup, erlperf_monitor, erlperf_history, erlperf_file_log, erlperf_cluster_monitor diff --git a/src/erlperf.erl b/src/erlperf.erl index 493beb4..b0ad011 100644 --- a/src/erlperf.erl +++ b/src/erlperf.erl @@ -1,4 +1,4 @@ -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% @doc %%% Application API. Benchmark/squeeze implementation. %%% @end @@ -43,6 +43,11 @@ %% experimental feature allowing to benchmark processes with %% wildly jumping throughput cv => float(), + %% sets the supervising process priority. Default is 'high', + %% allowing test runner to be scheduled even under high scheduler + %% utilisation (which is expected for a benchmark running many + %% concurrent processes) + priority => erlang:priority_level(), %% report form for single benchmark: when set to 'extended', %% all non-warmup samples are returned as a list. %% When missing, only the average QPS is returned. @@ -113,12 +118,16 @@ benchmark(Codes, #{isolation := _Isolation} = Options, ConOpts) -> %% no isolation requested, do normal in-BEAM test benchmark(Codes, Options, ConOpts) -> - {Jobs, Samples, Monitors} = start_jobs(Codes, []), + %% elevate priority to reduce timer skew + SetPrio = maps:get(priority, Options, high), + PrevPriority = process_flag(priority, SetPrio), + Jobs = start_jobs(Codes, []), + {JobPids, Samples, _} = lists:unzip3(Jobs), try - benchmark(Jobs, Options, ConOpts, Samples) + benchmark(JobPids, Options, ConOpts, Samples) after - [erlang:demonitor(Mon, [flush]) || Mon <- Monitors], - [(catch gen:stop(Pid)) || Pid <- Jobs] + stop_jobs(Jobs), + process_flag(priority, PrevPriority) end. %% @doc @@ -249,7 +258,7 @@ stop_nodes(Peers, _Nodes) -> [peer:stop(Peer) || Peer <- Peers]. start_jobs([], Jobs) -> - lists:unzip3(lists:reverse(Jobs)); + lists:reverse(Jobs); start_jobs([Code | Codes], Jobs) -> try {ok, Pid} = erlperf_job:start(Code), @@ -258,10 +267,18 @@ start_jobs([Code | Codes], Jobs) -> start_jobs(Codes, [{Pid, Sample, MonRef} | Jobs]) catch Class:Reason:Stack -> %% stop jobs that were started - [(catch gen:stop(Pid)) || {Pid, _, _} <- Jobs], + stop_jobs(Jobs), erlang:raise(Class, Reason, Stack) end. 
+stop_jobs(Jobs) -> + %% do not use gen:stop/1,2 or sys:terminate/2,3 here, as they spawn process running + %% with normal priority, and they don't get scheduled fast enough when there is severe + %% lock contention + WaitFor = [begin erlperf_job:request_stop(Pid), {Pid, Mon} end || {Pid, _, Mon} <- Jobs, is_process_alive(Pid)], + %% now wait for all monitors to fire + [receive {'DOWN', Mon, process, Pid, _R} -> ok end || {Pid, Mon} <- WaitFor]. + -define(DEFAULT_SAMPLE_DURATION, 1000). %% low-overhead benchmark @@ -291,19 +308,34 @@ benchmark(Jobs, Options, ConOpts, Handles) -> %% * (optional) for the last 'samples' cycles coefficient of variation did not exceed 'cv' perform_benchmark(Jobs, Handles, Options) -> Interval = maps:get(sample_duration, Options, ?DEFAULT_SAMPLE_DURATION), - % sleep for "warmup * sample_duration" - timer:sleep(Interval * maps:get(warmup, Options, 0)), - % do at least 'samples' cycles + % warmup: intended to figure out sleep method (whether to apply busy_wait immediately) + NowTime = os:system_time(millisecond), + SleepMethod = warmup(maps:get(warmup, Options, 0), NowTime, NowTime + Interval, Interval, sleep), + % find all options - or take their defaults, TODO: maybe do that at a higher level? + JobMap = maps:from_list([{J, []} || J <- Jobs]), + CV = maps:get(cv, Options, undefined), + SampleCount = maps:get(samples, Options, 3), + Report = maps:get(report, Options, false), + % remember initial counters in Before + StartedAt = os:system_time(millisecond), Before = [[erlperf_job:sample(Handle)] || Handle <- Handles], - JobMap = maps:from_list([{J, []} || J <- Jobs]), %% - Samples = measure_impl(JobMap, Before, Handles, Interval, - maps:get(samples, Options, 3), maps:get(cv, Options, undefined)), - report_benchmark(Samples, maps:get(report, Options, false)). - -measure_impl(_Jobs, Before, _Handles, _Interval, 0, undefined) -> + Samples = measure_impl(JobMap, Before, Handles, StartedAt, StartedAt + Interval, Interval, + SleepMethod, SampleCount, CV), + report_benchmark(Samples, Report). + +%% warmup procedure: figure out if sleep/4 can work without falling back to busy wait +warmup(0, _LastSampleTime, _NextSampleTime, _Interval, Method) -> + Method; +warmup(Count, LastSampleTime, NextSampleTime, Interval, Method) -> + SleepFor = NextSampleTime - LastSampleTime, + NextMethod = sleep(Method, SleepFor, NextSampleTime, #{}), + NowTime = os:system_time(millisecond), + warmup(Count - 1, NowTime, NextSampleTime + Interval, Interval, NextMethod). + +measure_impl(_Jobs, Before, _Handles, _LastSampleTime, _NextSampleTime, _Interval, _SleepMethod, 0, undefined) -> normalise(Before); -measure_impl(Jobs, Before, Handles, Interval, 0, CV) -> +measure_impl(Jobs, Before, Handles, LastSampleTime, NextSampleTime, Interval, SleepMethod, 0, CV) -> %% Complication: some jobs may need a long time to stabilise compared to others. %% Decision: wait for all jobs to stabilise (could wait for just one?) 
case @@ -323,18 +355,52 @@ measure_impl(Jobs, Before, Handles, Interval, 0, CV) -> true -> % imitate queue - drop last sample, push another in the head TailLess = [lists:droplast(L) || L <- Before], - measure_impl(Jobs, TailLess, Handles, Interval, 1, CV) + measure_impl(Jobs, TailLess, Handles, LastSampleTime, NextSampleTime + Interval, + Interval, SleepMethod, 1, CV) end; -measure_impl(Jobs, Before, Handles, Interval, Count, CV) -> +%% LastSampleTime: system time of the last sample +%% NextSampleTime: system time when to take the next sample +%% Interval: to calculate the next NextSampleTime +%% Count: how many more samples to take +%% CV: coefficient of variation +measure_impl(Jobs, Before, Handles, LastSampleTime, NextSampleTime, Interval, SleepMethod, Count, CV) -> + SleepFor = NextSampleTime - LastSampleTime, + NextSleepMethod = sleep(SleepMethod, SleepFor, NextSampleTime, Jobs), + NowTime = os:system_time(millisecond), + Counts = [erlperf_job:sample(Handle) || Handle <- Handles], + measure_impl(Jobs, merge(Counts, Before), Handles, NowTime, NextSampleTime + Interval, Interval, + NextSleepMethod, Count - 1, CV). + +%% ERTS real-time properties are easily broken by lock contention (e.g. ETS misuse) +%% When it happens, even the 'max' priority process may not run for an extended +%% period of time. + +sleep(sleep, SleepFor, _WaitUntil, Jobs) when SleepFor > 0 -> receive {'DOWN', _Ref, process, Pid, Reason} when is_map_key(Pid, Jobs) -> erlang:error({benchmark, {'EXIT', Pid, Reason}}) - after Interval-> - ok - end, - Counts = [erlperf_job:sample(Handle) || Handle <- Handles], - measure_impl(Jobs, merge(Counts, Before), Handles, Interval, Count - 1, CV). + after SleepFor -> + sleep + end; +sleep(_Mode, _SleepFor, WaitUntil, Jobs) -> + busy_wait(WaitUntil, Jobs). + +%% When sleep detects significant difference in the actual sleep time vs. expected, +%% loop is switched to the busy wait. +%% Once switched to busy wait, erlperf stays there until the end of the test. +busy_wait(WaitUntil, Jobs) -> + receive + {'DOWN', _Ref, process, Pid, Reason} when is_map_key(Pid, Jobs) -> + erlang:error({benchmark, {'EXIT', Pid, Reason}}) + after 0 -> + case os:system_time(millisecond) of + Now when Now > WaitUntil -> + busy_wait; + _ -> + busy_wait(WaitUntil, Jobs) + end + end. merge([], []) -> []; diff --git a/src/erlperf_app.erl b/src/erlperf_app.erl index 4a62d45..8872ba3 100644 --- a/src/erlperf_app.erl +++ b/src/erlperf_app.erl @@ -1,4 +1,4 @@ -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% @doc %%% Continuous benchmarking application behaviour. %%% @end diff --git a/src/erlperf_cli.erl b/src/erlperf_cli.erl index 04634be..bd7c2e5 100644 --- a/src/erlperf_cli.erl +++ b/src/erlperf_cli.erl @@ -1,4 +1,4 @@ -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% @doc %%% Command line interface adapter. 
 %%% @end
@@ -37,7 +37,7 @@ main(Args) ->
                 RunOpts0;
             {ok, Str} ->
                 [erlang:error({loop, Option}) || Option <-
-                    [concurrency, sample_duration, samples, waarmup, cv], is_map_key(Option, RunOpts0)],
+                    [concurrency, sample_duration, samples, warmup, cv], is_map_key(Option, RunOpts0)],
                 RunOpts0#{loop => parse_loop(Str)}
         end,

diff --git a/src/erlperf_cluster_monitor.erl b/src/erlperf_cluster_monitor.erl
index 74dada9..cbc392a 100644
--- a/src/erlperf_cluster_monitor.erl
+++ b/src/erlperf_cluster_monitor.erl
@@ -1,4 +1,4 @@
-%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @copyright (C) 2019-2023, Maxim Fedorov
 %%% @doc
 %%% Logs monitoring events for the entire cluster, to file or device.
 %%% Requires erlperf_history service running, fails otherwise.
diff --git a/src/erlperf_file_log.erl b/src/erlperf_file_log.erl
index 996e229..8904917 100644
--- a/src/erlperf_file_log.erl
+++ b/src/erlperf_file_log.erl
@@ -1,4 +1,4 @@
-%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @copyright (C) 2019-2023, Maxim Fedorov
 %%% @doc
 %%% Writes monitoring events to I/O device.
 %%% @end
diff --git a/src/erlperf_history.erl b/src/erlperf_history.erl
index a4dd707..2996c88 100644
--- a/src/erlperf_history.erl
+++ b/src/erlperf_history.erl
@@ -1,4 +1,4 @@
-%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @copyright (C) 2019-2023, Maxim Fedorov
 %%% @doc
 %%% Collects, accumulates & filters cluster-wide monitoring events.
 %%% Essentially a simple in-memory database for quick cluster overview.
diff --git a/src/erlperf_job.erl b/src/erlperf_job.erl
index e38e948..28c21f2 100644
--- a/src/erlperf_job.erl
+++ b/src/erlperf_job.erl
@@ -1,4 +1,4 @@
-%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @copyright (C) 2019-2023, Maxim Fedorov
 %%% @doc
 %%% Job runner, taking care of init/done, workers added and removed.
 %%% Works just like a simple_one_for_one supervisor (children are
@@ -31,12 +31,14 @@
 -export([
     start/1,
     start_link/1,
+    request_stop/1,
     concurrency/1,
     set_concurrency/2,
     measure/2,
     sample/1,
     handle/1,
-    source/1
+    source/1,
+    set_priority/2
 ]).

 %% gen_server callbacks
@@ -89,6 +91,13 @@ start(#{runner := _MustHave} = Code) ->
 start_link(#{runner := _MustHave} = Code) ->
     gen_server:start_link(?MODULE, generate(Code), []).

+%% @doc
+%% Requests this job to stop. Caller should monitor the job process
+%% to find out when the job has actually stopped.
+-spec request_stop(server_ref()) -> ok.
+request_stop(JobId) ->
+    gen_server:cast(JobId, stop).
+
 %% @doc
 -spec concurrency(server_ref()) -> Concurrency :: non_neg_integer().
 concurrency(JobId) ->
@@ -96,10 +105,11 @@ concurrency(JobId) ->

 %% @doc
 %% Change concurrency setting for this job.
-%% Does not reset counting.
+%% Does not reset counting. May never return if init_runner
+%% does not return.
 -spec set_concurrency(server_ref(), non_neg_integer()) -> ok.
 set_concurrency(JobId, Concurrency) ->
-    gen_server:call(JobId, {set_concurrency, Concurrency}).
+    gen_server:call(JobId, {set_concurrency, Concurrency}, infinity).

 %% @doc
 %% Executes the runner SampleCount times, returns time in microseconds it
@@ -129,6 +139,16 @@ sample({Module, Arity}) ->
 source(JobId) ->
     gen_server:call(JobId, source).

+%% @doc
+%% Sets the job process priority to use when workers are running.
+%% Worker processes may utilise all schedulers, causing the job
+%% process to lose control over starting and stopping workers.
+%% By default, the job process runs at 'high' priority when there
+%% are any workers running.
+%% Returns the previous setting.
+-spec set_priority(server_ref(), erlang:priority_level()) -> erlang:priority_level(). +set_priority(JobId, Priority) -> + gen_server:call(JobId, {priority, Priority}). %%-------------------------------------------------------------------- %% Internal definitions @@ -156,7 +176,11 @@ source(JobId) -> %% continuous workers workers = [] :: [pid()], %% temporary workers (for sample_count call) - sample_workers = #{} :: #{pid() => {pid(), reference()}} + sample_workers = #{} :: #{pid() => {pid(), reference()}}, + %% priority to return to when no workers left + initial_priority :: erlang:priority_level(), + %% priority to set when workers are running + priority = high :: erlang:priority_level() }). -type state() :: #erlperf_job_state{}. @@ -182,7 +206,8 @@ init(#exec{name = Mod, binary = Bin, init = Init, runner = {_Fun, Arity}} = Exec ok = erlperf_monitor:register(self(), {Mod, Arity}, 0), %% start tracing this module runner function 1 = erlang:trace_pattern({Mod, Mod, Arity}, true, [local, call_count]), - {ok, #erlperf_job_state{exec = Exec, init_result = InitRet}}. + {priority, Prio} = erlang:process_info(self(), priority), + {ok, #erlperf_job_state{exec = Exec, init_result = InitRet, initial_priority = Prio}}. -spec handle_call(term(), {pid(), reference()}, state()) -> {reply, term(), state()}. handle_call(handle, _From, #erlperf_job_state{exec = #exec{name = Name, runner = {_Fun, Arity}}} = State) -> @@ -203,11 +228,14 @@ handle_call({measure, _SampleCount}, _From, #erlperf_job_state{} = State) -> handle_call(source, _From, #erlperf_job_state{exec = #exec{source = Source}} = State) -> {reply, Source, State}; +handle_call({priority, Prio}, _From, #erlperf_job_state{priority = Old} = State) -> + {reply, Old, State#erlperf_job_state{priority = Prio}}; + handle_call({set_concurrency, Concurrency}, _From, #erlperf_job_state{workers = Workers} = State) -> {reply, ok, State#erlperf_job_state{workers = set_concurrency_impl(length(Workers), Concurrency, State)}}. -handle_cast(_Request, _State) -> - error(notsup). +handle_cast(stop, State) -> + {stop, normal, State}. -spec handle_info(term(), state()) -> {noreply, state()}. handle_info({'EXIT', SampleWorker, Reason}, @@ -254,19 +282,26 @@ start_sample_count(SampleCount, ReplyTo, InitRunner, InitRet, {SampleRunner, _}) ), #{Child => ReplyTo}. -set_concurrency_impl(OldConcurrency, Concurrency, #erlperf_job_state{workers = Workers, init_result = IR, exec = Exec}) -> +set_concurrency_impl(OldConcurrency, Concurrency, #erlperf_job_state{workers = Workers, init_result = IR, exec = Exec, + priority = Prio, initial_priority = InitialPrio}) -> case Concurrency - OldConcurrency of 0 -> Workers; NeedMore when NeedMore > 0 -> + %% this process must run with higher priority to avoid being de-scheduled by runners + OldConcurrency =:= 0 andalso erlang:process_flag(priority, Prio), Workers ++ add_workers(NeedMore, Exec, IR, []); NeedLess -> - remove_workers(NeedLess, Workers) + {Fire, Keep} = lists:split(-NeedLess, Workers), + stop_workers(Fire), + Keep =:= [] andalso erlang:process_flag(priority, InitialPrio), + Keep end. add_workers(0, _ExecMap, _InitRet, NewWorkers) -> %% ensure all new workers completed their InitRunner routine [receive {Worker, init_runner} -> ok end || Worker <- NewWorkers], + [Worker ! 
go || Worker <- NewWorkers], NewWorkers; add_workers(More, #exec{init_runner = InitRunner, runner = {Runner, _RunnerArity}} = Exec, InitRet, NewWorkers) -> Control = self(), @@ -275,18 +310,15 @@ add_workers(More, #exec{init_runner = InitRunner, runner = {Runner, _RunnerArity fun () -> %% need to send a message even if init_runner fails, hence 'after' IRR = try InitRunner(InitRet) after Control ! {self(), init_runner} end, + receive go -> ok end, Runner(IRR) end), add_workers(More - 1, Exec, InitRet, [Worker | NewWorkers]). -remove_workers(0, Workers) -> - Workers; -remove_workers(Less, [Worker | Workers]) -> - exit(Worker, kill), - receive - {'EXIT', Worker, _Reason} -> - remove_workers(Less + 1, Workers) - end. +stop_workers(Workers) -> + %% try to stop concurrently + [exit(Worker, kill) || Worker <- Workers], + [receive {'EXIT', Worker, _Reason} -> ok end || Worker <- Workers]. %%%=================================================================== %%% Internal: code generation diff --git a/src/erlperf_job_sup.erl b/src/erlperf_job_sup.erl index 2ff9ff1..87644db 100644 --- a/src/erlperf_job_sup.erl +++ b/src/erlperf_job_sup.erl @@ -1,4 +1,4 @@ -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% Supervises statically started jobs. %%% @end -module(erlperf_job_sup). diff --git a/src/erlperf_monitor.erl b/src/erlperf_monitor.erl index 2c5516d..b9d4dc2 100644 --- a/src/erlperf_monitor.erl +++ b/src/erlperf_monitor.erl @@ -1,5 +1,5 @@ %%%------------------------------------------------------------------- -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% @doc %%% System monitor: scheduler, RAM, and benchmarks throughput %%% samples. diff --git a/src/erlperf_sup.erl b/src/erlperf_sup.erl index 64be90f..246ae75 100644 --- a/src/erlperf_sup.erl +++ b/src/erlperf_sup.erl @@ -1,4 +1,4 @@ -%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @copyright (C) 2019-2023, Maxim Fedorov %%% @doc %%% Top-level supervisor. Always starts process group scope %%% for `erlperf'. Depending on the configuration starts diff --git a/test/erlperf_SUITE.erl b/test/erlperf_SUITE.erl index f79ac76..4c3d2d9 100644 --- a/test/erlperf_SUITE.erl +++ b/test/erlperf_SUITE.erl @@ -1,6 +1,6 @@ %%%------------------------------------------------------------------- %%% @author Maxim Fedorov -%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @copyright (c) 2019-2023 Maxim Fedorov %%% @doc %%% Tests benchmark module, machine-readable output for benchmarks. %%% @end @@ -28,6 +28,8 @@ crasher/0, crasher/1, undefer/0, undefer/1, compare/1, errors/0, errors/1]). +-export([overhead_benchmark/0, overhead_benchmark/1]). + -export([ cmd_line_simple/1, cmd_line_verbose/1, cmd_line_compare/1, cmd_line_usage/1, cmd_line_init/1, @@ -70,7 +72,10 @@ groups() -> compare, errors ]}, - {cmdline, [sequential], [ + {overhead, [], [ + overhead_benchmark + ]}, + {cmdline, [], [ cmd_line_simple, cmd_line_verbose, cmd_line_compare, @@ -87,13 +92,13 @@ groups() -> {squeeze, [], [ mfa_squeeze ]}, - {replay, [sequential], [ + {replay, [], [ replay ]} ]. all() -> - [{group, benchmark}, {group, cmdline}, {group, squeeze}, {group, replay}]. + [{group, benchmark}, {group, overhead}, {group, cmdline}, {group, squeeze}, {group, replay}]. %%-------------------------------------------------------------------- %% Helpers: gen_server implementation @@ -284,6 +289,25 @@ mfa_squeeze(Config) when is_list(Config) -> ?assert(CPU > 1) end. 
+overhead_benchmark() -> + [{doc, "Ensures that benchmarking overhead when running multiple concurrent processes is not too high"}, + {timetrap, {seconds, 20}}]. + +overhead_benchmark(Config) when is_list(Config) -> + Init = fun() -> ets:new(tab, [public, named_table]) end, + Done = fun(Tab) -> ets:delete(Tab) end, + Runner = fun() -> true = ets:insert(tab, {key, value}) end, %% this inevitably causes lock contention + %% take 10 samples of 100 ms, which should complete in about a second, or at least less than two + %% extra 3 samples of 100 ms are taken for 'warmup' cycles, to calibrate the sleep/4 function, that + %% is expected to switch into busy_wait mode. + Before = os:system_time(millisecond), + QPS = erlperf:run(#{runner => Runner, init => Init, done => Done}, + #{concurrency => 50, samples => 10, sample_duration => 100, warmup => 3}), + TimeSpent = os:system_time(millisecond) - Before, + ?assert(QPS > 0, {qps, QPS}), + ?assert(TimeSpent > 500, {too_quick, TimeSpent, expected, 1000}), + ?assert(TimeSpent < 2000, {too_slow, TimeSpent, expected, 1000}). + %%-------------------------------------------------------------------- %% command-line testing diff --git a/test/erlperf_file_log_SUITE.erl b/test/erlperf_file_log_SUITE.erl index 4e85132..2730898 100644 --- a/test/erlperf_file_log_SUITE.erl +++ b/test/erlperf_file_log_SUITE.erl @@ -1,4 +1,4 @@ -%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @copyright (c) 2019-2023 Maxim Fedorov %%% @doc %%% Tests erlperf_file_log %%% @end diff --git a/test/erlperf_history_SUITE.erl b/test/erlperf_history_SUITE.erl index 0c1ad28..f339785 100644 --- a/test/erlperf_history_SUITE.erl +++ b/test/erlperf_history_SUITE.erl @@ -1,4 +1,4 @@ -%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @copyright (c) 2019-2023 Maxim Fedorov %%% @doc %%% Tests combination of erlperf_monitor, erlperf_cluster_monitor, %%% erlperf_history and erlperf_job. This is an integration test diff --git a/test/erlperf_job_SUITE.erl b/test/erlperf_job_SUITE.erl index e708b50..7170e65 100644 --- a/test/erlperf_job_SUITE.erl +++ b/test/erlperf_job_SUITE.erl @@ -1,4 +1,4 @@ -%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @copyright (c) 2019-2023 Maxim Fedorov %%% @doc %%% Tests all combinations of code maps accepted by erlperf_job %%% @end @@ -47,10 +47,12 @@ overhead(Config) when is_list(Config) -> %% special case: code with no init/state/MFA/fun/... can be optimised %% to a simple "for-loop" and get executed with the lease overhead {ok, Job} = erlperf_job:start_link(#{runner => "rand:uniform(1000)."}), + high = erlperf_job:set_priority(Job, max), Sampler = erlperf_job:handle(Job), TimeUs = erlperf_job:measure(Job, SampleCount), %% measure the same thing now with continuous benchmark ok = erlperf_job:set_concurrency(Job, 1), + {priority, max} = erlang:process_info(Job, priority), %% fetch a sample Start = erlperf_job:sample(Sampler), timer:sleep(1000), diff --git a/test/erlperf_monitor_SUITE.erl b/test/erlperf_monitor_SUITE.erl index 029bed2..11e2edf 100644 --- a/test/erlperf_monitor_SUITE.erl +++ b/test/erlperf_monitor_SUITE.erl @@ -1,6 +1,6 @@ %%%------------------------------------------------------------------- %%% @author Maxim Fedorov -%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @copyright (c) 2019-2023 Maxim Fedorov %%% @doc %%% Tests monitor %%% @end
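
A minimal usage sketch (editorial addition, not part of the patch) of the job-control API this change introduces: `request_stop/1` and `set_priority/2`, used alongside the pre-existing `start/1`, `set_concurrency/2`, `handle/1` and `sample/1`. The runner expression, the concurrency level and the one-second pause are arbitrary illustrations; the return values mirror those asserted in `erlperf_job_SUITE` above.

```erlang
%% Illustrative only: drive a single job through the new 2.1.0 API.
{ok, Job} = erlperf_job:start(#{runner => "rand:uniform(1000)."}),
high = erlperf_job:set_priority(Job, max),   %% returns the previously configured priority
ok = erlperf_job:set_concurrency(Job, 2),    %% job process now runs at 'max' while workers exist
Handle = erlperf_job:handle(Job),
timer:sleep(1000),
Iterations = erlperf_job:sample(Handle),     %% total runner iterations counted so far
MRef = erlang:monitor(process, Job),
ok = erlperf_job:request_stop(Job),          %% asynchronous; the 'DOWN' message confirms shutdown
receive {'DOWN', MRef, process, Job, _Reason} -> ok end.
```

`request_stop/1` is a cast on purpose: as the `stop_jobs/1` comment above explains, `gen:stop/1` spawns a normal-priority helper process that may never get scheduled under severe lock contention, so callers monitor the job and wait for the `'DOWN'` message instead.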