diff --git a/.github/workflows/erlang.yml b/.github/workflows/erlang.yml index 692d05d..947dd3b 100644 --- a/.github/workflows/erlang.yml +++ b/.github/workflows/erlang.yml @@ -13,7 +13,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - otp_version: ['22', '23', '24'] + otp_version: ['23', '24'] os: [ubuntu-latest] steps: - uses: actions/checkout@v2 @@ -21,10 +21,16 @@ jobs: with: otp-version: ${{matrix.otp_version}} rebar3-version: '3.17.0' - - name: Compile - run: rebar3 compile - name: Run tests - run: rebar3 do edoc,ct + run: rebar3 ct + - name: Generate documentation + run: rebar3 edoc - shell: bash name: Dialyzer run: rebar3 dialyzer + - shell: bash + name: Escriptize + run: rebar3 as prod escriptize + - shell: bash + name: Smoke test + run: ./erlperf 'timer:sleep(1).' diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..c1c73e5 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,40 @@ +stages: + - test + - deploy + +test-default-docker: + tags: + - linux + - x86_64 + image: ${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}/erlang:latest + stage: test + script: + - rebar3 compile + - rebar3 edoc + - rebar3 dialyzer + - rebar3 ct + - rebar3 as prod escriptize + - ./erlperf 'timer:sleep(1).' + after_script: + - mv "_build/test/logs" ./public + artifacts: + when: always + paths: + - "./public" + expire_in: 3 days + reports: + junit: + - "./public/last/junit_report.xml" + +# Pages: publishing Common Test results +pages: + stage: deploy + needs: + - test-default-docker + script: + - echo "Uploading to pages" + artifacts: + paths: + - public + rules: + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH \ No newline at end of file diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..3ba4223 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,32 @@ +# Changelog + +## 2.0 +- incompatible change: `erlperf` requires runner arity to be defined explicitly. + Code example: `erlperf:run(#{runner => {timer, sleep, []}, init_runner => "1."})`, + with `erlperf` making a guess that `init_runner` is defined, therefore its return + value can be passed as the argument to `timer:sleep/1`. This behaviour was confusing + and is no longer supported. +- incompatible change: crashed runner causes entire job to stop (error contains the + reason and stack trace) +- incompatible change: removed fprof/profiling support in favour of JIT + `perf` +- `erlperf` application is no longer required to be started for one-off benchmark runs + +## 1.1.5: +- support for OTP 25 (peer replacing slave) + +## 1.1.4: +- fixed an issue with pg already started +- moved profiling to spawned process + +## 1.1.3: +- addressed deprecation, updated to argparse 1.1.4 + +## 1.1.2: +- updated command line parser to new argparse + +## 1.1.1: +- added support for OTP 24 +- added edoc documentation + +## 1.0.0: +- initial release diff --git a/README.md b/README.md index 67bb32f..63358da 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,21 @@ Build: Find out how many times per second a function can be run (beware of shell escaping your code!): ```bash - $ ./erlperf 'timer:sleep(1).' + $ ../erlperf 'rand:uniform().' Code || QPS Rel - timer:sleep(1). 1 500 100% + rand:uniform(). 1 9798 Ki 100% ``` -Run erlperf with two concurrent samples of code +Run four processes executing `rand:uniform()` in a tight loop, and see that code is indeed +concurrent: +```bash + $ ./erlperf 'rand:uniform().' -c 4 + Code || QPS Rel + rand:uniform(). 
4 28596 Ki 100% +``` + +Benchmark one function vs another, taking average of 10 seconds and skipping first second: ```bash $ ./erlperf 'rand:uniform().' 'crypto:strong_rand_bytes(2).' --samples 10 --warmup 1 @@ -30,39 +38,67 @@ Run erlperf with two concurrent samples of code crypto:strong_rand_bytes(2). 1 1485 Ki 35% ``` -Or just measure how concurrent your code is (example below shows saturation with only 1 process): +Run a function passing the state into the next iteration. This code demonstrates performance difference +between `rand:uniform_s` with state passed explicitly, and `rand:uniform` reading state from the process +dictionary: + +```bash + $ ./erlperf 'r(_Init, S) -> {_, NS} = rand:uniform_s(S), NS.' --init_runner 'rand:seed(exsss).' 'r() -> rand:uniform().' + Code || QPS Time Rel + r(_Init, S) -> {_, NS} = rand:uniform_s(S), NS. 1 20272 Ki 49 ns 100% + r() -> rand:uniform(). 1 15081 Ki 66 ns 74% +``` + +Squeeze mode: +measure how concurrent your code is. In the example below, `code:is_loaded/1` is implemented as +`gen_server:call`, and all calculations are done in a single process. It is still possible to +squeeze a bit more from a single process by putting work into the queue from multiple runners, +therefore the example may show higher concurrency. ```bash $ ./erlperf 'code:is_loaded(local_udp).' --init 'code:ensure_loaded(local_udp).' --squeeze Code || QPS code:is_loaded(local_udp). 1 614 Ki ``` - -If you need some initialisation done before running the test, and clean up after: + +Start a server (`pg` scope in this example), use it in benchmark, and shut down after: ```bash - $ ./erlperf 'pg:join(scope, self()), pg:leave(scope, self()).' --init 'pg:start_link(scope).' --done 'gen_server:stop(scope).' - Code || QPS Rel - pg:join(scope, self()), pg:leave(scope, self()). 1 287 Ki 100% + $ ./erlperf 'pg:join(scope, group, self()), pg:leave(scope, group, self()).' --init 'pg:start_link(scope).' --done 'gen_server:stop(scope).' + Code || QPS Rel + pg:join(scope, group, self()), pg:leave(scope, group, self()). 1 321 Ki 100% ``` -Run two versions of code against each other, with some init prior: +Run the same code with different arguments, returned from `init_runner` function: ```bash - $ ./erlperf 'runner(X) -> timer:sleep(X).' --init '1.' 'runner(Y) -> timer:sleep(Y).' --init '2.' - Code || QPS Rel - pg:join(scope, self()), pg:leave(scope, self()). 1 287 Ki 100% + $ ./erlperf 'runner(X) -> timer:sleep(X).' --init_runner '1.' 'runner(X) -> timer:sleep(X).' --init_runner '2.' + Code || QPS Rel + runner(X) -> timer:sleep(X). 1 492 100% + runner(Y) -> timer:sleep(Y). 1 331 67% ``` -Determine how well pg2 (removed in OTP 24) is able to have concurrent group modifications when there are no nodes in the cluster: +Determine how many times a process can join/leave pg2 group on a single node: ```bash - $ ./erlperf 'runner(Arg) -> ok = pg2:join(Arg, self()), ok = pg2:leave(Arg, self()).' --init_runner 'G = {foo, rand:uniform(10000)}, pg2:create(G), G.' -q - Code || QPS - runner(Arg) -> ok = pg2:join(Arg, self()), ok = pg2:leave(Arg, 13 76501 + $ ./erlperf 'ok = pg2:join(g, self()), ok = pg2:leave(g, self()).' --init 'pg2:create(g).' + Code || QPS Rel + ok = pg2:join(g, self()), ok = pg2:leave(g, self()). 1 64253 100% +``` + +Compare `pg` with `pg2` running two nodes (note the `-i` argument spawning an extra node to +run benchmark in): + +```bash + ./erlperf 'ok = pg2:join(g, self()), ok = pg2:leave(g, self()).' --init 'pg2:create(g).' 'ok = pg:join(g, self()), ok = pg:leave(g, self()).' 
--init 'pg:start(pg).' -i + Code || QPS Rel + ok = pg:join(g, self()), ok = pg:leave(g, self()). 1 235 Ki 100% + ok = pg2:join(g, self()), ok = pg2:leave(g, self()). 1 1817 0% ``` - -Watch the progress of your test running (use -v option): + +Watch the progress of your test running (use -v option) with extra information: scheduler utilisation, dirty CPU & IO +schedulers, number of running processes, ports, ETS tables, and memory consumption. Last column is the job throughput. +When there are multiple jobs, multiple columns are printed. ```bash $ ./erlperf 'rand:uniform().' -q -v @@ -77,97 +113,125 @@ Watch the progress of your test running (use -v option): Code || QPS rand:uniform(). 8 14918 Ki ``` - Command-line benchmarking does not save results anywhere. It is designed to provide a quick answer to the question -"is that piece of code faster". +"is that piece of code faster". -# Continuous benchmarking -Starting erlperf as an Erlang application enables continuous benchmarking. This process is designed to help during -development and testing stages, allowing to quickly notice performance regressions. - -Usage example (assuming you're running an OTP release, or rebar3 shell for your application): +## Minimal overhead mode +Since 2.0, `erlperf` includes "low overhead" mode. It cannot be used for continuous benchmarking. In this mode +runner code is executed specified amount of times in a tight loop: + +```bash + ./erlperf 'rand:uniform().' 'rand:uniform(1000).' -l 10M + Code || QPS Time Rel + rand:uniform(). 1 16319 Ki 61 ns 100% + rand:uniform(1000). 1 15899 Ki 62 ns 97% +``` + +This mode effectively runs following code: `loop(0) -> ok; loop(Count) -> rand:uniform(), loop(Count - 1).` +Continuous mode adds 1-2 ns to each iteration. + +# Benchmarking existing application +`erlperf` can be used to measure performance of your application running in production, or code that is stored +on disk. + +## Running with existing codebase +Use `-pa` argument to add extra code path. Example: +```bash + $ ./erlperf 'argparse:parse([], #{}).' -pa _build/test/lib/argparse/ebin + Code || QPS Rel + argparse:parse([], #{}). 1 859 Ki 100% +``` +If you need to add multiple released applications, supply `ERL_LIBS` environment variable instead: ```bash + $ ERL_LIBS="_build/test/lib" erlperf 'argparse:parse([], #{}).' + Code || QPS Rel + argparse:parse([], #{}). 1 843 Ki 100% +``` - $ rebar3 shell --sname mynode +## Usage in production +It is possible to use `erlperf` to benchmark a running application (even in production, assuming necessary safety +precautions). To achieve this, add `erlperf` as a dependency, and use remote shell: + +```bash + # run a mock production node with `erl -sname production` + # connect a remote shell to the production node + erl -remsh production + (production@max-au)3> erlperf:run(timer, sleep, [1]). + 488 ``` + +## Continuous benchmarking +You can run a job continuously, to examine performance gains or losses while doing +hot code reload. This process is designed to help during development and testing stages, +allowing to quickly notice performance regressions. + +Example source code: ```erlang - > application:start(erlperf). - ok. - - > {ok, Logger} = ep_file_log:start(group_leader()). + -module(mymod). + -export([do/1]). + do(Arg) -> timer:sleep(Arg). +``` + +Example below assumes you have `erlperf` application started (e.g. in a `rebar3 shell`) + +```erlang + % start a logger that prints VM monitoring information + > {ok, Logger} = erlperf_file_log:start_link(group_leader()). 
{ok,<0.235.0>} - > {ok, JobPid} = ep_job:start(#{name => myname, - init_runner => "myapp:generate_seed().", - runner => "runner(Arg) -> Var = mymodule:call(Arg), mymodule:done(Var).", - initial_concurrency => 1}). + % start a job that will continuously benchmark mymod:do(), + % with initial concurrency 2. + > JobPid = erlperf:start(#{init_runner => "rand:uniform(10).", + runner => "runner(Arg) -> mymod:do(Arg)."}, 2). {ok,<0.291.0>} - > ep_job:set_concurrency(JobPid, 2). + % increase concurrency to 4 + > erlperf_job:set_concurrency(JobPid, 4). ok. - % watch your job performance (local node) + % watch your job performance - % modify your application code, do hot code reload - > c:l(mymodule). - {module, mymodule}. - - % stop logging for local node - > ep_file_log:stop(Logger). - ok + % modify your application code, + % set do(Arg) -> timer:sleep(2*Arg), do hot code reload + > c(mymod). + {module, mymod}. - % stop the job - > ep_job:stop(JobPid). - ok. - - % restart the job (code is saved for your convenience, and can be accessed by name) - % makes it easy to restart the job if it crashed due to bad code loaded - > ep_job:start(erlperf:load(myname)). - {ok,<0.1121.0>} - - % when finished, just shut everything down with one liner: - > application:stop(erlperf). - ok. + % see that after hot code reload throughput halved! ``` - # Reference Guide ## Terms -* **runner**: process repeatedly executing the same set of calls -* **hooks**: functions run when the new job starts (*init*), new runner - starts (*init_runner*), or when job is stopped (*done*) -* **job**: single instance of the running benchmark (multiple processes) +* **runner**: code that gets continuously executed +* **init**: code that runs one when the job starts (for example, start some registered process or create an ETS table) +* **done**: code that runs when the job is about to stop (used for cleanup, e.g. stop some registered process) +* **init_runner**: code that is executed in every runner process (e.g. add something to process dictionary) +* **job**: single instance of the running benchmark (multiple runners) * **concurrency**: how many processes are running concurrently, executing *runner* code * **throughput**: total number of calls per sampling interval (for all concurrent processes) +* **cv**: coefficient of variation, the ratio of the standard deviation to the mean. Used to stop the concurrency +(squeeze) test, the lower the *cv*, the longer it will take to stabilise and complete the test -## Usage -Benchmarking is done either using already compiled code, or a free-form -code source. Latter case involves an extra step to form an Erlang module, -compile it and load into VM. 
- -## One-time throughput run +## Using `erlperf` from `rebar3 shell` or `erl` REPL Supported use-cases: - * single run for MFA: ```erlperf:run({rand, uniform, [1000]}).``` - * call chain: ```erlperf:run([{rand, uniform, [10]}, {erlang, node, []}]).```, - see [recording call chain](#recording-call-chain) + * single run for MFA: ```erlperf:run({rand, uniform, [1000]}).``` or ```erlperf:run(rand, uniform, []).``` * anonymous function: ```erlperf:run(fun() -> rand:uniform(100) end).``` - * anonymous object with an argument: ```erlperf:run(fun(Init) -> io_lib:format("~tp", [Init]) end).``` + * anonymous function with an argument: ```erlperf:run(fun(Init) -> io_lib:format("~tp", [Init]) end).``` * source code: ```erlperf:run("runner() -> rand:uniform(20).").``` + * (experimental) call chain: ```erlperf:run([{rand, uniform, [10]}, {erlang, node, []}]).```, + see [recording call chain](#recording-call-chain). Call chain may contain only complete + MFA tuples and cannot be mixed with functions. -Source code cannot be mixed with MFA. Call chain may contain only complete -MFA tuples and cannot be mixed with functions. - Startup and teardown - * init, done and init_runner hooks are available (there is no done_runner, + * init, done and init_runner are supported (there is no done_runner, because it is never stopped in a graceful way) * init_runner and done may be defined with arity 0 and 1 (in the latter case, result of init/0 passed as an argument) - * runner could be of any arity, erlang:function_exported/3 used to - determine if it accepts exact or +1 argument from init_runner + * runner can be of arity 0, 1 (accepting init_runner return value) or 2 (first + argument is init_runner return value, and second is state passed between runner invocations) Example with mixed MFA: ```erlang @@ -201,64 +265,89 @@ erlperf:run( ``` ## Measurement options -Benchmarking is done by counting number of erlang:apply() calls done -for a specified period of time (**sample_duration**). +Benchmarking is done by counting number of *runner* iterations done over +a specified period of time (**sample_duration**). By default, erlperf performs no **warmup** cycle, then takes 3 consecutive **samples**, using **concurrency** of 1 (single runner). It is possible to tune this behaviour by specifying run_options: ```erlang erlperf:run({erlang, node, []}, #{concurrency => 2, samples => 10, warmup => 1}). ``` -For list of options available, refer to benchmark reference. -## Concurrency test +Next example takes 10 samples with 100 ms duration. Note that throughput is reported +per *sample_duration*: if you shorten duration in half, throughput report will also be +halved: + +```bash + $ ./erlperf 'rand:uniform().' -d 100 -s 20 + Code || QPS Rel + rand:uniform(). 1 970 Ki 100% + $ ./erlperf 'rand:uniform().' -d 200 -s 20 + Code || QPS Rel + rand:uniform(). 1 1973 Ki 100% +``` + +## Concurrency test (squeeze) Sometimes it's necessary to measure code running multiple concurrent processes, and find out when it saturates the node. It can be used to detect bottlenecks, e.g. lock contention, single dispatcher process bottleneck etc.. Example (with maximum concurrency limited to 50): ```erlang - erlperf:run(Code, #{warmup => 1}, #{max => 50}). + > erlperf:run({code, is_loaded, [local_udp]}, #{warmup => 1}, #{max => 50}). + {1284971,7} ``` -# Continuous benchmarking -All currently running benchmarking jobs can be accessed via programmatic -interface. 
It is possible to run a job continuously, to examine performance -gains or losses while doing hot code reload. - -## Saving benchmarks for future use -Benchmark code database can be maintained for this purpose. Persisting -code samples may not present repeatable tests, but allow re-running -in the future. +In this example, 7 concurrent processes were able to squeeze 1284971 calls per second +for `code:is_loaded(local_udp)`. ## Benchmarking overhead -Not yet measured. There may be a better way (using call time tracing instead of -counter injections), to be determined later. +Benchmarking overhead varies depending on ERTS version and the way runner code is supplied. See the example: + +```erlang + (erlperf@max-au)7> erlperf:benchmark([ + #{runner => "runner(X) -> is_float(X).", init_runner=>"2."}, + #{runner => {erlang, is_float, [2]}}, + #{runner => fun (X) -> is_float(X) end, init_runner => "2."}], + #{}, undefined). + [105824351,66424280,5057372] +``` + +This difference is caused by the ERTS itself: running compiled code (first variant) with OTP 25 is +two times faster than applying a function, and 20 times faster than repeatedly calling anonymous `fun`. Use +the same invocation method to get a relevant result. + +Absolute benchmarking overhead may be significant for very fast functions taking just a few nanoseconds. +Use "low overhead mode" for such occasions. ## Experimental: recording call chain -Benchmarking can only work with exported functions (unlike ep_prof module, -that is also able to record arguments of local function calls). +This experimental feature allows capturing a sequence of calls as a list of +`{Module, Function, [Args]}`. The trace can be supplied as a `runner` argument +to `erlperf` for benchmarking purposes: ```erlang - > f(Trace), Trace = erlperf:record(pg2, '_', '_', 1000). + > f(Trace), Trace = erlperf:record(pg, '_', '_', 1000). ... % for things working with ETS, isolation is recommended - > erlperf:run(Trace, #{isolation => #{}}). + > erlperf:run(#{runner => Trace}, #{isolation => #{}}). ... % Trace can be saved to file before executing: - > erlperf:run(#{runner => Trace, name => "pg2"}). + > file:write("pg.trace", term_to_binary(Trace)). - % run the trace with - > erlperf:run(erlperf:load("pg2")). + % run the saved trace + > {ok, Bin} = file:read_file("pg.trace"), + > erlperf:run(#{runner => binary_to_term(Trace)}). ``` It's possible to create a Common Test testcase using recorded samples. Just put the recorded file into xxx_SUITE_data: ```erlang - QPS = erlperf:run(erlperf:load(?config(data_dir, "pg2"))), - ?assert(QPS > 500). % catches regression for QPS falling below 500 + benchmark_check(Config) -> + {ok, Bin} = file:read_file(filename:join(?config(data_dir, Config), "pg.trace")), + QPS = erlperf:run(#{runner => binary_to_term(Bin)}), + ?assert(QPS > 500). % catches regression for QPS falling below 500 ``` ## Experimental: starting jobs in a cluster @@ -267,48 +356,58 @@ It's possible to run a job on a separate node in the cluster. ```erlang % watch the entire cluster (printed to console) - > {ok, ClusterLogger} = ep_cluster_monitor:start(). - {ok, <0.211.0>} + (node1@host)> {ok, _} = erlperf_history:start_link(). + {ok,<0.213.0>} + (node1@host)> {ok, ClusterLogger} = erlperf_cluster_monitor:start_link(group_leader(), [sched_util, jobs]). + {ok, <0.216.0>} - % also log cluster-wide reports (jobs & sched_util) - > {ok, ClusterLogger} = ep_cluster_monitor:start("/tmp/cluster", [time, sched_util, jobs]). 
+ % also log cluster-wide reports to file (jobs & sched_util) + (node1@host)> {ok, FileLogger} = erlperf_cluster_monitor:start_link("/tmp/cluster", [time, sched_util, jobs]). {ok, <0.223.0>} - % start the benchmarking process in a separate beam - > {ok, Node, Job} = erlperf:start(#{ - runner => {timer, sleep, [2]}, initial_concurrency => 4}, - #{}). - {ok,job_53480@centos,<16735.104.0>} - - % tail the file with "tail -f /tmp/cluster" - - % change concurrency: - > rpc:call(Node, ep_job, set_concurrency, [Job, 8]). - ok - - % watch the log file tail + % run the benchmarking process in a different node of your cluster + (node1@host)> rpc:call('node2@host', erlperf, run, [#{runner => {rand, uniform, []}}]). ``` Cluster-wide monitoring will reflect changes accordingly. -## Changelog -Version 1.1.5: -- support for OTP 25 (peer replacing slave) +# Implementation details +Starting with 2.0, `erlperf` uses call counting for continuous benchmarking purposes. This allows +the tightest possible loop without extra runtime calls. Running +`erlperf 'rand:uniform().' --init '1'. --done '2.' --init_runner '3.'` results in creating, +compiling and loading a module with this source code: + +```erlang + -module(unique_name). + -export([init/0, init_runner/0, done/0, run/0]). + + init() -> + 1. + + init_runner() -> + 3. -Version 1.1.4: -- fixed an issue with pg already started -- moved profiling to spawned process + done() -> + 2. -Version 1.1.3: -- addressed deprecation, updated to argparse 1.1.4 + run() -> + runner(), + run(). -Version 1.1.2: -- updated command line parser to new argparse + runner() -> + rand:uniform(). +``` -Version 1.1.1: -- added support for OTP 24 -- added edoc documentation +Number of `run/0` calls per second is reported as throughput. Before 2.0, `erlperf` +used `atomics` to maintain a counter shared between all runner processes, introducing +unnecessary BIF call overhead. -Version 1.0.0: -- initial release +Low-overhead mode tightens it even further, turning runner into this function: +```erlang +runner(0) -> + ok; +runner(Count) -> + rand:uniform(), + runner(Count - 1). +``` \ No newline at end of file diff --git a/doc/overview.edoc b/doc/overview.edoc index 651c9e2..b64ed70 100644 --- a/doc/overview.edoc +++ b/doc/overview.edoc @@ -1,6 +1,6 @@ ** this is the overview.doc file for the application 'erlperf' ** - @version 1.1.5 + @version 2.0.0 @author Maxim Fedorov, @title erlperf: Erlang Performance & Benchmarking Suite. @@ -21,13 +21,21 @@ Find out how many times per second a function can be run (beware of shell escaping your code!): ``` - $ ./erlperf 'timer:sleep(1).' + $ ../erlperf 'rand:uniform().' Code || QPS Rel - timer:sleep(1). 1 500 100% + rand:uniform(). 1 9798 Ki 100% ''' - Run erlperf with two concurrent samples of code + Run four processes executing `rand:uniform()' in a tight loop, and see that code is indeed + concurrent: + ``` + $ ./erlperf 'rand:uniform().' -c 4 + Code || QPS Rel + rand:uniform(). 4 28596 Ki 100% + ''' + + Benchmark one function vs another, taking average of 10 seconds and skipping first second: ``` $ ./erlperf 'rand:uniform().' 'crypto:strong_rand_bytes(2).' --samples 10 --warmup 1 @@ -36,7 +44,22 @@ crypto:strong_rand_bytes(2). 1 1485 Ki 35% ''' - Or just measure how concurrent your code is (example below shows saturation with only 1 process): + Run a function passing the state into the next iteration. 
This code demonstrates performance difference + between `rand:uniform_s' with state passed explicitly, and `rand:uniform' reading state from the process + dictionary: + + ``` + $ ./erlperf 'r(_Init, S) -> {_, NS} = rand:uniform_s(S), NS.' --init_runner 'rand:seed(exsss).' 'r() -> rand:uniform().' + Code || QPS Time Rel + r(_Init, S) -> {_, NS} = rand:uniform_s(S), NS. 1 20272 Ki 49 ns 100% + r() -> rand:uniform(). 1 15081 Ki 66 ns 74% + ''' + + Squeeze mode: + measure how concurrent your code is. In the example below, `code:is_loaded/1' is implemented as + `gen_server:call', and all calculations are done in a single process. It is still possible to + squeeze a bit more from a single process by putting work into the queue from multiple runners, + therefore the example may show higher concurrency. ``` $ ./erlperf 'code:is_loaded(local_udp).' --init 'code:ensure_loaded(local_udp).' --squeeze @@ -44,31 +67,44 @@ code:is_loaded(local_udp). 1 614 Ki ''' - If you need some initialisation done before running the test, and clean up after: + Start a server (`pg' scope in this example), use it in benchmark, and shut down after: ``` - $ ./erlperf 'pg:join(scope, self()), pg:leave(scope, self()).' --init 'pg:start_link(scope).' --done 'gen_server:stop(scope).' - Code || QPS Rel - pg:join(scope, self()), pg:leave(scope, self()). 1 287 Ki 100% + $ ./erlperf 'pg:join(scope, group, self()), pg:leave(scope, group, self()).' --init 'pg:start_link(scope).' --done 'gen_server:stop(scope).' + Code || QPS Rel + pg:join(scope, group, self()), pg:leave(scope, group, self()). 1 321 Ki 100% ''' - Run two versions of code against each other, with some init prior: + Run the same code with different arguments, returned from `init_runner' function: ``` - $ ./erlperf 'runner(X) -> timer:sleep(X).' --init '1.' 'runner(Y) -> timer:sleep(Y).' --init '2.' - Code || QPS Rel - pg:join(scope, self()), pg:leave(scope, self()). 1 287 Ki 100% + $ ./erlperf 'runner(X) -> timer:sleep(X).' --init_runner '1.' 'runner(X) -> timer:sleep(X).' --init_runner '2.' + Code || QPS Rel + runner(X) -> timer:sleep(X). 1 492 100% + runner(Y) -> timer:sleep(Y). 1 331 67% ''' - Determine how well pg2 (removed in OTP 24) is able to have concurrent group modifications when there are no nodes in the cluster: + Determine how many times a process can join/leave pg2 group on a single node: ``` - $ ./erlperf 'runner(Arg) -> ok = pg2:join(Arg, self()), ok = pg2:leave(Arg, self()).' --init_runner 'G = {foo, rand:uniform(10000)}, pg2:create(G), G.' -q - Code || QPS - runner(Arg) -> ok = pg2:join(Arg, self()), ok = pg2:leave(Arg, 13 76501 + $ ./erlperf 'ok = pg2:join(g, self()), ok = pg2:leave(g, self()).' --init 'pg2:create(g).' + Code || QPS Rel + ok = pg2:join(g, self()), ok = pg2:leave(g, self()). 1 64253 100% ''' - Watch the progress of your test running (use -v option): + Compare `pg' with `pg2' running two nodes (note the `-i' argument spawning an extra node to + run benchmark in): + + ``` + ./erlperf 'ok = pg2:join(g, self()), ok = pg2:leave(g, self()).' --init 'pg2:create(g).' 'ok = pg:join(g, self()), ok = pg:leave(g, self()).' --init 'pg:start(pg).' -i + Code || QPS Rel + ok = pg:join(g, self()), ok = pg:leave(g, self()). 1 235 Ki 100% + ok = pg2:join(g, self()), ok = pg2:leave(g, self()). 1 1817 0% + ''' + + Watch the progress of your test running (use -v option) with extra information: scheduler utilisation, dirty CPU & IO + schedulers, number of running processes, ports, ETS tables, and memory consumption. Last column is the job throughput. 
+ When there are multiple jobs, multiple columns are printed. ``` $ ./erlperf 'rand:uniform().' -q -v @@ -84,101 +120,124 @@ rand:uniform(). 8 14918 Ki ''' - Command-line benchmarking does not save results anywhere. It is designed to provide a quick answer to the question "is that piece of code faster". - # Continuous benchmarking - Starting erlperf as an Erlang application enables continuous benchmarking. This process is designed to help during - development and testing stages, allowing to quickly notice performance regressions. + === Minimal overhead mode === + Since 2.0, `erlperf' includes "low overhead" mode. It cannot be used for continuous benchmarking. In this mode + runner code is executed specified amount of times in a tight loop: - Usage example (assuming you're running an OTP release, or rebar3 shell for your application): + ``` + ./erlperf 'rand:uniform().' 'rand:uniform(1000).' -l 10M + Code || QPS Time Rel + rand:uniform(). 1 16319 Ki 61 ns 100% + rand:uniform(1000). 1 15899 Ki 62 ns 97% + ''' + This mode effectively runs following code: `loop(0) -> ok; loop(Count) -> rand:uniform(), loop(Count - 1).' + Continuous mode adds 1-2 ns to each iteration. + + == Benchmarking existing application == + `erlperf' can be used to measure performance of your application running in production, or code that is stored + on disk. + + === Running with existing codebase === + Use `-pa' argument to add extra code path. Example: ``` - $ rebar3 shell --sname mynode + $ ./erlperf 'argparse:parse([], #{}).' -pa _build/test/lib/argparse/ebin + Code || QPS Rel + argparse:parse([], #{}). 1 859 Ki 100% ''' + + If you need to add multiple released applications, supply `ERL_LIBS' environment variable instead: ``` - > application:start(erlperf). - ok. + $ ERL_LIBS="_build/test/lib" erlperf 'argparse:parse([], #{}).' + Code || QPS Rel + argparse:parse([], #{}). 1 843 Ki 100% + ''' - > {ok, Logger} = ep_file_log:start(group_leader()). - {ok,<0.235.0>} + === Usage in production === + It is possible to use `erlperf' to benchmark a running application (even in production, assuming necessary safety + precautions). To achieve this, add `erlperf' as a dependency, and use remote shell: - > {ok, JobPid} = ep_job:start(#{name => myname, - init_runner => "myapp:generate_seed().", - runner => "runner(Arg) -> Var = mymodule:call(Arg), mymodule:done(Var).", - initial_concurrency => 1}). - {ok,<0.291.0>} + ``` + # run a mock production node with `erl -sname production' + # connect a remote shell to the production node + erl -remsh production + (production@max-au)3> erlperf:run(timer, sleep, [1]). + 488 + ''' - > ep_job:set_concurrency(JobPid, 2). - ok. + === Continuous benchmarking === + You can run a job continuously, to examine performance gains or losses while doing + hot code reload. This process is designed to help during development and testing stages, + allowing to quickly notice performance regressions. - % watch your job performance (local node) + Example source code: + ``` + -module(mymod). + -export([do/1]). + do(Arg) -> timer:sleep(Arg). + ''' + + Example below assumes you have `erlperf' application started (e.g. in a `rebar3 shell') - % modify your application code, do hot code reload - > c:l(mymodule). - {module, mymodule}. + ``` + % start a logger that prints VM monitoring information + > {ok, Logger} = erlperf_file_log:start_link(group_leader()). + {ok,<0.235.0>} - % stop logging for local node - > ep_file_log:stop(Logger). 
- ok + % start a job that will continuously benchmark mymod:do(), + % with initial concurrency 2. + > JobPid = erlperf:start(#{init_runner => "rand:uniform(10).", + runner => "runner(Arg) -> mymod:do(Arg)."}, 2). + {ok,<0.291.0>} - % stop the job - > ep_job:stop(JobPid). + % increase concurrency to 4 + > erlperf_job:set_concurrency(JobPid, 4). ok. - % restart the job (code is saved for your convenience, and can be accessed by name) - % makes it easy to restart the job if it crashed due to bad code loaded - > ep_job:start(erlperf:load(myname)). - {ok,<0.1121.0>} + % watch your job performance - % when finished, just shut everything down with one liner: - > application:stop(erlperf). - ok. - ''' + % modify your application code, + % set do(Arg) -> timer:sleep(2*Arg), do hot code reload + > c(mymod). + {module, mymod}. + % see that after hot code reload throughput halved! + ''' == Reference Guide == === Terms === - - - === Usage === - Benchmarking is done either using already compiled code, or a free-form - code source. Latter case involves an extra step to form an Erlang module, - compile it and load into VM. - - === One-time throughput run === + * **runner**: code that gets continuously executed + * **init**: code that runs one when the job starts (for example, start some registered process or create an ETS table) + * **done**: code that runs when the job is about to stop (used for cleanup, e.g. stop some registered process) + * **init_runner**: code that is executed in every runner process (e.g. add something to process dictionary) + * **job**: single instance of the running benchmark (multiple runners) + * **concurrency**: how many processes are running concurrently, executing *runner* code + * **throughput**: total number of calls per sampling interval (for all concurrent processes) + * **cv**: coefficient of variation, the ratio of the standard deviation to the mean. Used to stop the concurrency + (squeeze) test, the lower the *cv*, the longer it will take to stabilise and complete the test + + === Using `erlperf' from `rebar3 shell' or `erl' REPL === Supported use-cases: - - - Source code cannot be mixed with MFA. Call chain may contain only complete - MFA tuples and cannot be mixed with functions. + * single run for MFA: ```erlperf:run({rand, uniform, [1000]}).''' or ```erlperf:run(rand, uniform, []).''' + * anonymous function: ```erlperf:run(fun() -> rand:uniform(100) end).''' + * anonymous function with an argument: ```erlperf:run(fun(Init) -> io_lib:format("~tp", [Init]) end).''' + * source code: ```erlperf:run("runner() -> rand:uniform(20).").''' + * (experimental) call chain: ```erlperf:run([{rand, uniform, [10]}, {erlang, node, []}]).''', + see [recording call chain](#recording-call-chain). Call chain may contain only complete + MFA tuples and cannot be mixed with functions. Startup and teardown - + * init, done and init_runner are supported (there is no done_runner, + because it is never stopped in a graceful way) + * init_runner and done may be defined with arity 0 and 1 (in the latter case, + result of init/0 passed as an argument) + * runner can be of arity 0, 1 (accepting init_runner return value) or 2 (first + argument is init_runner return value, and second is state passed between runner invocations) Example with mixed MFA: ``` @@ -212,64 +271,89 @@ ''' === Measurement options === - Benchmarking is done by counting number of erlang:apply() calls done - for a specified period of time (duration). 
- By default, erlperf performs no warmup cycle, then takes 3 consecutive - samples, using concurrency of 1 (single runner). It is possible + Benchmarking is done by counting number of *runner* iterations done over + a specified period of time (**sample_duration**). + By default, erlperf performs no **warmup** cycle, then takes 3 consecutive + **samples**, using **concurrency** of 1 (single runner). It is possible to tune this behaviour by specifying run_options: ``` erlperf:run({erlang, node, []}, #{concurrency => 2, samples => 10, warmup => 1}). ''' - For list of options available, refer to benchmark reference. - === Concurrency test === + Next example takes 10 samples with 100 ms duration. Note that throughput is reported + per *sample_duration*: if you shorten duration in half, throughput report will also be + halved: + + ``` + $ ./erlperf 'rand:uniform().' -d 100 -s 20 + Code || QPS Rel + rand:uniform(). 1 970 Ki 100% + $ ./erlperf 'rand:uniform().' -d 200 -s 20 + Code || QPS Rel + rand:uniform(). 1 1973 Ki 100% + ''' + + === Concurrency test (squeeze) === Sometimes it's necessary to measure code running multiple concurrent processes, and find out when it saturates the node. It can be used to detect bottlenecks, e.g. lock contention, single dispatcher process bottleneck etc.. Example (with maximum concurrency limited to 50): ``` - erlperf:run(Code, #{warmup => 1}, #{max => 50}). + > erlperf:run({code, is_loaded, [local_udp]}, #{warmup => 1}, #{max => 50}). + {1284971,7} ''' - == Continuous benchmarking == - All currently running benchmarking jobs can be accessed via programmatic - interface. It is possible to run a job continuously, to examine performance - gains or losses while doing hot code reload. - - === Saving benchmarks for future use === - Benchmark code database can be maintained for this purpose. Persisting - code samples may not present repeatable tests, but allow re-running - in the future. + In this example, 7 concurrent processes were able to squeeze 1284971 calls per second + for `code:is_loaded(local_udp)'. === Benchmarking overhead === - Not yet measured. There may be a better way (using call time tracing instead of - counter injections), to be determined later. + Benchmarking overhead varies depending on ERTS version and the way runner code is supplied. See the example: + + ``` + (erlperf@max-au)7> erlperf:benchmark([ + #{runner => "runner(X) -> is_float(X).", init_runner=>"2."}, + #{runner => {erlang, is_float, [2]}}, + #{runner => fun (X) -> is_float(X) end, init_runner => "2."}], + #{}, undefined). + [105824351,66424280,5057372] + ''' + + This difference is caused by the ERTS itself: running compiled code (first variant) with OTP 25 is + two times faster than applying a function, and 20 times faster than repeatedly calling anonymous `fun'. Use + the same invocation method to get a relevant result. + + Absolute benchmarking overhead may be significant for very fast functions taking just a few nanoseconds. + Use "low overhead mode" for such occasions. === Experimental: recording call chain === - Benchmarking can only work with exported functions (unlike ep_prof module, - that is also able to record arguments of local function calls). + This experimental feature allows capturing a sequence of calls as a list of + `{Module, Function, [Args]}'. The trace can be supplied as a `runner' argument + to `erlperf' for benchmarking purposes: ``` - > f(Trace), Trace = erlperf:record(pg2, '_', '_', 1000). + > f(Trace), Trace = erlperf:record(pg, '_', '_', 1000). ... 
% for things working with ETS, isolation is recommended - > erlperf:run(Trace, #{isolation => #{}}). + > erlperf:run(#{runner => Trace}, #{isolation => #{}}). ... % Trace can be saved to file before executing: - > erlperf:run(#{runner => Trace, name => "pg2"}). + > file:write("pg.trace", term_to_binary(Trace)). - % run the trace with - > erlperf:run(erlperf:load("pg2")). + % run the saved trace + > {ok, Bin} = file:read_file("pg.trace"), + > erlperf:run(#{runner => binary_to_term(Trace)}). ''' It's possible to create a Common Test testcase using recorded samples. Just put the recorded file into xxx_SUITE_data: ``` - QPS = erlperf:run(erlperf:load(?config(data_dir, "pg2"))), - ?assert(QPS > 500). % catches regression for QPS falling below 500 + benchmark_check(Config) -> + {ok, Bin} = file:read_file(filename:join(?config(data_dir, Config), "pg.trace")), + QPS = erlperf:run(#{runner => binary_to_term(Bin)}), + ?assert(QPS > 500). % catches regression for QPS falling below 500 ''' === Experimental: starting jobs in a cluster === @@ -278,26 +362,58 @@ ``` % watch the entire cluster (printed to console) - > {ok, ClusterLogger} = ep_cluster_monitor:start(). - {ok, <0.211.0>} + (node1@host)> {ok, _} = erlperf_history:start_link(). + {ok,<0.213.0>} + (node1@host)> {ok, ClusterLogger} = erlperf_cluster_monitor:start_link(group_leader(), [sched_util, jobs]). + {ok, <0.216.0>} - % also log cluster-wide reports (jobs & sched_util) - > {ok, ClusterLogger} = ep_cluster_monitor:start("/tmp/cluster", [time, sched_util, jobs]). + % also log cluster-wide reports to file (jobs & sched_util) + (node1@host)> {ok, FileLogger} = erlperf_cluster_monitor:start_link("/tmp/cluster", [time, sched_util, jobs]). {ok, <0.223.0>} - % start the benchmarking process in a separate beam - > {ok, Node, Job} = erlperf:start(#{ - runner => {timer, sleep, [2]}, initial_concurrency => 4}, - #{}). - {ok,job_53480@centos,<16735.104.0>} + % run the benchmarking process in a different node of your cluster + (node1@host)> rpc:call('node2@host', erlperf, run, [#{runner => {rand, uniform, []}}]). + ''' + + Cluster-wide monitoring will reflect changes accordingly. + + + = Implementation details = + Starting with 2.0, `erlperf' uses call counting for continuous benchmarking purposes. This allows + the tightest possible loop without extra runtime calls. Running + `erlperf 'rand:uniform().' --init '1'. --done '2.' --init_runner '3.'' results in creating, + compiling and loading a module with this source code: - % tail the file with "tail -f /tmp/cluster" + ``` + -module(unique_name). + -export([init/0, init_runner/0, done/0, run/0]). + + init() -> + 1. + + init_runner() -> + 3. + + done() -> + 2. - % change concurrency: - > rpc:call(Node, ep_job, set_concurrency, [Job, 8]). - ok + run() -> + runner(), + run(). - % watch the log file tail + runner() -> + rand:uniform(). ''' - Cluster-wide monitoring will reflect changes accordingly. + Number of `run/0' calls per second is reported as throughput. Before 2.0, `erlperf' + used `atomics' to maintain a counter shared between all runner processes, introducing + unnecessary BIF call overhead. + + Low-overhead mode tightens it even further, turning runner into this function: + ``` + runner(0) -> + ok; + runner(Count) -> + rand:uniform(), + runner(Count - 1). 
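+    %% illustration only, not emitted by erlperf: `-l 10M' expands to an
+    %% initial call of runner(10000000), counting down to 0 with no
+    %% counter updates or timer reads inside the loop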
+ ''' \ No newline at end of file diff --git a/include/monitor.hrl b/include/monitor.hrl deleted file mode 100644 index b211188..0000000 --- a/include/monitor.hrl +++ /dev/null @@ -1,34 +0,0 @@ - -%% Job sample --type job_sample() :: { - ID :: pid(), - Cycles :: non_neg_integer() -}. - - -%% Monitoring sampling structure --record(monitor_sample, { - time :: integer(), - sched_util :: float(), - dcpu :: float(), - dio :: float(), - processes :: integer(), - ports :: integer(), - ets :: integer(), - memory_total :: non_neg_integer(), - memory_processes :: non_neg_integer(), - memory_binary :: non_neg_integer(), - memory_ets :: non_neg_integer(), - jobs :: [job_sample()] -}). - --type monitor_sample() :: #monitor_sample{}. - -% monitoring event bus --define(SYSTEM_EVENT, ep_system_event). - -% job event bus --define(JOB_EVENT, ep_job_event). - -% history server pg2 group --define(HISTORY_PROCESS_GROUP, erlperf_history). diff --git a/priv/.gitignore b/priv/.gitignore deleted file mode 100644 index e69de29..0000000 diff --git a/rebar.config b/rebar.config index a78c44a..b10cba7 100644 --- a/rebar.config +++ b/rebar.config @@ -10,10 +10,16 @@ ]}. {escript_incl_apps, [argparse]}. +{escript_emu_args, "%%! +pc unicode -escript main erlperf_cli\n"}. {cover_enabled, true}. {cover_opts, [verbose]}. +{ct_opts, [ + {ct_hooks, [cth_surefire]}, + {keep_logs, 1} +]}. + {post_hooks, [{"(linux|darwin|solaris|freebsd|netbsd|openbsd)", escriptize, "cp \"$REBAR_BUILD_DIR/bin/erlperf\" ./erlperf"}, diff --git a/rebar.lock b/rebar.lock index 3677cf4..5eec03f 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,8 +1,8 @@ {"1.2.0", -[{<<"argparse">>,{pkg,<<"argparse">>,<<"1.2.2">>},0}]}. +[{<<"argparse">>,{pkg,<<"argparse">>,<<"1.2.3">>},0}]}. [ {pkg_hash,[ - {<<"argparse">>, <<"7383D68E0CC88120FD25EAAF43647EA7BE8E50DFA6AAB5649B32F524E6E30C9E">>}]}, + {<<"argparse">>, <<"20578A5D86250D7FB1BF6D116054BF707C247395E740BC8FC1384CC49DFB06F7">>}]}, {pkg_hash_ext,[ - {<<"argparse">>, <<"32FEB588D92B53FD06136FC8B3C014F9D2B658CD8DE81AE9B07D663637BA5485">>}]} + {<<"argparse">>, <<"BFE94654811112FB86880C47B3600313D8D2281A1CCE4DADA2B09E5C3C9988D3">>}]} ]. diff --git a/src/ep_cluster_monitor.erl b/src/ep_cluster_monitor.erl deleted file mode 100644 index 29a82f5..0000000 --- a/src/ep_cluster_monitor.erl +++ /dev/null @@ -1,156 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Logs monitoring events for the entire cluster, to file or device. -%%% Requires cluster_history service running, fails otherwise. -%%% @end --module(ep_cluster_monitor). --author("maximfca@gmail.com"). - --behaviour(gen_server). - -%% API --export([ - start/0, - start/2, - start_link/2, - stop/1 -]). - -%% gen_server callbacks --export([ - init/1, - handle_call/3, - handle_cast/2, - handle_info/2 -]). - --include("monitor.hrl"). - -%% Handler: just like gen_event handler. -%% If you do need gen_event handler, make a fun of it. --type handler() :: {module(), atom(), term()} | file:filename_all() | {fd, io:device()} | io:device(). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts additional cluster monitor, printing -%% selected fields (sched_util, running job characteristics) to group_leader(). -%% User is responsible for stopping the server. --spec start() -> {ok, Pid :: pid()} | {error, Reason :: term()}. -start() -> - start(erlang:group_leader(), [sched_util, jobs]). 
- -%% @doc -%% Starts additional cluster-wide monitor. -%% User is responsible for stopping the server. --spec start(handler(), [atom()]) -> {ok, Pid :: pid()} | {error, Reason :: term()}. -start(Handler, Fields) -> - supervisor:start_child(ep_cluster_monitor_sup, [Handler, Fields]). - -%% @doc -%% Starts cluster-wide monitor with the specified handler, and links it to the caller. -%% Use 'record_info(fields, monitor_sample)' to fetch all fields. --spec start_link(handler(), [atom()]) -> {ok, Pid :: pid()} | {error, Reason :: term()}. -start_link(Handler, Fields) -> - gen_server:start_link(?MODULE, [Handler, Fields], []). - -%% @doc -%% Stops the cluster-wide monitor instance. --spec stop(pid()) -> ok. -stop(Pid) -> - supervisor:terminate_child(ep_cluster_monitor_sup, Pid). - -%%%=================================================================== -%%% gen_server callbacks - -%% Take a sample every second --define(SAMPLING_RATE, 1000). - -%% System monitor state --record(state, { - % next tick - next :: integer(), - handler :: handler(), - fields :: [integer()] -}). - -%% gen_server init -init([Handler, Fields]) -> - % precise (abs) timer - Next = erlang:monotonic_time(millisecond) + ?SAMPLING_RATE, - erlang:start_timer(Next, self(), tick, [{abs, true}]), - {ok, #state{next = Next, - handler = make_handler(Handler), - fields = fields_to_indices(Fields)}}. - -handle_call(_Request, _From, _State) -> - error(badarg). - -handle_cast(_Request, _State) -> - error(badarg). - -handle_info({timeout, _, tick}, #state{next = Next, fields = Fields, handler = Handler} = State) -> - Next1 = Next + ?SAMPLING_RATE, - % if we supply negative timer, we crash - and restart with no messages in the queue - % this could happen if handler is too slow - erlang:start_timer(Next1, self(), tick, [{abs, true}]), - % fetch all updates from cluster history - Samples = ep_cluster_history:get(Next - ?SAMPLING_RATE + erlang:time_offset(millisecond)), - % now invoke the handler - NewHandler = run_handler(Handler, Fields, lists:keysort(1, Samples)), - {noreply, State#state{next = Next1, handler = NewHandler}}; - -handle_info(_Info, _State) -> - error(badarg). - -%%%=================================================================== -%%% Internal functions - -make_handler({_M, _F, _A} = MFA) -> - MFA; -make_handler(IoDevice) when is_pid(IoDevice); is_atom(IoDevice) -> - {fd, IoDevice}; -make_handler(Filename) when is_list(Filename); is_binary(Filename) -> - {ok, Fd} = file:open(Filename, [raw, append]), - {fd, Fd}. - -fields_to_indices(Names) -> - Fields = record_info(fields, monitor_sample), - Zips = lists:zip(Fields, lists:seq(1, length(Fields))), - lists:reverse(lists:foldl( - fun (Field, Inds) -> - {Field, Ind} = lists:keyfind(Field, 1, Zips), - [Ind | Inds] - end, [], Names)). - -run_handler({M, F, A}, Fields, Samples) -> - Filtered = [{Node, [element(I + 1, Sample) || I <- Fields]} || {Node, Sample} <- Samples], - {M, F, M:F(Filtered, A)}; -run_handler({fd, IoDevice}, Fields, Samples) -> - Filtered = lists:foldl( - fun ({Node, Sample}, Acc) -> - OneNode = - lists:join(" ", - [io_lib:format("~s", [Node]) | - [formatter(I + 1, element(I + 1, Sample)) || I <- Fields]]), - [OneNode | Acc] - end, - [], Samples), - Data = lists:join(" ", Filtered) ++ "\n", - Formatted = iolist_to_binary(Data), - ok = file:write(IoDevice, Formatted), - {fd, IoDevice}. 
- -formatter(#monitor_sample.time, Time) -> - calendar:system_time_to_rfc3339(Time div 1000); -formatter(Percent, Num) when Percent =:= #monitor_sample.sched_util; Percent =:= #monitor_sample.dcpu; Percent =:= #monitor_sample.dio -> - io_lib:format("~6.2f", [Num]); -formatter(Number, Num) when Number =:= #monitor_sample.processes; Number =:= #monitor_sample.ports -> - io_lib:format("~8b", [Num]); -formatter(#monitor_sample.ets, Num) -> - io_lib:format("~6b", [Num]); -formatter(Size, Num) when Size =:= #monitor_sample.memory_total; Size =:= #monitor_sample.memory_processes; Size =:= #monitor_sample.memory_binary; Size =:= #monitor_sample.memory_ets -> - io_lib:format("~9s", [erlperf:format_size(Num)]); -formatter(#monitor_sample.jobs, Jobs) -> - lists:flatten([io_lib:format("~8s", [erlperf:format_number(Num)]) || {_Pid, Num} <- Jobs]). diff --git a/src/ep_event_handler.erl b/src/ep_event_handler.erl deleted file mode 100644 index 30c4af7..0000000 --- a/src/ep_event_handler.erl +++ /dev/null @@ -1,62 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Generic event handler, converting gen_event -%%% callbacks into messages. Allows collecting -%%% events from remote nodes. -%%% @end --module(ep_event_handler). --author("maximfca@gmail.com"). - --behavior(gen_event). - --export([ - subscribe/1, - subscribe/2, - unsubscribe/1, - unsubscribe/2 -]). - -%% gen_event callbacks --export([ - init/1, - handle_call/2, - handle_event/2 -]). - --spec subscribe(gen_event:emgr_ref()) -> term(). -subscribe(GenEvent) -> - gen_event:add_handler(GenEvent, ?MODULE, self()). - --spec subscribe(gen_event:emgr_ref(), term()) -> term(). -subscribe(GenEvent, Tag) -> - gen_event:add_handler(GenEvent, ?MODULE, {self(), Tag}). - --spec unsubscribe(gen_event:emgr_ref()) -> term(). -unsubscribe(GenEvent) -> - gen_event:delete_handler(GenEvent, ?MODULE, self()). - --spec unsubscribe(gen_event:emgr_ref(), term()) -> term(). -unsubscribe(GenEvent, Tag) -> - gen_event:delete_handler(GenEvent, ?MODULE, {self(), Tag}). - --type state() :: pid() | atom() | {pid() | atom(), term()}. - -%% gen_event init --spec init(state()) -> {ok, state()}. -init(Self) -> - {ok, Self}. - --spec handle_call({pid(), reference()}, state()) -> no_return(). -handle_call(_From, _State) -> - error(badarg). - -%% gen_event --spec handle_event(term(), state()) -> no_return(). -handle_event(Event, Proc) when is_pid(Proc) -> - Proc ! Event, - {ok, Proc}; -handle_event(Event, {Proc, Tag} = Tagged) -> - Proc ! {Tag, Event}, - {ok, Tagged}. diff --git a/src/ep_file_log.erl b/src/ep_file_log.erl deleted file mode 100644 index ba215f3..0000000 --- a/src/ep_file_log.erl +++ /dev/null @@ -1,135 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Writes monitoring events to file. -%%% @end --module(ep_file_log). --author("maximfca@gmail.com"). - --behaviour(gen_server). - -%% API --export([ - start/1, - stop/1, - start_link/1 -]). - -%% gen_server callbacks --export([ - init/1, - handle_call/3, - handle_cast/2, - handle_info/2, - terminate/2 -]). - --include("monitor.hrl"). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts the server --spec(start_link(Filename :: string() | file:io_device()) -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). 
-start_link(Filename) -> - gen_server:start_link(?MODULE, [Filename], []). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts the server with a different name and/or file. -%% Server is started outside of supervision tree. --spec start(Target :: file:filename() | file:io_device()) -> {ok, Pid :: pid()} | {error, Reason :: term()}. -start(Filename) -> - supervisor:start_child(ep_file_log_sup, [Filename]). - -%%-------------------------------------------------------------------- -%% @doc -%% Stops the server, if started stand-alone. If it was started as a part -%% of supervision tree, server will restart. --spec stop(pid()) -> ok. -stop(Pid) -> - supervisor:terminate_child(ep_file_log_sup, Pid). - -%%%=================================================================== -%%% gen_server callbacks - -%% Repeat header every 30 lines (by default) --define(LOG_REPEAT_HEADER, 30). - -%% System monitor state --record(state, { - % file logger counter - log_counter = ?LOG_REPEAT_HEADER :: non_neg_integer(), - % when to print the header once again - log_limit = ?LOG_REPEAT_HEADER :: pos_integer(), - % file descriptor - log_file :: file:io_device(), - % current format line - format = "" :: string(), - % saved list of job IDs executed previously - jobs = [] :: [job_sample()] -}). - -%% gen_server init -init([Target]) -> - % subscribe to monitor events - erlang:process_flag(trap_exit, true), - ep_event_handler:subscribe(?SYSTEM_EVENT), - case is_list(Target) of - true -> - {ok, LogFile} = file:open(Target, [raw, append]), - {ok, #state{log_file = LogFile}}; - false -> - {ok, #state{log_file = Target}} - end. - -handle_call(_Request, _From, _State) -> - error(badarg). - -handle_cast(_Request, _State) -> - error(badarg). - -handle_info(#monitor_sample{jobs = Jobs} = Sample, #state{log_file = File} = State) -> - {JobIds, Ts} = lists:unzip(Jobs), - State1 = maybe_write_header(JobIds, State), - % actual line - TimeFormat = calendar:system_time_to_rfc3339(Sample#monitor_sample.time div 1000), - Formatted = iolist_to_binary(io_lib:format(State1#state.format, [ - TimeFormat, - Sample#monitor_sample.sched_util, - Sample#monitor_sample.dcpu, - Sample#monitor_sample.dio, - Sample#monitor_sample.processes, - Sample#monitor_sample.ports, - Sample#monitor_sample.ets, - erlperf:format_size(Sample#monitor_sample.memory_total), - erlperf:format_size(Sample#monitor_sample.memory_processes), - erlperf:format_size(Sample#monitor_sample.memory_binary), - erlperf:format_size(Sample#monitor_sample.memory_ets) - ] ++ [erlperf:format_number(T) || T <- Ts])), - ok = file:write(File, Formatted), - {noreply, State1}; - -handle_info(_Info, _State) -> - error(badarg). - -terminate(_Reason, _State) -> - ep_event_handler:unsubscribe(?SYSTEM_EVENT). - -%%%=================================================================== -%%% Internal functions - -maybe_write_header(Jobs, #state{log_counter = LC, log_limit = LL, jobs = Prev} = State) when LC >= LL; Jobs =/= Prev -> - State#state{format = write_header(State#state.log_file, Jobs), log_counter = 0, jobs = Jobs}; -maybe_write_header(_, State) -> - State#state{log_counter = State#state.log_counter + 1}. 
- -write_header(File, Jobs) -> - JobCount = length(Jobs), - Format = "~s ~6.2f ~6.2f ~6.2f ~8b ~8b ~7b ~9s ~9s ~9s ~9s" ++ - lists:flatten(lists:duplicate(JobCount, "~11s")) ++ "~n", - JobIds = list_to_binary(lists:flatten([io_lib:format(" ~6p", [J]) || J <- Jobs])), - Header = <<"\nYYYY-MM-DDTHH:MM:SS-oo:oo Sched DCPU DIO Procs Ports ETS Mem Total Mem Proc Mem Bin Mem ETS", JobIds/binary, "\n">>, - ok = file:write(File, Header), - Format. diff --git a/src/ep_history.erl b/src/ep_history.erl deleted file mode 100644 index 35873b6..0000000 --- a/src/ep_history.erl +++ /dev/null @@ -1,110 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Saves monitor history. -%%% @end --module(ep_history). --author("maximfca@gmail.com"). - --behaviour(gen_server). - -%% API --export([ - start_link/0, - get/1, - get/2, - get_last/1 -]). - -%% gen_server callbacks --export([ - init/1, - handle_call/3, - handle_cast/2, - handle_info/2, - terminate/2 -]). - - --include("monitor.hrl"). - --define(SERVER, ?MODULE). - -% default: keep history for 120 seconds --define(DEFAULT_HISTORY_DURATION, 120000). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts the server --spec(start_link() -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). -start_link() -> - gen_server:start_link({local, ?SERVER}, ?MODULE, [], []). - - -%% @doc -%% Returns records starting from From (until now). --spec get(integer()) -> []. -get(From) -> - get(From, erlang:system_time(millisecond)). - -%% @doc -%% Returns records between From and To (inclusive) -%% Both From and To are Erlang system time in milliseconds. --spec get(integer(), integer()) -> []. -get(From, To) -> - gen_server:call(?SERVER, {get, From, To}). - -%% @doc -%% Returns records for last N seconds. --spec get_last(N :: integer()) -> []. -get_last(N) -> - Now = erlang:system_time(millisecond), - get(Now - N * 1000, Now). - -%%%=================================================================== -%%% gen_server callbacks - -%% System monitor state --record(state, { - history :: queue:queue(), - duration = ?DEFAULT_HISTORY_DURATION -}). - -%% gen_server init -init([]) -> - % subscribe to monitor events - ep_event_handler:subscribe(?SYSTEM_EVENT), - % for unsubscription - gen_event does not do it automatically - erlang:process_flag(trap_exit, true), - {ok, #state{history = queue:new()}}. - -handle_call({get, From, To}, _From, #state{history = History} = State) -> - {reply, [Sample || Sample = #monitor_sample{time = Time} <- queue:to_list(History), Time >= From, Time =< To], State}; - -handle_call(_Request, _From, _State) -> - error(badarg). - -handle_cast(_Request, _State) -> - error(badarg). - -handle_info(#monitor_sample{} = Sample, #state{history = History, duration = Duration} = State) -> - {noreply, State#state{history = clean_history(queue:in(Sample, History), erlang:system_time(millisecond) - Duration)}}; - -handle_info(_Info, _State) -> - error(badarg). - -terminate(_Reason, _State) -> - ep_event_handler:unsubscribe(?SYSTEM_EVENT). - -%%%=================================================================== -%%% Internal functions - -clean_history(Queue, Cutoff) -> - case queue:out(Queue) of - {{value, #monitor_sample{time = Time}}, Remain} when Time < Cutoff -> - clean_history(Remain, Time); - _ -> - Queue - end. 
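The `ep_cluster_monitor`, `ep_file_log` and `ep_history` servers deleted above are superseded by the `erlperf_cluster_monitor`, `erlperf_file_log` and `erlperf_history` modules used in the updated README. A minimal sketch of the replacement calls, limited to the 2.0 invocations that actually appear in the README examples of this change:

```erlang
%% sketch: 2.0 replacements for the deleted ep_* monitoring modules,
%% using only calls shown in the README examples above
{ok, _History} = erlperf_history:start_link(),
{ok, _Console} = erlperf_cluster_monitor:start_link(group_leader(), [sched_util, jobs]),
{ok, _File}    = erlperf_cluster_monitor:start_link("/tmp/cluster", [time, sched_util, jobs]),
{ok, _VmLog}   = erlperf_file_log:start_link(group_leader()).
```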
diff --git a/src/ep_job.erl b/src/ep_job.erl
deleted file mode 100644
index 8da5535..0000000
--- a/src/ep_job.erl
+++ /dev/null
@@ -1,534 +0,0 @@
-%%%-------------------------------------------------------------------
-%%% @author Maxim Fedorov
-%%% @copyright (C) 2019, Maxim Fedorov
-%%% @doc
-%%% Job runner, taking care of init/done, workers added and
-%%% removed.
-%%% @end
--module(ep_job).
--author("maximfca@gmail.com").
-
--behaviour(gen_server).
-
-%% Job API
--export([
-    start/1,
-    start_link/1,
-    stop/1,
-    set_concurrency/2,
-    info/1,
-    get_counters/1,
-    profile/1,
-    profile/3
-]).
-
-%% gen_server callbacks
--export([
-    init/1,
-    handle_call/3,
-    handle_cast/2,
-    handle_info/2,
-    terminate/2
-]).
-
-%% Job: a simple supervisor-type process. Not an actual supervisor because
-%% there is no need for fault tolerance.
-%% All hooks must be defined in order for the job to run.
-%% Counting (atomic reference & index) is supplied externally.
-
-%% MFArgs: module, function, arguments, or a function object.
--type mfargs() :: {module(), atom(), [term()]}.
-
-%% Callable: one or more MFArgs, or a function object, or source code.
--type callable() :: mfargs() | [mfargs()] | fun() | fun((term()) -> term()) | string().
-
-%% Extended "code" variant: runner code, setup/teardown hooks, additional parameters
-%% used to save/load job.
--type code_map() :: #{
-    runner := callable(),
-    init => callable(),
-    init_runner => callable(),
-    done => callable(),
-    %
-    initial_concurrency => non_neg_integer(),
-    name => string()
-}.
-
-%% Code: callable, callable with hooks
--type code() :: callable() | code_map().
-
-%%--------------------------------------------------------------------
-%% @doc
-%% Starts the benchmark instance using the default (ep_job_sup) supervisor.
--spec start(code()) -> {ok, Pid :: pid()} | {error, {already_started, pid()}} | {error, Reason :: term()}.
-start(Code) when is_map(Code); tuple_size(Code) =:= 3; is_function(Code); is_list(Code) ->
-    supervisor:start_child(ep_job_sup, [Code]).
-
-%% @doc
-%% Starts the benchmark instance and links it to the caller.
-%% The job starts with no workers; use set_concurrency/2 to start some.
--spec start_link(code()) -> {ok, Pid :: pid()} | {error, {already_started, pid()}} | {error, Reason :: term()}.
-start_link(Code) when is_map(Code); tuple_size(Code) =:= 3; is_function(Code); is_list(Code) ->
-    gen_server:start_link(?MODULE, Code, []).
-
-%% @doc
-%% Stops the benchmark instance (via supervisor).
--spec stop(pid()) -> ok.
-stop(JobId) ->
-    supervisor:terminate_child(ep_job_sup, JobId).
-
-%% @doc
-%% Changes the concurrency setting for this job.
-%% Does not reset counting.
--spec set_concurrency(pid(), non_neg_integer()) -> ok.
-set_concurrency(JobId, Concurrency) ->
-    gen_server:call(JobId, {set_concurrency, Concurrency}).
-
-%% @doc
-%% Returns information about the currently running job: the job description,
-%% and the number of currently running workers.
--spec info(pid()) -> {ok, code(), Concurrency :: non_neg_integer()}.
-info(JobId) ->
-    gen_server:call(JobId, info).
-
-%% @doc
-%% Internally used by the monitoring process to access the job atomic counter.
--spec get_counters(pid()) -> atomics:atomics_ref().
-get_counters(JobId) ->
-    gen_server:call(JobId, get_counters).
-
-%% @doc
-%% Runs a single iteration, using the fprof profiler.
--spec profile(pid()) -> term().
-profile(JobId) ->
-    profile(JobId, fprof, term).
-
-%% @doc
-%% Runs a profiler for a selected number of runner iterations.
--spec profile(pid(), fprof, term | binary | string) -> term().
-profile(JobId, Profiler, Format) -> - profile_impl(gen_server:call(JobId, get_code), Profiler, Format). - -%%-------------------------------------------------------------------- -%% Internal definitions - --include_lib("kernel/include/logger.hrl"). - --include("monitor.hrl"). - --record(state, { - % original spec - code :: code(), - % MFAs for all used/compiled functions - init :: callable(), - done :: callable(), - init_runner :: callable(), - runner :: callable(), - % code:delete() this module when terminating - module = [] :: module() | [], - % - init_result :: term(), - init_runner_result :: term(), - % - workers = [] :: [pid()], - % counter reference (index is always 1) - cref :: atomics:atomics_ref() -}). - --type state() :: #state{}. - -%%%=================================================================== -%%% gen_server callbacks - --spec init(code()) -> {ok, state()}. -init(Code) -> - State0 = maybe_compile(Code), - IR = call(State0#state.init, undefined), - erlang:process_flag(trap_exit, true), - gen_event:notify(?JOB_EVENT, {started, self(), Code, State0#state.cref}), - State1 = State0#state{init_result = IR}, - maybe_save(Code), - Concurrency = if is_map(Code) -> maps:get(initial_concurrency, Code, 0); true -> 0 end, - {ok, State1#state{ - workers = set_concurrency_impl(Concurrency, State1) - }}. - --spec handle_call(term(), {pid(), reference()}, state()) -> {reply, term(), state()}. -handle_call(info, _From, #state{code = Code} = State) -> - {reply, {ok, Code, length(State#state.workers)}, State}; - -handle_call(get_counters, _From, #state{cref = CRef} = State) -> - {reply, CRef, State}; - -handle_call({set_concurrency, Concurrency}, _From, State) -> - {reply, ok, State#state{workers = set_concurrency_impl(Concurrency, State)}}; - -handle_call(get_code, _From, #state{runner = Runner, init_runner = IR, init_result = IRR} = State) -> - {reply, {Runner, IR, IRR}, State}; - -handle_call(_Request, _From, _State) -> - error(badarg). - --spec handle_cast(term(), state()) -> no_return(). -handle_cast(_Request, _State) -> - error(badarg). - --spec handle_info(term(), state()) -> {noreply, state()}. -handle_info({'EXIT', Worker, Reason}, State) when Reason =:= shutdown; Reason =:= normal -> - {noreply, State#state{workers = lists:delete(Worker, State#state.workers)}}; -handle_info({'EXIT', Worker, Reason}, _State) -> - ?LOG_ERROR("Worker ~p crashed with ~100p", [Worker, Reason]), - {stop, Reason}; - -handle_info(_Info, _State) -> - error(badarg). - --spec terminate(term(), state()) -> ok. -terminate(_Reason, #state{done = Done, init_result = IR, module = Mod} = State) -> - % terminate all workers first - set_concurrency_impl(0, State), - call(Done, IR), - if Mod =/= [] -> - _ = code:purge(Mod), - true = code:delete(Mod), - _ = code:purge(Mod), - true = code:delete(Mod); - true -> - ok - end. - -%%%=================================================================== -%%% Internal: callable & runner implementation -%%%=================================================================== - -maybe_save(#{name := Name} = Code) -> - Filename = filename:join(code:priv_dir(erlperf), Name), - filelib:ensure_dir(Filename), - file:write_file(Filename, term_to_binary(Code)); -maybe_save(_) -> - ok. 
- -%% undefined: return undefined -call(undefined, Ret) -> - Ret; - -%% Simple case: MFArgs, potentially +1 argument -call({M, F, A}, Arg) when is_atom(M), is_atom(F), is_list(A) -> - case erlang:function_exported(M, F, length(A)) of - true -> - erlang:apply(M, F, A); - false -> - erlang:apply(M, F, A ++ [Arg]) - end; - -%% MFA List (+1 argument always ignored) -call([{M, F, A}], _) when is_atom(M), is_atom(F), is_list(A) -> - erlang:apply(M, F, A); -call([{M, F, A} | Tail], _) when is_atom(M), is_atom(F), is_list(A) -> - erlang:apply(M, F, A), - call(Tail, undefined); - -%% function object -call(Fun, _) when is_function(Fun, 0) -> - Fun(); -call(Fun, Arg) when is_function(Fun, 1) -> - Fun(Arg). - -%% Different runners (optimisation, could've used call/2 for it, but - benchmarking!) -runner(M, F, A, CRef) -> - erlang:apply(M, F, A), - atomics:add(CRef, 1, 1), - runner(M, F, A, CRef). - -runner_list(List, CRef) -> - _ = [erlang:apply(M, F, A) || {M, F, A} <- List], - atomics:add(CRef, 1, 1), - runner_list(List, CRef). - -runner_fun_0(Fun, CRef) -> - Fun(), - atomics:add(CRef, 1, 1), - runner_fun_0(Fun, CRef). - -runner_fun_1(Fun, IWR, CRef) -> - Fun(IWR), - atomics:add(CRef, 1, 1), - runner_fun_1(Fun, IWR, CRef). - -%% necessary optimisation: remove as much as possible from the actual loop -runner_impl({M, F, A}, IWR, CRef) -> - case erlang:function_exported(M, F, length(A)) of - true -> - runner(M, F, A, CRef); - false -> - runner(M, F, A ++ [IWR], CRef) - end; -% for [MFA], init/init_runner are not applicable -runner_impl([{_, _, _} | _] = List, _IWR, CRef) -> - runner_list(List, CRef); -% for fun() with 0 args, init/init_runner are not applicable -runner_impl(Fun, _IWR, CRef) when is_function(Fun, 0) -> - runner_fun_0(Fun, CRef); -% clause for fun(Arg). -runner_impl(Fun, IWR, CRef) when is_function(Fun, 1) -> - runner_fun_1(Fun, IWR, CRef). - -set_concurrency_impl(Concurrency, #state{workers = Workers}) when length(Workers) =:= Concurrency -> - Workers; - -set_concurrency_impl(Concurrency, #state{workers = Workers, init_runner = InitRunner, init_result = IR, - runner = Runner, cref = CRef}) - when length(Workers) < Concurrency -> - Hired = [spawn_link( - fun () -> - runner_impl(Runner, call(InitRunner, IR), CRef) - end) - || _ <- lists:seq(length(Workers) + 1, Concurrency)], - Workers ++ Hired; - -set_concurrency_impl(Concurrency, #state{workers = Workers}) -> - {Remaining, ToFire} = - if Concurrency > 0 -> - lists:split(Concurrency, Workers); - true -> - {[], Workers} - end, - [exit(Pid, shutdown) || Pid <- ToFire], - % monitors procs die - wait_for_killed(ToFire), - % - Remaining. - -wait_for_killed([]) -> - ok; -wait_for_killed([Pid | Tail]) -> - receive - {'EXIT', Pid, _} -> - wait_for_killed(Tail) - end. - -%%%=================================================================== -%%% Compilation primitives -%%%=================================================================== - --define (IS_MFA(XXMFA), is_tuple(XXMFA); is_tuple(hd(XXMFA)); is_function(XXMFA); XXMFA =:= []; XXMFA =:= {[]}). 
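The runner loops above keep the hot path to two calls: the benchmarked code and a single atomics:add/3. Everything else (worker management, sampling) lives outside the loop, and throughput is derived by reading the counter at fixed intervals. A rough sketch of that sampling idea, with illustrative names only (the real counter is owned by the job and read by the monitor):

```erlang
%% Sketch: measure iterations-per-second from a shared atomics counter.
qps_demo() ->
    CRef = atomics:new(1, []),
    Worker = spawn_link(fun () -> loop(fun rand:uniform/0, CRef) end),
    Before = atomics:get(CRef, 1),
    timer:sleep(1000),                      %% one sample_duration
    QPS = atomics:get(CRef, 1) - Before,    %% counter delta over one second
    unlink(Worker),
    exit(Worker, kill),
    QPS.

%% Mirrors runner_fun_0/2 above: call, bump the counter, repeat.
loop(Fun, CRef) ->
    Fun(),
    atomics:add(CRef, 1, 1),
    loop(Fun, CRef).
```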
- -maybe_compile(Code) when not is_map(Code) -> - State0 = maybe_compile(#{runner => Code}), - State0#state{code = Code}; - -maybe_compile(#{runner := _Runner} = Code) -> - case need_compile(Code) of - false -> - #state{ - runner = ensure_loaded(maps:get(runner, Code)), - init = ensure_loaded(maps:get(init, Code, undefined)), - init_runner = ensure_loaded(maps:get(init_runner, Code, undefined)), - done = ensure_loaded(maps:get(done, Code, undefined)), - cref = atomics:new(1, []), - code = Code - }; - true -> - do_compile(Code) - end. - -need_compile(Code) -> - lists:any( - fun (FName) -> - case maps:find(FName, Code) of - {ok, MFA} when ?IS_MFA(MFA) -> - false; - {ok, Text} when is_list(Text) -> - true; - error -> - true - end - end, - [runner, init, init_runner, done]). - -%% Some compilation needed -do_compile(Code) -> - Mod = module_name(), - ModForm = {attribute, 1, module, Mod}, - % make MFA (and forms if it's source code) - Runner = try_export(Mod, runner, Code), - Init = try_export(Mod, init, Code), - Done = try_export(Mod, done, Code), - InitRunner = try_export(Mod, init_runner, Code), - % - AllCodes = [Runner, Init, InitRunner, Done], - % - ExportForm = {attribute,1,export,[{Name, Arity} || {_, {Name, Arity, _}} <- AllCodes]}, - % - AllForms = [ModForm, ExportForm | [Form || {_, {_, _, Form}} <- AllCodes]], - % - % ct:pal("Code: ~tp", [AllForms]), - % - {module, Mod} = - case compile:forms(AllForms, [return]) of - {ok, App, Bin} -> - code:load_binary(App, atom_to_list(Mod), Bin); - {ok, App, Bin, Ws} -> - [?LOG_WARNING("~s:~b ~s", [F, Line, CompileMod:format_error(Desc)]) - || {F, Is} <- Ws, {Line, CompileMod, Desc} <- Is], - code:load_binary(App, atom_to_list(Mod), Bin); - {error, Errors, Ws} -> - [?LOG_WARNING("~s:~b ~s", [F, Line, CompileMod:format_error(Desc)]) - || {F, Is} <- Ws, {Line, CompileMod, Desc} <- Is], - Errors = lists:flatten([io_lib:format("~s:~b ~s", [F, Line, CompileMod:format_error(Desc)]) - || {F, Is} <- Ws, {Line, CompileMod, Desc} <- Is]), - error({compiler, Errors}) - end, - #state{ - module = Mod, - runner = ensure_callable(Runner), - init = ensure_callable(Init), - done = ensure_callable(Done), - init_runner = ensure_callable(InitRunner), - cref = atomics:new(1, []), - code = Code - }. - -module_name() -> - list_to_atom(lists:flatten(io_lib:format("job_~p_~p", [node(), self()]))). - -%% Returns {fun/MFA, undefined | Abstract Form} -try_export(Mod, Name, Map) -> - case maps:find(Name, Map) of - error -> - % no code defined - undefined; - {ok, Code} when ?IS_MFA(Code) -> - % MFA or MFAList - {Code, undefined}; - {ok, Code} -> - % source code - {Name1, Arity, Form} = export(Name, Code), - {{Mod, Name1, []}, {Name1, Arity, Form}} - end. - -ensure_loaded({M, _, _} = MFA) when is_atom(M) -> - case code:ensure_loaded(M) of - {module, M} -> - MFA; - {error, nofile} -> - error({module_not_found, M}) - end; -ensure_loaded(Empty) when Empty =:= []; Empty =:= {[]} -> - error("empty callable"); -ensure_loaded(Other) -> - Other. - -%% Converts a text form into Erlang Abstract Form, -%% and returns function name. --spec export(done | init | init_runner | runner, string()) -> {atom(), byte(), erl_parse:abstract_form()}. 
-export(DefaultName, Text) -> - {ok, Scan, _} = erl_scan:string(Text), - case erl_parse:parse_form(Scan) of - {ok, {function, _, Name, Arity, _} = Form} -> - {Name, Arity, Form}; - {error, _} -> - % try if it's an expr - case erl_parse:parse_exprs(Scan) of - {ok, Clauses} -> - Form = {function, 1, DefaultName, 0, - [{clause,1,[],[], Clauses}]}, - {DefaultName, 0, Form}; - {error, {_Line, ParseMod, Es}} -> - Errors = ParseMod:format_error(Es), - error({parse, Errors}) - end - end. - -ensure_callable(undefined) -> - undefined; -ensure_callable({MFA, _}) -> - ensure_loaded(MFA). - -%%%=================================================================== -%%% Profiling support - -profile_impl({Runner, InitRunner, InitResult}, fprof, Format) -> - {Pid, Ref} = spawn_monitor(fun () -> exit(run_fprof(Runner, InitRunner, InitResult, Format)) end), - receive - {'DOWN', Ref, process, Pid, {ok, IO}} -> - IO; - {'DOWN', Ref, process, Pid, Error} -> - error(Error) - end. - -ensure_fprof_started({ok, _Pid}) -> - ok; -ensure_fprof_started({error, {already_started, _Pid}}) -> - ok; -ensure_fprof_started(Error) -> - Error. - -run_fprof(Runner, InitRunner, InitResult, Format) -> - ok = ensure_fprof_started(fprof:start()), - IRR = call(InitRunner, InitResult), - case Runner of - {M, F, A} -> - case erlang:function_exported(M, F, length(A)) of - true -> - fprof:apply(M, F, A); - false -> - fprof:apply(M, F, A ++ [IRR]) - end; - List when is_list(List) -> - fprof:trace(start), - _ = [erlang:apply(M, F, A) || {M, F, A} <- List], - fprof:trace(stop); - Fun when is_function(Fun, 0) -> - fprof:apply(Fun, []); - Fun when is_function(Fun) -> - fprof:apply(Fun, [IRR]) - end, - ok = fprof:profile(), - % don't use file-base output, generate an Erlang structure - % TODO: this is quite a weird way, consult a text output... - TypeWriter = proc_lib:spawn_link(fun () -> capture_io([]) end), - ok = fprof:analyse([{dest, TypeWriter}]), - TypeWriter ! {read, self()}, - receive - {result, Result} -> - IO = process_fprof_result(Result, Format), - {ok, IO} - after 5000 -> - timeout - end. - -process_fprof_result(Io, term) -> - {_, Terms} = - lists:foldl( - fun (S, {Cont, Acc}) -> - List = binary_to_list(S), - case erl_scan:tokens(Cont, List, 1, []) of - {more, Cont1} -> - {Cont1, Acc}; - {done, Term, LeftOver} -> - {more, Cont1} = erl_scan:tokens([], LeftOver, 1, []), - {ok, Term1, _} = Term, - {ok, Term2} = erl_parse:parse_term(Term1), - {Cont1, [Term2 | Acc]} - end - end, {[], []}, Io), - lists:reverse(Terms); -process_fprof_result(Io, string) -> - [binary_to_list(S) || S <- Io]; -process_fprof_result(Io, binary) -> - Io. - -capture_io(Io) -> - receive - {io_request, From, Me, {put_chars, _Encoding, Binary}} -> - From ! {io_reply, Me, ok}, - capture_io([iolist_to_binary(Binary) | Io]); - {io_request, From, Me, {put_chars, _Encoding, M, F, A}} -> - From ! {io_reply, Me, ok}, - capture_io([iolist_to_binary(apply(M,F, A)) | Io]); - {read, WhereTo} -> - WhereTo ! {result, lists:reverse(Io)}; - _Other -> - capture_io(Io) - end. diff --git a/src/ep_monitor_proxy.erl b/src/ep_monitor_proxy.erl deleted file mode 100644 index 3a956f7..0000000 --- a/src/ep_monitor_proxy.erl +++ /dev/null @@ -1,80 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Forwards monitoring events to process group. 
-%%% It's technically possible to avoid using gen_server, -%%% and simply do everything in gen_event handler, but this -%%% does not play well with supervision. -%%% @end --module(ep_monitor_proxy). --author("maximfca@gmail.com"). - --behaviour(gen_server). - -%% API --export([ - start_link/0 -]). - -%% gen_server callbacks --export([ - init/1, - handle_call/3, - handle_cast/2, - handle_info/2 -]). - --include("monitor.hrl"). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts the server --spec(start_link() -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). -start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). - - -%%%=================================================================== -%%% gen_server callbacks - -%% Cluster logger state --record(state, { -}). - --type state() :: #state{}. - -%% gen_server init -%% Suppress dialyzer warning for OTP compatibility: erlperf runs on OTP20 -%% that does not support pg, and has pg2 instead. --dialyzer({no_missing_calls, init/1}). --compile({nowarn_deprecated_function, [{pg2, create, 1}, {pg2, join, 2}]}). --compile({nowarn_removed, [{pg2, create, 1}, {pg2, get_members, 1}]}). --spec init([]) -> {ok, state()}. -init([]) -> - catch pg2:create(?HISTORY_PROCESS_GROUP), %% don't care if it fails, and it does on OTP 24 - ep_event_handler:subscribe(?SYSTEM_EVENT), - {ok, #state{}}. - -handle_call(_Request, _From, _State) -> - error(badarg). - --spec handle_cast(term(), state()) -> no_return(). -handle_cast(_Request, _State) -> - error(badarg). - --dialyzer({no_missing_calls, handle_info/2}). -handle_info(#monitor_sample{} = Sample, State) -> - Monitors = - try pg:get_members(?HISTORY_PROCESS_GROUP) - catch error:undef -> pg2:get_members(?HISTORY_PROCESS_GROUP) - end, - [Pid ! {node(), Sample} || Pid <- Monitors], - {noreply, State}; - -handle_info(_Info, _State) -> - error(badarg). - -%%%=================================================================== -%%% Internal functions diff --git a/src/ep_monitor_sup.erl b/src/ep_monitor_sup.erl deleted file mode 100644 index 9e4ba90..0000000 --- a/src/ep_monitor_sup.erl +++ /dev/null @@ -1,93 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% @doc -%%% Monitoring supervisor. Does not start when application is -%%% running in embedded mode. -%%% @end -%%%------------------------------------------------------------------- - --module(ep_monitor_sup). - --behaviour(supervisor). - --export([ - start_link/0, - start_link/2, - init/1 -]). - --include("monitor.hrl"). - --spec start_link() -> supervisor:startlink_ret(). -start_link() -> - case application:get_env(start_monitor) of - {ok, false} -> - ignore; - _ -> - supervisor:start_link({local, ?MODULE}, ?MODULE, []) - end. - --spec start_link(RegName :: atom(), LogTo :: ep_file_log | ep_cluster_monitor) -> supervisor:startlink_ret(). -start_link(Sup, Log) -> - supervisor:start_link({local, Sup}, ?MODULE, Log). - --spec init(ep_file_log | ep_cluster_monitor | []) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 
-init(Log) when Log =:= ep_file_log; Log =:= ep_cluster_monitor ->
-    SupFlags = #{
-        strategy => simple_one_for_one,
-        intensity => 5,
-        period => 60},
-    ChildSpecs = [
-        #{id => Log,
-            start => {Log, start_link, []},
-            modules => [Log]}
-    ],
-    {ok, {SupFlags, ChildSpecs}};
-
-init([]) ->
-    SupFlags = #{strategy => rest_for_one,
-        intensity => 5,
-        period => 60},
-    ChildSpecs = [
-        % event bus for applications willing to receive monitoring updates
-        % suffers from all traditional gen_event problems
-        #{id => ?SYSTEM_EVENT,
-            start => {gen_event, start_link, [{local, ?SYSTEM_EVENT}]},
-            modules => dynamic},
-
-        %% history collection
-        #{id => ep_history,
-            start => {ep_history, start_link, []},
-            modules => [ep_history]},
-
-        %% pg2 logger
-        #{id => ep_monitor_proxy,
-            start => {ep_monitor_proxy, start_link, []},
-            modules => [ep_monitor_proxy]},
-
-        %% entire cluster monitoring
-        #{id => ep_cluster_history,
-            start => {ep_cluster_history, start_link, []},
-            modules => [ep_cluster_history]},
-
-        % system monitor: collects all jobs statistical data to
-        % fire events, depends on system_event to be alive
-        #{id => ep_monitor,
-            start => {ep_monitor, start_link, []},
-            modules => [ep_monitor]},
-
-        % file/console node-only monitor
-        #{id => ep_file_log_sup,
-            start => {?MODULE, start_link, [ep_file_log_sup, ep_file_log]},
-            type => supervisor,
-            modules => [?MODULE]},
-
-        % cluster-wide monitoring
-        #{id => ep_cluster_monitor_sup,
-            start => {?MODULE, start_link, [ep_cluster_monitor_sup, ep_cluster_monitor]},
-            type => supervisor,
-            modules => [?MODULE]}
-
-    ],
-    {ok, {SupFlags, ChildSpecs}}.
diff --git a/src/ep_prof.erl b/src/ep_prof.erl
deleted file mode 100644
index eedf160..0000000
--- a/src/ep_prof.erl
+++ /dev/null
@@ -1,580 +0,0 @@
-%%%-------------------------------------------------------------------
-%%% @author Maxim Fedorov
-%%% @copyright (C) 2019, Maxim Fedorov
-%%% @doc
-%%% Call time profiler, and a series of helper functions.
-%%% @end
--module(ep_prof).
--author("maximfca@gmail.com").
-
--behaviour(gen_server).
-
-%% Simple (immediate) API, useful for the benchmarking process
--export([
-    time/1,
-    time/4,
-    trace/1,
-    trace/4,
-    record/2,
-    sample/4,
-    format_callgrind/1
-]).
-
-%% Continuous profiling API
--export([
-    start_trace/0,
-    start_trace/1,
-    stop_trace/0,
-    collect/0,
-    collect/1,
-    format/0,
-    format/1
-]).
-
-%% Generic start/stop API, gen_server callbacks
--export([
-    start/0,
-    start_link/0,
-    stop/0,
-    stop/1,
-
-    init/1,
-    handle_call/3,
-    handle_cast/2,
-    handle_info/2,
-    terminate/2
-]).
-
-%%%===================================================================
-%%% API
-
-%% Trace specification, used for the erlang:trace/3 call.
-%% @see erlang:trace/3
--type pid_port_spec() :: pid() | port() | all | processes | ports |
-    existing | existing_processes | existing_ports |
-    new | new_processes | new_ports.
-
-%% Caller: MFA + how many times this caller called the function under tracing.
--type caller() :: {
-    CallerMod :: module(),
-    CallerFun :: atom(),
-    CallerArity :: non_neg_integer(),
-    CallerCount :: pos_integer()
-}.
-
-%% Progress callback: MF & initial state.
--type progress_callback() :: silent | {module(), atom(), term()}.
-
-%% Result of time/1,4 call.
--type module_time() :: {Module :: module(),
-    [{
-        Fun :: atom(), Arity :: non_neg_integer(),
-        Count :: non_neg_integer(), TimeMs :: non_neg_integer()
-    }]}.
-
-
-%% @doc
-%% Performs call time tracing for all processes, all modules and all functions.
-%% @param TimeMs milliseconds to wait.
-%% Does not collect any call stacks. Fast enough to be used even on highly-loaded -%% systems running thousands of processes. --spec time(TimeMs :: non_neg_integer()) -> [module_time()]. -time(TimeMs) -> - time(TimeMs, all, [{'_', '_', '_'}], silent). - -%% @doc -%% Performs call time tracing for specific processes, modules or functions. -%% Accepts a progress report callback. --spec time(TimeMs :: non_neg_integer(), pid_port_spec(), erlang:trace_pattern_mfa(), - progress_callback()) -> [module_time()]. -time(TimeMs, PidPortSpec, MFAList, Progress) when is_list(MFAList), tuple_size(hd(MFAList)) =:= 3 -> - [erlang:trace_pattern(MFA, true, [local, call_time]) || MFA <- MFAList], - erlang:trace(PidPortSpec, true, [silent, call]), - receive after TimeMs -> ok end, - erlang:trace(PidPortSpec, false, [call]), - Data = collect_impl(undefined, Progress), - erlang:trace_pattern({'_', '_', '_'}, false, [local, call_time]), - process_callers(Data, #{}). - -%% Result of trace/1,4 call. --type module_trace() :: {Module :: module(), - [{ - Fun :: atom(), Arity :: non_neg_integer(), - Count :: non_neg_integer(), TimeMs :: non_neg_integer(), - [caller()] - }]}. - -%% @doc -%% Performs call time tracing for all processes, all modules and all functions, and -%% also collects information about the caller. -%% This is considerably slower than using time/1, because actual tracing is being used. -%% @param TimeMs milliseconds to wait --spec trace(TimeMs :: non_neg_integer()) -> [module_trace()]. -trace(TimeMs) -> - trace(TimeMs, all, [{'_', '_', '_'}], silent). - -%% @doc -%% Performs call time tracing, and collects information about the caller. --spec trace(TimeMs :: non_neg_integer(), pid_port_spec(), erlang:trace_pattern_mfa(), - progress_callback()) -> [module_trace()]. -trace(TimeMs, PidPortSpec, MFAList, Progress) -> - TracerPid = spawn_link(fun tracer/0), - TraceSpec = [{'_', [], [{message, {{cp, {caller}}}}]}], - [erlang:trace_pattern(MFA, TraceSpec, [local, call_time]) || MFA <- MFAList], - erlang:trace(PidPortSpec, true, [arity, call, {tracer, TracerPid}]), - receive after TimeMs -> ok end, - erlang:trace(PidPortSpec, false, [call]), - Data = collect_impl(TracerPid, Progress), - erlang:trace_pattern({'_', '_', '_'}, false, [local, call_time]), - {Counts, #{}} = fetch_trace(TracerPid, Progress, 240000), - Callers = collect_callers(Counts), - process_callers(Data, Callers). - -%% @doc -%% Traces calls to MFA and records arguments passed. --spec record(TimeMs :: non_neg_integer(), - {Mod :: module() | '_', Fun :: atom() | '_', Arity :: non_neg_integer() | '_'}) -> [[term()]] | not_found. -record(TimeMs, MFA) -> - TracerPid = spawn_link(fun tracer/0), - TraceSpec = [{'_', [], [{message, {{cp, {caller}}}}]}], - case erlang:trace_pattern(MFA, TraceSpec, [local]) of - 0 -> - erlang:error(not_found); - _Trc -> - erlang:trace(all, true, [call, {tracer, TracerPid}]), - receive after TimeMs -> ok end, - erlang:trace(all, false, [call]), - erlang:trace_pattern({'_', '_', '_'}, false, [local]), - {[], Samples} = fetch_trace(TracerPid, undefined, infinity), - Samples - end. - -%% @doc -%% Records samples of calls to any function in MFA list passed. 
-sample(PidPortSpec, TimeMs, MFAList, Progress) when is_list(MFAList), tuple_size(hd(MFAList)) =:= 3 -> - TracerPid = spawn_link(fun tracer/0), - TraceSpec = [{'_', [], [{message, {{cp, {caller}}}}]}], - [erlang:trace_pattern(MFA, TraceSpec, [local]) || MFA <- MFAList], - erlang:trace(PidPortSpec, true, [call, {tracer, TracerPid}]), - receive after TimeMs -> ok end, - erlang:trace(PidPortSpec, false, [call]), - erlang:trace_pattern({'_', '_', '_'}, false, [local]), - {[], Samples} = fetch_trace(TracerPid, Progress, infinity), - Samples. - -format_callgrind(Data) -> - % build a large iolist, write it down - Lines = lists:map(fun ({Mod, Funs}) -> - [io_lib:format("fl=~s\n", [Mod]), lists:map(fun ({F,A,C,Us,Calls}) -> - Called = [io_lib:format("cfl=~s\ncfn=~s:~b\ncalls=~b 1\n1 1 1\n", [CM, CF, CA, CC]) || {CM,CF,CA,CC} <- Calls], - io_lib:format("fn=~s/~b\n1 ~b ~b\n", [F,A,Us,C]) ++ Called; - ({F,A,C,Us}) -> - io_lib:format("fn=~s/~b\n1 ~b ~b\n", [F,A,Us,C]) - end, Funs)] - end, Data), - iolist_to_binary([<<"# callgrind format\nevents: CallTime Calls\n\n">> | Lines]). - -%%%=================================================================== -%%% gen_server API -%%%=================================================================== -start_trace() -> - start_trace(#{}). - -start_trace(Options0) -> - Options = maps:merge(#{spec => all, sample => none, arity => true}, Options0), - gen_server:call(?MODULE, {start_trace, Options}, infinity). - -stop_trace() -> - gen_server:call(?MODULE, stop_trace, infinity). - -collect() -> - collect(#{}). - -collect(Options0) -> - Options = maps:merge(#{progress => undefined}, Options0), - gen_server:call(?MODULE, {collect, Options}, infinity). - -format() -> - format(#{}). - -format(Options0) -> - Options = maps:merge(#{sort => none, format => callgrind, progress => undefined}, Options0), - gen_server:call(?MODULE, {format, Options}, infinity). - -%%%=================================================================== -%%% Generic start/stop API -%%%=================================================================== - --record(state, { - tracing = false :: boolean(), - tracer = undefined :: undefined | pid(), - call_time = undefined :: undefined | [{module(), [{{Fun :: atom(), non_neg_integer()}, - Count :: non_neg_integer(), Time :: non_neg_integer()}]}], % {ets, [{internal_select_delete,2,1,1}, {internal_delete_all,2,3,4}]} - samples = undefined :: undefined | {[{module(), atom(), non_neg_integer(), term()}], term()}, - traced_procs :: term() -}). - - -%%-------------------------------------------------------------------- -%% @doc -%% Starts call tracing profiler outside of supervision tree. --spec(start() -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). -start() -> - gen_server:start({local, ?MODULE}, ?MODULE, [], []). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts call tracing profiler as a part of supervision tree --spec(start_link() -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). -start_link() -> - gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). - -%%-------------------------------------------------------------------- -%% @doc -%% Stops call tracing profiler -stop() -> - stop(infinity). - --spec stop(Timeout :: integer() | infinity) -> ok. -stop(Timeout) -> - gen_server:stop(?MODULE, shutdown, Timeout). 
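time/1,4 above are built on the VM's call_time counters: erlang:trace_pattern/3 arms the counters for matching functions, erlang:trace/3 with the `silent` flag enables counting without emitting trace messages, and erlang:trace_info/2 reads the accumulated counts back. A stand-alone sketch of those primitives (run it in a scratch shell; trace flags affect the whole node, and it assumes nothing else is tracing):

```erlang
%% Sketch: count calls to lists:seq/2 and their accumulated runtime.
call_time_demo() ->
    erlang:trace_pattern({lists, seq, 2}, true, [local, call_time]),
    erlang:trace(all, true, [silent, call]),
    _ = [lists:seq(1, 100) || _ <- lists:seq(1, 1000)],
    erlang:trace(all, false, [call]),
    %% returns {call_time, [{Pid, Count, Seconds, Microseconds}]}
    Info = erlang:trace_info({lists, seq, 2}, call_time),
    erlang:trace_pattern({'_', '_', '_'}, false, [local, call_time]),
    Info.
```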
- -%%%=================================================================== -%%% gen_server callbacks -%%%=================================================================== - --spec init(Args :: term()) -> {ok, State :: #state{}}. -init([]) -> - {ok, #state{}}. - -%%-------------------------------------------------------------------- -%% @private -%% @doc -%% Handling call messages --spec(handle_call(Request :: term(), From :: {pid(), Tag :: term()}, - State :: #state{}) -> - {reply, Reply :: term(), NewState :: #state{}} | - {reply, Reply :: term(), NewState :: #state{}, timeout() | hibernate} | - {noreply, NewState :: #state{}} | - {noreply, NewState :: #state{}, timeout() | hibernate} | - {stop, Reason :: term(), Reply :: term(), NewState :: #state{}} | - {stop, Reason :: term(), NewState :: #state{}}). - -handle_call({start_trace, #{spec := PidPortSpec, sample := Sample, arity := Arity0}}, _From, #state{tracing = false} = State) -> - % trace call_time for all - erlang:trace_pattern({'_', '_', '_'}, true, [local, call_time]), - % - Tracer = case Sample of - [_|_] -> - % trace selected MFAs - Arity = case Arity0 of true -> [arity]; _ -> [] end, - TracerPid = spawn_link(fun tracer/0), - TraceSpec = [{'_', [], [{message, {{cp, {caller}}}}]}], - [erlang:trace_pattern(MFA, TraceSpec, [local]) || MFA <- Sample], - erlang:trace(PidPortSpec, true, [call, {tracer, TracerPid}] ++ Arity), - TracerPid; - _ -> - erlang:trace(PidPortSpec, true, [silent, call]), - undefined - end, - {reply, ok, State#state{tracing = true, traced_procs = PidPortSpec, tracer = Tracer}}; - -handle_call({start_trace, _PidPortSpec, _}, _From, #state{tracing = true} = State) -> - {reply, {error, already_started}, State}; - -handle_call(stop_trace, _From, #state{tracing = true, traced_procs = PidPortSpec} = State) -> - erlang:trace(PidPortSpec, false, [call]), - erlang:trace_pattern({'_', '_', '_'}, pause, [local, call_time]), - {reply, ok, State#state{tracing = false}}; - -handle_call(stop_trace, _From, #state{tracing = false} = State) -> - {reply, {error, not_started}, State}; - -handle_call({collect, _}, _From, #state{tracing = true} = State) -> - {reply, {error, tracing}, State}; - -handle_call({collect, #{progress := Progress}}, _From, #state{tracing = false, tracer = Tracer} = State) -> - Data = collect_impl(Tracer, Progress), - erlang:trace_pattern({'_', '_', '_'}, false, [local, call_time]), - % request data from tracer (ask for Progress too!) - Samples = fetch_trace(Tracer, Progress, infinity), - % - {reply, ok, State#state{call_time = Data, samples = Samples}}; - -handle_call({format, _}, _From, #state{tracing = true} = State) -> - {reply, {error, tracing}, State}; -handle_call({format, _}, _From, #state{call_time = undefined} = State) -> - {reply, {error, no_trace}, State}; - -% no sampling/tracing done -handle_call({format, #{format := Format, sort := SortBy, progress := Progress}}, _From, - #state{tracing = false, call_time = Data, samples = undefined} = State) -> - {reply, {ok, format_analysis(Data, #{}, {Progress, export}, Format, SortBy)}, State}; - -% include tracing -handle_call({format, #{format := Format, sort := SortBy, progress := Progress}}, _From, - #state{tracing = false, call_time = Data, samples = {Count, Trace}} = State) -> - Callers = collect_callers(Count), - {reply, {ok, format_analysis(Data, Callers, {Progress, export}, Format, SortBy), Trace}, State}; - -handle_call(_Request, _From, _State) -> - error(badarg). - -handle_cast(_Request, _State) -> - error(badarg). 
- --spec(handle_info(Info :: timeout() | term(), State :: #state{}) -> - {noreply, NewState :: #state{}} | - {noreply, NewState :: #state{}, timeout() | hibernate} | - {stop, Reason :: term(), NewState :: #state{}}). - -handle_info({'DOWN',_MRef,process,Pid,normal}, #state{tracer = Pid, tracing = false} = State) -> - {noreply, State#state{tracer = undefined}}; - -handle_info(_Info, _State) -> - error(badarg). - --spec(terminate(Reason :: (normal | shutdown | {shutdown, term()} | term()), - State :: #state{}) -> term()). -terminate(_Reason, #state{tracing = true, traced_procs = PidPortSpec}) -> - erlang:trace(PidPortSpec, false, [call]), - erlang:trace_pattern({'_', '_', '_'}, false, [local, call_time]); -terminate(_Reason, _State) -> - ok. - -%%%=================================================================== -%%% Tracer process. Discards PIDs, either collects ETS table -%% with {{M, F, Arity, Caller}, Count}, or a map {Caller, [{M, F, Args}]} for -%% extended tracing. -tracer() -> - process_flag(message_queue_data, off_heap), - TracerTab = ets:new(tracer, [set, private]), - tracer_loop(TracerTab, #{}). - -tracer_loop(Tab, Data) -> - receive - {trace, _Pid, call, {M, F, Arity}, {cp, Caller}} when is_integer(Arity) -> - ets:update_counter(Tab, {M, F, Arity, Caller}, 1, {{M, F, Arity, Caller}, 1}), - tracer_loop(Tab, Data); - {trace, _Pid, call, MFA, {cp, Caller}} -> - Data1 = maps:update_with(Caller, fun (L) -> [MFA|L] end, [MFA], Data), - tracer_loop(Tab, Data1); - {data, Control} -> - Control ! {data, ets:tab2list(Tab), Data}, - ets:delete_all_objects(Tab), - tracer_loop(Tab, []); - stop -> - ok - end. - -%%%=================================================================== -%%% Internal functions -%%%=================================================================== - -pmap(List, Extra, Fun, Message, Timeout) -> - Parent = self(), - Workers = [spawn_monitor(fun() -> - Parent ! {self(), Fun(Item, Extra)} - end) || Item <- List], - gather(Workers, {Message, 0, length(List), undefined}, Timeout, []). - -gather([], _Progress, _Timeout, Acc) -> - {ok, Acc}; -gather(Workers, {Message, Done, Total, PrState} = Progress, Timeout, Acc) -> - receive - {Pid, Res} when is_pid(Pid) -> - case lists:keytake(Pid, 1, Workers) of - {value, {Pid, MRef}, NewWorkers} -> - erlang:demonitor(MRef, [flush]), - NewPrState = report_progress(Message, Done + 1, Total, PrState), - gather(NewWorkers, {Message, Done + 1, Total, NewPrState}, Timeout, [Res | Acc]); - false -> - gather(Workers, Progress, Timeout, Acc) - end; - {'DOWN', MRef, process, Pid, Reason} -> - case lists:keyfind(Pid, 1, Workers) of - {Pid, MRef} -> - % stop collecting results, as they're broken anyway, exit all spawned procs - [exit(P, kill) || {P, _MRef} <- Workers], - {error, Reason}; - false -> - gather(Workers, Progress, Timeout, Acc) - end - after Timeout -> - timeout - end. - -collect_impl(Tracer, Progress) -> - {ok, Data} = pmap( - [{'$system', undefined} | code:all_loaded()], - Tracer, - fun trace_time/2, - {Progress, trace_info}, - infinity), - Data. 
-
-trace_time({'$system', _}, Tracer) ->
-    Map = lists:foldl(fun ({M, F, A}, Acc) ->
-        maps:update_with(M, fun (L) -> [{F, A} | L] end, [], Acc)
-    end, #{}, erlang:system_info(snifs)),
-    SysMods = maps:map(fun(Mod, Funs) ->
-        lists:filtermap(fun ({F, A}) ->
-            collate_mfa(F, A, erlang:trace_info({Mod, F, A}, call_time), Tracer)
-        end, Funs)
-    end, Map),
-    maps:to_list(SysMods);
-trace_time({Mod, _}, Tracer) ->
-    [{Mod, lists:filtermap(fun ({F, A}) ->
-        collate_mfa(F, A, erlang:trace_info({Mod, F, A}, call_time), Tracer)
-    end, Mod:module_info(functions))}].
-
-collate_mfa(F, A, {call_time, List}, Tracer) when is_list(List) ->
-    {Cnt, Clock} = lists:foldl(
-        fun ({Pid, _, _, _}, Acc) when Tracer =:= Pid ->
-            Acc;
-        ({_, C, S, U}, {Cnt, Us}) ->
-            {Cnt + C, Us + U + S * 1000000}
-        end, {0, 0}, List),
-    {true, {F, A, Cnt, Clock}};
-collate_mfa(_, _, _, _) ->
-    false.
-
-% Sorting support
-expand_mods(Data) ->
-    List = lists:append(Data),
-    lists:append([[{Mod, F, A, C, T} || {F, A, C, T} <- Funs] || {Mod, Funs} <- List]).
-
-sort_column(call_time) -> 5;
-sort_column(call_count) -> 4;
-sort_column(module) -> 1;
-sort_column('fun') -> 2.
-
-% [{M,F,A,Caller,Count}]
-collect_callers(Counts) ->
-    lists:foldl(fun ({{M,F,A,Caller}, Count}, Map) ->
-        maps:update_with(Caller, fun (L) ->
-            [{M,F,A,Count} | L]
-        end, [{M,F,A,Count}], Map)
-    end, #{}, Counts).
-
-format_analysis(Data, _, _Progress, none, none) ->
-    lists:append(Data);
-format_analysis(Data, _, _Progress, none, Order) ->
-    lists:append([lists:reverse(lists:keysort(sort_column(Order), expand_mods(Data)))]);
-
-format_analysis(Data, _, _Progress, text, none) ->
-    io_lib:format("~p", [lists:append(Data)]);
-format_analysis(Data, _, _Progress, text, Order) ->
-    io_lib:format("~p", [lists:reverse(lists:keysort(sort_column(Order), expand_mods(Data)))]);
-
-format_analysis(Data, CountMap, Progress, callgrind, _) ->
-    % prepare data in parallel
-    {ok, Lines} = pmap(Data, CountMap, fun format_callgrind/2, Progress, infinity),
-    % concatenate binaries
-    merge_binaries(Lines, <<"# callgrind format\nevents: CallTime Calls\n">>).
-
-merge_binaries([], Binary) ->
-    Binary;
-merge_binaries([H|T], Binary) ->
-    merge_binaries(T, <<Binary/binary, H/binary>>).
-
-format_callgrind_line({Mod, Funs}, {Count, Acc}) ->
-    Mt = atom_to_binary(Mod, latin1),
-    NextAcc = <<Acc/binary, <<"fl=">>/binary, Mt/binary, 10:8>>,
-    {Count, lists:foldl(
-        fun ({F, A, C, Us}, FunAcc) ->
-            Ft = atom_to_binary(F, latin1),
-            At = integer_to_binary(A),
-            Ut = integer_to_binary(Us),
-            Ct = integer_to_binary(C),
-            NoCallsLine = <<FunAcc/binary, <<"fn={">>/binary, Ft/binary, $,, At/binary,
-                <<"}\n1 ">>/binary, Ut/binary, $ , Ct/binary, 10:8>>,
-            case maps:get({Mod, F, A}, Count, undefined) of
-                undefined ->
-                    NoCallsLine;
-                Callees when is_list(Callees) ->
-                    lists:foldl(fun ({CM, CF, CA, CC}, CallAcc) ->
-                        CMt = atom_to_binary(CM, latin1),
-                        CFt = atom_to_binary(CF, latin1),
-                        CAt = integer_to_binary(CA),
-                        CCt = integer_to_binary(CC),
-                        <<CallAcc/binary, <<"cfl=">>/binary, CMt/binary, <<"\ncfn={">>/binary, CFt/binary, $,, CAt/binary,
-                            <<"}\ncalls=">>/binary, CCt/binary, <<" 1\n1 1 1\n">>/binary>>
-                    end, NoCallsLine, Callees)
-            end
-        end, NextAcc, Funs)}.
-
-format_callgrind(ModList, Count) ->
-    element(2, lists:foldl(fun format_callgrind_line/2, {Count, <<>>}, ModList)).
-
-report_progress({Progress, Message}, Done, Total, PrState) when is_function(Progress, 4) ->
-    Progress(Message, Done, Total, PrState);
-report_progress({Progress, Message}, Done, Total, PrState) when is_pid(Progress) ->
-    Progress !
{Message, Done, Total, PrState}; - -% silent progress printer -report_progress({silent, _}, _, _, State) -> - State; -% default progress printer -report_progress({undefined, _}, Done, Done, _) -> - io:format(group_leader(), " complete.~n", []), - undefined; -report_progress({undefined, Info}, _, Total, undefined) -> - io:format(group_leader(), "~-20s started: ", [Info]), - Total div 10; -report_progress({undefined, _}, Done, Total, Next) when Done > Next -> - io:format(group_leader(), " ~b% ", [Done * 100 div Total]), - Done + (Total div 10); -report_progress(_, _, _, PrState) -> - PrState. - -tracer_qlen(Pid) -> - case erlang:process_info(Pid, message_queue_len) of - {message_queue_len, QLen} -> - QLen; - _ -> - 0 - end. - -wait_for_data(_, Progress, PrState, QDone, QTotal, Timeout) when Timeout =< 0 -> - report_progress(Progress, QDone, QTotal, PrState), - timeout; -wait_for_data(TracerPid, Progress, PrState, QDone, QTotal, Timeout) -> - receive - {data, Count, Trace} -> - report_progress(Progress, QDone, QDone, PrState), - {Count, Trace} - after 1000 -> - NewQTotal = tracer_qlen(TracerPid), - NewQDone = QDone + (QTotal - NewQTotal), % yes it can go backwards, but that means tracing is still on - NewPrState = report_progress(Progress, NewQDone, NewQTotal, PrState), - wait_for_data(TracerPid, Progress, NewPrState, NewQDone, NewQTotal, - if is_integer(Timeout) -> Timeout - 1000; true -> infinity end) - end. - -fetch_trace(TracerPid, Progress, Timeout) when is_pid(TracerPid) -> - TracerPid ! {data, self()}, - TracerPid ! stop, - wait_for_data(TracerPid, Progress, undefined, 0, tracer_qlen(TracerPid), Timeout); -fetch_trace(_, _, _) -> - undefined. - - -% collate: Data [{Module, [Fun,Arity,Count,Sec,Usec]}] === {erl_prim_loader,[{debug,2,0,0}, -% and -> {M,F,Arity} => {M,F,Arity,Count} === #{{ctp,trace,4} => [{erlang,trace,3,2}], -process_callers(Data, Callers) -> - lists:filtermap( - fun ({Mod, Funs}) -> - case lists:filtermap( - fun ({_,_,0,0}) -> - false; - ({F,Arity,C,Us}) -> - {true, {F,Arity,C,Us,maps:get({Mod,F,Arity}, Callers, [])}} - end, Funs) of - [] -> false; - Other -> {true, {Mod, Other}} - end - end, - lists:append(Data)). diff --git a/src/erlperf.app.src b/src/erlperf.app.src index da22363..267cbc0 100644 --- a/src/erlperf.app.src +++ b/src/erlperf.app.src @@ -1,11 +1,9 @@ {application, erlperf, [{description, "Erlang Performance & Benchmarking Suite"}, - {vsn, "1.1.5"}, + {vsn, "2.0.0"}, {registered, [ - erlperf_sup, ep_monitor_sup, ep_job_sup, ep_monitor, - ep_history, ep_file_log, ep_cluster_history, - ep_cluster_monitor, ep_monitor_proxy, - ep_job_event, ep_system_event + erlperf_sup, erlperf_job_sup, erlperf_monitor, + erlperf_history, erlperf_file_log, erlperf_cluster_monitor ]}, {mod, {erlperf_app, []}}, {applications, @@ -17,6 +15,5 @@ {modules, []}, {licenses, ["BSD 3-clause clear"]}, - {links, [{"Github", "https://github.com/max-au/erlperf"}]}, - {exclude_files, ["priv/"]} + {links, [{"Github", "https://github.com/max-au/erlperf"}]} ]}. diff --git a/src/erlperf.erl b/src/erlperf.erl index 6c2c72b..424364d 100644 --- a/src/erlperf.erl +++ b/src/erlperf.erl @@ -1,11 +1,6 @@ -%#!/usr/bin/env escript - -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov +%%% @copyright (C) 2019-2022, Maxim Fedorov %%% @doc -%%% Benchmark/squeeze implementation, does not start any permanent -%%% jobs. -%%% Command line interface. +%%% Application API. Benchmark/squeeze implementation. %%% @end -module(erlperf). -author("maximfca@gmail.com"). 
@@ -13,27 +8,18 @@ %% Public API for single-run simple benchmarking
 %% Programmatic access.
 -export([
+    benchmark/3,
+    compare/2,
+    record/4,
     run/1,
     run/2,
     run/3,
-    compare/2,
-    record/4,
     start/2,
-    load/1,
-    profile/2
-]).
-
-%% Public API: escript
--export([
-    main/1
+    time/2
 ]).
 
-%% Formatters & convenience functions.
--export([
-    format/3,
-    format_number/1,
-    format_size/1
-]).
+%% accepted code variants
+-type code() :: erlperf_job:code_map() | erlperf_job:callable().
 
 %% node isolation options:
 -type isolation() :: #{
@@ -45,13 +31,14 @@
     % number of concurrently running workers (defaults to 1)
     % ignored when running concurrency test
     concurrency => pos_integer(),
-    %% sampling interval: default is 1 second (to measure QPS)
-    sample_duration => pos_integer(),
+    %% sampling interval: default is 1000 milliseconds (to measure QPS)
+    %% 'undefined' duration is used as a flag for low-overhead benchmarking
+    sample_duration => pos_integer() | undefined,
     %% warmup samples: first 'warmup' cycles are ignored (defaults to 0)
-    warmup => pos_integer(),
+    warmup => non_neg_integer(),
     %% number of samples to take, defaults to 3
    samples => pos_integer(),
-    %% coefficient of variation, when supplied, at least 'cycles'
+    %% coefficient of variation, when supplied, at least 'samples'
     %% samples must be within the specified coefficient
     %% experimental feature allowing to benchmark processes with
     %% wildly jumping throughput
@@ -90,48 +77,65 @@ %% amount of runners running at that time.
 -type concurrency_test_result() :: concurrency_result() |
     {Max :: concurrency_result(), [concurrency_result()]}.
 
-%% Profile options
--type profile_options() :: #{
-    profiler => fprof,
-    format => term | binary | string
-}.
-
 -export_type([isolation/0, run_options/0, concurrency_test/0]).
 
 %% Milliseconds, timeout for any remote node operation
 -define(REMOTE_NODE_TIMEOUT, 10000).
 
-%% @doc Simple case.
-%% Runs a single benchmark, and returns a steady QPS number.
-%% Job specification may include suite & worker init parts, suite cleanup,
-%% worker code, job name and identifier (id).
--spec run(ep_job:code()) -> non_neg_integer().
-run(Code) ->
-    [Series] = run_impl([Code], #{}, undefined),
-    Series.
 
 %% @doc
-%% Single throughput measurement cycle.
-%% Additional options are applied.
--spec run(ep_job:code(), run_options()) -> run_result().
-run(Code, RunOptions) ->
-    [Series] = run_impl([Code], RunOptions, undefined),
-    Series.
+%% Generic execution engine. Supply multiple code versions, run options and either
+%% `undefined' for usual benchmarking, or squeeze mode settings for a concurrency test.
+%% @end
+%% TODO: figure out what is wrong with this spec.
+%% Somehow, having run_result() in this spec makes Dialyzer completely
+%% ignore the concurrency_test_result() return option.
+%%-spec benchmark([erlperf_job:code()], run_options(), concurrency_test() | undefined) ->
+%%    run_result() | [run_result()] | concurrency_test_result().
+benchmark(Codes, #{isolation := _Isolation} = Options, ConOpts) ->
+    erlang:is_alive() orelse erlang:error(not_alive),
+    %% isolation requested: need to rely on cluster_monitor and other distributed things.
+ {Peers, Nodes} = prepare_nodes(length(Codes)), + Opts = maps:remove(isolation, Options), + try + %% no timeout here (except that rpc itself could time out) + Promises = + [erpc:send_request(Node, erlperf, run, [Code, Opts, ConOpts]) + || {Node, Code} <- lists:zip(Nodes, Codes)], + %% now wait for everyone to respond + [erpc:receive_response(Promise) || Promise <- Promises] + catch + error:{exception, Reason, Stack} -> + erlang:raise(error, Reason, Stack) + after + stop_nodes(Peers, Nodes) + end; + +%% no isolation requested, do normal in-BEAM test +benchmark(Codes, Options, ConOpts) -> + {Jobs, Samples, Monitors} = start_jobs(Codes, []), + try + benchmark(Jobs, Options, ConOpts, Samples) + after + [erlang:demonitor(Mon, [flush]) || Mon <- Monitors], + [(catch gen:stop(Pid)) || Pid <- Jobs] + end. %% @doc -%% Concurrency measurement run. --spec run(ep_job:code(), run_options(), concurrency_test()) -> - concurrency_test_result(). -run(Code, RunOptions, ConTestOpts) -> - [Series] = run_impl([Code], RunOptions, ConTestOpts), - Series. +%% Comparison run: starts several jobs and measures throughput for +%% all of them at the same time. +%% All job options are honoured, and if there is isolation applied, +%% every job runs its own node. +-spec compare([code()], run_options()) -> [run_result()]. +compare(Codes, RunOptions) -> + benchmark([code(Code) || Code <- Codes], RunOptions, undefined). %% @doc %% Records call trace, so it could be used to benchmark later. -spec record(module(), atom(), non_neg_integer(), pos_integer()) -> [[{module(), atom(), [term()]}]]. record(Module, Function, Arity, TimeMs) -> - TracerPid = spawn_link(fun tracer/0), + TracerPid = spawn_link(fun rec_tracer/0), TraceSpec = [{'_', [], []}], MFA = {Module, Function, Arity}, erlang:trace_pattern(MFA, TraceSpec, [global]), @@ -145,380 +149,161 @@ record(Module, Function, Arity, TimeMs) -> Samples end. -%% @doc -%% Starts a continuously running job in a new BEAM instance. --spec start(ep_job:code(), isolation()) -> {ok, node(), pid()}. -start(Code, _Isolation) -> - {_Peers, [Node]} = prepare_nodes(1), - {ok, Pid} = rpc:call(Node, ep_job, start, [Code], ?REMOTE_NODE_TIMEOUT), - {ok, Node, Pid}. +%% @doc Simple case. +%% Runs a single benchmark, and returns a steady QPS number. +%% Job specification may include suite & worker init parts, suite cleanup, +%% worker code, job name and identifier (id). +-spec run(code()) -> non_neg_integer(). +run(Code) -> + [Series] = benchmark([code(Code)], #{}, undefined), + Series. %% @doc -%% Loads previously saved job (from file in the priv_dir) --spec load(file:filename()) -> ep_job:code(). -load(Filename) -> - {ok, Bin} = file:read_file(filename:join(code:priv_dir(erlperf), Filename)), - binary_to_term(Bin). +%% Single throughput measurement cycle. +%% Additional options are applied. +-spec run(code(), run_options()) -> run_result(). +run(Code, RunOptions) -> + [Series] = benchmark([code(Code)], RunOptions, undefined), + Series. %% @doc -%% Runs a profiler for specified code --spec profile(ep_job:code(), profile_options()) -> binary() | string() | [term()]. -profile(Code, Options) -> - {ok, Job} = ep_job:start_link(Code), - Profiler = maps:get(profiler, Options, fprof), - Format = maps:get(format, Options, string), - Result = ep_job:profile(Job, Profiler, Format), - ep_job:stop(Job), - Result. +%% Concurrency measurement run. +-spec run(code() | module(), run_options() | atom(), concurrency_test() | [term()]) -> + run_result() | concurrency_test_result(). 
+run(Module, Function, Args) when is_atom(Module), is_atom(Function), is_list(Args) -> + %% this typo is so common that I decided to have this as an unofficial API + run({Module, Function, Args}); +run(Code, RunOptions, ConTestOpts) -> + [Series] = benchmark([code(Code)], RunOptions, ConTestOpts), + Series. %% @doc -%% Comparison run: starts several jobs and measures throughput for -%% all of them at the same time. -%% All job options are honoured, and if there is isolation applied, -%% every job runs its own node. --spec compare([ep_job:code()], run_options()) -> [run_result()]. -compare(Codes, RunOptions) -> - run_impl(Codes, RunOptions, undefined). - -%% @doc Simple command-line benchmarking interface. -%% Example: erlperf 'rand:uniform().' --spec main([string()]) -> no_return(). -main(Args) -> - Prog = #{progname => "erlperf"}, - try - erlang:process_flag(trap_exit, true), - RunOpts = argparse:parse(Args, arguments(), Prog), - Code0 = [parse_code(C) || C <- maps:get(code, RunOpts)], - {_, Code} = lists:foldl(fun callable/2, {RunOpts, Code0}, [init, init_runner, done]), - COpts = case maps:find(squeeze, RunOpts) of - {ok, true} -> - maps:with([min, max, threshold], RunOpts); - error -> - #{} - end, - ROpts = maps:with([concurrency, samples, sample_duration, cv, isolation, warmup, verbose, profile], - RunOpts), - main_impl(ROpts, COpts, Code) - catch - error:{argparse, Reason} -> - Fmt = argparse:format_error(Reason, arguments(), Prog), - io:format("error: ~s", [Fmt]); - error:{badmatch, {error, {{parse, Reason}, _}}} -> - format(error, "parse error: ~s~n", [Reason]); - error:{badmatch, {error, {{compile, Reason}, _}}} -> - format(error, "compile error: ~s~n", [Reason]); - error:{badmatch, {error, {Reason, Stack}}} -> - format(error, "error starting job: ~ts~n~p~n", [Reason, Stack]); - Cls:Rsn:Stack -> - format(error, "Unhandled exception: ~ts:~p~n~p~n", [Cls, Rsn, Stack]) - end. +%% Starts a new continuously running job with the specified concurrency. +%% Requires `erlperf' application to be started. +-spec start(code(), Concurrency :: non_neg_integer()) -> pid(). +start(Code, Concurrency) -> + {ok, Job} = supervisor:start_child(erlperf_job_sup, [code(Code)]), + ok = erlperf_job:set_concurrency(Job, Concurrency), + Job. -callable(Type, {Args, Acc}) -> - {Args, merge_callable(Type, maps:get(Type, Args, []), Acc, [])}. - -merge_callable(_Type, [], Acc, Merged) -> - lists:reverse(Merged) ++ Acc; -merge_callable(_Type, _, [], Merged) -> - lists:reverse(Merged); -merge_callable(Type, [[H] | T], [HA | Acc], Merged) -> - merge_callable(Type, T, Acc, [HA#{Type => H} | Merged]). - -parse_code(Code) -> - case lists:last(Code) of - $. -> - #{runner => Code}; - $} when hd(Code) =:= ${ -> - % parse MFA tuple with added "." - #{runner => parse_mfa_tuple(Code)}; - _ -> - case file:read_file(Code) of - {ok, Bin} -> - #{runner => parse_call_record(Bin)}; - Other -> - error({"Unable to read file with call recording\n" ++ - "Did you forget to end your function with period? (dot)", - Code, Other}) - end - end. - -parse_mfa_tuple(Code) -> - {ok, Scan, _} = erl_scan:string(Code ++ "."), - {ok, Term} = erl_parse:parse_term(Scan), - Term. - -parse_call_record(Bin) -> - binary_to_term(Bin). - -arguments() -> - #{help => - "erlperf 'timer:sleep(1).'\n" - "erlperf 'rand:uniform().' 'crypto:strong_rand_bytes(2).' --samples 10 --warmup 1\n" - "erlperf 'pg2:create(foo).' --squeeze\n" - "erlperf 'pg2:join(foo, self()), pg2:leave(foo, self()).' --init 1 'pg2:create(foo).' 
--done 1 'pg2:delete(foo).'\n",
-        arguments => [
-            #{name => concurrency, short => $c, long => "-concurrency",
-                help => "number of concurrently executed runner processes",
-                default => 1, type => {int, [{min, 1}, {max, 1024 * 1024 * 1024}]}},
-            #{name => sample_duration, short => $d, long => "-duration",
-                help => "single sample duration", default => 1000,
-                type => {int, [{min, 1}]}},
-            #{name => samples, short => $s, long => "-samples",
-                help => "minimum number of samples to collect", default => 3,
-                type => {int, [{min, 1}]}},
-            #{name => warmup, short => $w, long => "-warmup",
-                help => "number of samples to skip", default => 0,
-                type => {int, [{min, 0}]}},
-            #{name => cv, long => "-cv",
-                help => "coefficient of variation",
-                type => {float, [{min, 0.0}]}},
-            #{name => verbose, short => $v, long => "-verbose",
-                type => boolean, help => "verbose output"},
-            #{name => profile, long => "-profile", type => boolean,
-                help => "run fprof profiler for the supplied code"},
-            #{name => isolated, short => $i, long => "-isolated", type => boolean,
-                action => count, help => "run benchmarks in isolated environment (slave node)"},
-            #{name => squeeze, short => $q, long => "-squeeze", type => boolean,
-                action => count, help => "run concurrency test"},
-            #{name => min, long => "-min",
-                help => "start with this amount of processes", default => 1,
-                type => {int, [{min, 1}]}},
-            #{name => max, long => "-max",
-                help => "do not exceed this number of processes",
-                type => {int, [{max, 1024 * 1024 * 1024}]}},
-            #{name => threshold, short => $t, long => "-threshold",
-                help => "number of samples with cv below target needed to increase concurrency", default => 3,
-                type => {int, [{min, 1}]}},
-            #{name => init, long => "-init",
-                help => "init code", nargs => 1, action => append},
-            #{name => done, long => "-done",
-                help => "done code", nargs => 1, action => append},
-            #{name => init_runner, long => "-init_runner",
-                help => "init_runner code", nargs => 1, action => append},
-            #{name => code,
-                help => "code to test", nargs => nonempty_list, action => extend}
-        ]}.
-
-%%-------------------------------------------------------------------
-%% Color output
-
--spec format(error | warning, string(), [term()]) -> ok.
-format(Level, Format, Terms) ->
-    io:format(color(Level, Format), Terms).
-
--define(RED, "\e[31m").
--define(MAGENTA, "\e[35m").
--define(END, "\e[0m~n").
-
-color(error, Text) -> ?RED ++ Text ++ ?END;
-color(warning, Text) -> ?MAGENTA ++ Text ++ ?END;
-color(_, Text) -> Text.
-
-
-%% @doc Formats a number rounded to 3 digits.
-%% Example: 88 -> 88, 880000 -> 880 Ki, 100501 -> 101 Ki
--spec format_number(non_neg_integer()) -> string().
-format_number(Num) when Num > 100000000000 ->
-    integer_to_list(round(Num / 1000000000)) ++ " Gi";
-format_number(Num) when Num > 100000000 ->
-    integer_to_list(round(Num / 1000000)) ++ " Mi";
-format_number(Num) when Num > 100000 ->
-    integer_to_list(round(Num / 1000)) ++ " Ki";
-format_number(Num) ->
-    integer_to_list(Num).
-
-%% @doc Formats a size (in bytes) rounded to 3 digits.
-%% Unlike format_number/1, uses 1024 as the base,
-%% so 200 * 1024 is 200 Kb.
--spec format_size(non_neg_integer()) -> string().
-format_size(Num) when Num > 1024*1024*1024 * 100 ->
-    integer_to_list(round(Num / (1024*1024*1024))) ++ " Gb";
-format_size(Num) when Num > 1024*1024*100 ->
-    integer_to_list(round(Num / (1024 * 1024))) ++ " Mb";
-format_size(Num) when Num > 1024 * 100 ->
-    integer_to_list(round(Num / 1024)) ++ " Kb";
-format_size(Num) ->
-    integer_to_list(Num).
-
-% wrong usage
-main_impl(_RunOpts, SqueezeOps, [_, _ | _]) when map_size(SqueezeOps) > 0 ->
-    io:format("Multiple concurrency tests are not supported, run them one by one~n");
-main_impl(RunOpts, SqueezeOpts, Codes) ->
-    NeedLogger = maps:get(verbose, RunOpts, false),
-    application:set_env(erlperf, start_monitor, NeedLogger),
-    ok = logger:set_handler_config(default, level, warning),
-    {ok, Sup} = erlperf_sup:start_link(),
-    % verbose?
-    Logger =
-        if NeedLogger ->
-            {ok, Pid} = ep_file_log:start(group_leader()),
-            Pid;
-        true ->
-            undefined
-        end,
-    try
-        run_main(RunOpts, SqueezeOpts, Codes)
-    after
-        Logger =/= undefined andalso ep_file_log:stop(Logger),
-        gen:stop(Sup)
-    end.
+%% @doc
+%% Low-overhead benchmarking: runs the code Count times and returns
+%% the time in microseconds it took to execute the code.
+-spec time(code(), Count :: non_neg_integer()) -> TimeUs :: non_neg_integer().
+time(Code, Count) ->
+    [Series] = benchmark([code(Code)], #{samples => Count, sample_duration => undefined}, undefined),
+    Series.
 
-% profile
-run_main(#{profile := true}, _, [Code]) ->
-    Profile = erlperf:profile(Code, #{profiler => fprof, format => string}),
-    io:format("~s~n", [Profile]);
-
-% squeeze test
-% Code                          Concurrency   Throughput
-% pg2:create(foo).                       14      9540 Ki
-run_main(RunOpts, SqueezeOps, [Code]) when map_size(SqueezeOps) > 0 ->
-    {QPS, Con} = run(Code, RunOpts, SqueezeOps),
-    #{runner := CodeRunner} = Code,
-    MaxCodeLen = min(code_length(CodeRunner), 62),
-    io:format("~*s || QPS~n", [-MaxCodeLen, "Code"]),
-    io:format("~*s ~6b ~10s~n", [-MaxCodeLen, format_code(CodeRunner), Con, format_number(QPS)]);
-
-% benchmark
-% erlperf 'timer:sleep(1).'
-% Code                          Concurrency   Throughput
-% timer:sleep(1).                         1          498
-% --
-% Code                          Concurrency   Throughput   Relative
-% rand:uniform().                         1      4303 Ki       100%
-% crypto:strong_rand_bytes(2).            1      1485 Ki        35%
-
-run_main(RunOpts, _, Codes0) ->
-    MaxColumns = case io:columns() of {ok, C} -> C; _ -> 80 end,
-    Results = run_impl(Codes0, RunOpts, undefined),
-    MaxQPS = lists:max(Results),
-    Concurrency = maps:get(concurrency, RunOpts, 1),
-    Codes = [maps:get(runner, Code) || Code <- Codes0],
-    Zipped = lists:reverse(lists:keysort(2, lists:zip(Codes, Results))),
-    %% ~6 + ~10 + ~6 format is 22 + 4 spaces means 26 total, code gets
-    %% columns - 26
-    MaxCodeLen = min(lists:max([code_length(Code) || Code <- Codes]) + 4, MaxColumns - 26),
-    io:format("~*s || QPS Rel~n", [-MaxCodeLen, "Code"]),
-    [io:format("~*s ~6b ~10s ~6b%~n", [-MaxCodeLen, format_code(Code),
-        Concurrency, format_number(QPS), QPS * 100 div MaxQPS]) ||
-        {Code, QPS} <- Zipped].
-
-format_code(Code) when is_tuple(Code) ->
-    lists:flatten(io_lib:format("~tp", [Code]));
-format_code(Code) when is_tuple(hd(Code)) ->
-    lists:flatten(io_lib:format("[~tp, ...]", [hd(Code)]));
-format_code(Code) ->
-    Code.
-
-code_length(Code) ->
-    length(format_code(Code)).
+%%===================================================================
+%% Codification: translate from {M, F, A} to #{runner => ...} map
+code(#{runner := _Runner} = Code) ->
+    Code;
+code({M, F, A}) when is_atom(M), is_atom(F), is_list(A) ->
+    #{runner => {M, F, A}};
+code(Fun) when is_function(Fun) ->
+    #{runner => Fun};
+code(Text) when is_list(Text) ->
+    #{runner => Text}.
 
-%%%===================================================================
-%%% Benchmarking itself
+%%===================================================================
+%% Benchmarking itself
 
 %% OTP 25 support
--dialyzer({no_missing_calls, start_nodes_slave/1}).
+-dialyzer({no_missing_calls, start_node/1}). -compile({nowarn_deprecated_function, [{slave, start_link, 3}, {slave, stop, 1}]}). -compile({nowarn_removed, [{slave, start_link, 3}, {slave, stop, 1}]}). -start_nodes({module, peer}, HowMany) -> - Args = ["-pa", filename:dirname(code:where_is_file("erlperf.app")), - "-pa", filename:dirname(code:where_is_file("argparse.app"))], - [{ok, _Peer, _Node} = peer:start_link(#{name => peer:random_name(), args => Args}) || _ <- lists:seq(1, HowMany)]; -start_nodes({error, nofile}, HowMany) -> - start_nodes_slave(HowMany). - -start_nodes_slave(HowMany) -> +start_node({module, peer}) -> + {ok, _Peer, _Node} = peer:start_link(#{name => peer:random_name()}); +start_node({error, nofile}) -> OsPid = os:getpid(), [_, HostString] = string:split(atom_to_list(node()), "@"), Host = list_to_atom(HostString), - %% TODO: use real erl_prim_loader inet with hosts equal to this host - %% TODO: un-hardcode dependencies (argparse) - Paths = ["-pa " ++ filename:dirname(code:where_is_file(F)) || F <- ["erlperf.app", "argparse.app"]], - Args = lists:join(" ", Paths) ++ " -setcookie " ++ atom_to_list(erlang:get_cookie()), - [begin - Uniq = erlang:unique_integer([positive]), - NodeId = list_to_atom(lists:concat(["job-", Uniq, "-", OsPid])), - {ok, Node} = slave:start_link(Host, NodeId, Args), - {ok, undefined, Node} - end || _ <- lists:seq(1, HowMany)]. + Args = "-setcookie " ++ atom_to_list(erlang:get_cookie()), + Uniq = erlang:unique_integer([positive]), + NodeId = list_to_atom(lists:concat(["job-", Uniq, "-", OsPid])), + {ok, Node} = slave:start_link(Host, NodeId, Args), + {ok, undefined, Node}. prepare_nodes(HowMany) -> - not erlang:is_alive() andalso error(not_alive), - % start multiple nodes, ensure cluster_monitor is here (start supervisor if needs to) - {_, Peers, Nodes} = lists:unzip3(start_nodes(code:ensure_loaded(peer1), HowMany)), - % start 'erlperf' app on all slaves - Expected = lists:duplicate(HowMany, {ok, [argparse, erlperf]}), - case rpc:multicall(Nodes, application, ensure_all_started, [erlperf], ?REMOTE_NODE_TIMEOUT) of - {Expected, []} -> - {Peers, Nodes}; - Other -> - error({start_failed, Other, Expected, Nodes}) - end. - -%% isolation requested: need to rely on cluster_monitor and other distributed things. -run_impl(Codes, #{isolation := _Isolation} = Options, ConOpts) -> - {Peers, Nodes} = prepare_nodes(length(Codes)), - Opts = maps:remove(isolation, Options), + %% start 'erlperf' parts on all peers + %% Cannot do this via "code:add_path" because actual *.beam files are + %% parts of the binary escript. + _ = application:load(erlperf), + {ok, ModNames} = application:get_key(erlperf, modules), + Modules = [{Mod, _Bin, _Path} = code:get_object_code(Mod) || Mod <- ModNames], + PeerPresent = code:ensure_loaded(peer), + %% start multiple nodes + lists:unzip([begin + {ok, Peer, Node} = start_node(PeerPresent), + [{module, Mod} = erpc:call(Node, code, load_binary, [Mod, Path, Bin], ?REMOTE_NODE_TIMEOUT) + || {Mod, Bin, Path} <- Modules], + {ok, _PgPid} = erpc:call(Node, pg, start, [erlperf]), + {ok, _MonPid} = erpc:call(Node, erlperf_monitor, start, []), + {Peer, Node} + end || _ <- lists:seq(1, HowMany)]). + +stop_nodes([undefined | _], Nodes) -> + [slave:stop(Node) || Node <- Nodes]; +stop_nodes(Peers, _Nodes) -> + [peer:stop(Peer) || Peer <- Peers]. 
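+
+%% Example (sketch; prepare_nodes/1 and stop_nodes/2 are internal helpers,
+%% shown here for illustration only, from a distributed shell):
+%% {Peers, Nodes} = prepare_nodes(2),
+%% try
+%% [pong = net_adm:ping(N) || N <- Nodes]
+%% after
+%% stop_nodes(Peers, Nodes)
+%% end.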
+ +start_jobs([], Jobs) -> + lists:unzip3(lists:reverse(Jobs)); +start_jobs([Code | Codes], Jobs) -> try - % no timeout here (except that rpc itself could time out) - Promises = - [rpc:async_call(Node, erlperf, run, [Code, Opts, ConOpts]) - || {Node, Code} <- lists:zip(Nodes, Codes)], - % now wait for everyone to respond - lists:reverse(lists:foldl(fun (Key, Acc) -> [rpc:yield(Key) | Acc] end, [], Promises)) - after - case Peers of - [undefined | _] -> - [slave:stop(Node) || Node <- Nodes]; - Peers -> - [peer:stop(Peer) || Peer <- Peers] - end - end; - -%% no isolation requested, do normal in-BEAM test -run_impl(Codes, Options, ConOpts) -> - Jobs = [begin {ok, Pid} = ep_job:start(Code), Pid end || Code <- Codes], - try - benchmark_impl(Jobs, Options, ConOpts) - after - [ep_job:stop(Job) || Job <- Jobs] + {ok, Pid} = erlperf_job:start(Code), + Sample = erlperf_job:handle(Pid), + MonRef = monitor(process, Pid), + start_jobs(Codes, [{Pid, Sample, MonRef} | Jobs]) + catch Class:Reason:Stack -> + %% stop jobs that were started + [(catch gen:stop(Pid)) || {Pid, _, _} <- Jobs], + erlang:raise(Class, Reason, Stack) end. -define(DEFAULT_SAMPLE_DURATION, 1000). -benchmark_impl(Jobs, Options, ConOpts) -> - CRefs = [ep_job:get_counters(Job) || Job <- Jobs], - do_benchmark_impl(Jobs, Options, ConOpts, CRefs). - -%% normal benchmark -do_benchmark_impl(Jobs, Options, undefined, CRefs) -> +%% low-overhead benchmark +benchmark(Jobs, #{sample_duration := undefined, samples := Samples}, undefined, _Handles) -> + Proxies = [spawn_monitor(fun () -> exit({success, erlperf_job:measure(Job, Samples)}) end) + || Job <- Jobs], + [case Res of + {success, Success} -> Success; + Error -> erlang:error(Error) + end || Res <- multicall_result(Proxies, [])]; + +%% continuous benchmark +benchmark(Jobs, Options, undefined, Handles) -> Concurrency = maps:get(concurrency, Options, 1), - [ok = ep_job:set_concurrency(Job, Concurrency) || Job <- Jobs], - perform_benchmark(CRefs, Options); + [ok = erlperf_job:set_concurrency(Job, Concurrency) || Job <- Jobs], + perform_benchmark(Jobs, Handles, Options); %% squeeze test - concurrency benchmark -do_benchmark_impl(Jobs, Options, ConOpts, CRef) -> +benchmark(Jobs, Options, ConOpts, Handles) -> Min = maps:get(min, ConOpts, 1), - perform_squeeze(Jobs, CRef, Min, [], {0, 0}, Options, + perform_squeeze(Jobs, Handles, Min, [], {0, 0}, Options, ConOpts#{max => maps:get(max, ConOpts, erlang:system_info(process_limit) - 1000)}). 
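+
+%% How the three clauses above are selected (illustrative options only,
+%% with Jobs/Handles as built by start_jobs/2):
+%% benchmark(Jobs, #{sample_duration => undefined, samples => 1000000}, undefined, Handles)
+%% %% timed (low-overhead) mode: each job runs 1M iterations, returns microseconds per job
+%% benchmark(Jobs, #{concurrency => 4, samples => 3}, undefined, Handles)
+%% %% continuous mode: 4 workers per job, returns QPS per job
+%% benchmark(Jobs, #{samples => 3}, #{min => 1, max => 8}, Handles)
+%% %% squeeze mode: probes worker counts until throughput stops growing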
%% QPS considered stable when:
 %% * 'warmup' cycles have passed
-%% * 'cycles' cycles have been received
-%% * (optional) for the last 'cycles' cycles coefficient of variation did not exceed 'cv'
-perform_benchmark(CRefs, Options) ->
+%% * 'samples' cycles have been received
+%% * (optional) for the last 'samples' cycles coefficient of variation did not exceed 'cv'
+perform_benchmark(Jobs, Handles, Options) ->
 Interval = maps:get(sample_duration, Options, ?DEFAULT_SAMPLE_DURATION),
 % sleep for "warmup * sample_duration"
 timer:sleep(Interval * maps:get(warmup, Options, 0)),
- % do at least 'cycles' cycles
- Before = [[atomics:get(CRef, 1)] || CRef <- CRefs],
- Samples = measure_impl(Before, CRefs, Interval, maps:get(samples, Options, 3), maps:get(cv, Options, undefined)),
+ % do at least 'samples' cycles
+ Before = [[erlperf_job:sample(Handle)] || Handle <- Handles],
+ JobMap = maps:from_list([{J, []} || J <- Jobs]), %% map every job for quick 'DOWN' lookup
+ Samples = measure_impl(JobMap, Before, Handles, Interval,
+ maps:get(samples, Options, 3), maps:get(cv, Options, undefined)),
 report_benchmark(Samples, maps:get(report, Options, false)).
 
-measure_impl(Before, _CRefs, _Interval, 0, undefined) ->
+measure_impl(_Jobs, Before, _Handles, _Interval, 0, undefined) ->
 normalise(Before);
 
-measure_impl(Before, CRefs, Interval, 0, CV) ->
+measure_impl(Jobs, Before, Handles, Interval, 0, CV) ->
 %% Complication: some jobs may need a long time to stabilise compared to others.
 %% Decision: wait for all jobs to stabilise (could wait for just one?)
 case
@@ -538,13 +323,18 @@ measure_impl(Before, CRefs, Interval, 0, CV) ->
 true ->
 % imitate queue - drop last sample, push another in the head
 TailLess = [lists:droplast(L) || L <- Before],
- measure_impl(TailLess, CRefs, Interval, 1, CV)
+ measure_impl(Jobs, TailLess, Handles, Interval, 1, CV)
 end;
 
-measure_impl(Before, CRefs, Interval, Count, CV) ->
- timer:sleep(Interval),
- Counts = [atomics:get(CRef, 1) || CRef <- CRefs],
- measure_impl(merge(Counts, Before), CRefs, Interval, Count - 1, CV).
+measure_impl(Jobs, Before, Handles, Interval, Count, CV) ->
+ receive
+ {'DOWN', _Ref, process, Pid, Reason} when is_map_key(Pid, Jobs) ->
+ erlang:error({benchmark, {'EXIT', Pid, Reason}})
+ after Interval ->
+ ok
+ end,
+ Counts = [erlperf_job:sample(Handle) || Handle <- Handles],
+ measure_impl(Jobs, merge(Counts, Before), Handles, Interval, Count - 1, CV).
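+
+%% Example of the cv check (made-up numbers): with samples => 3 and cv => 0.05,
+%% sampling continues until the standard deviation of the last 3 samples stays
+%% within 5% of their mean; e.g. [98000, 101000, 100000] gives cv ~0.015, which stops it.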
merge([], []) -> []; @@ -568,13 +358,13 @@ report_benchmark(SamplesList, false) -> %% Test considered complete when either: %% * maximum number of workers reached %% * last 'threshold' added workers did not increase throughput -perform_squeeze(_Pid, _CRef, Current, History, QMax, Options, #{max := Max}) when Current > Max -> +perform_squeeze(_Pid, _Handle, Current, History, QMax, Options, #{max := Max}) when Current > Max -> % reached max allowed schedulers, exiting report_squeeze(QMax, History, Options); -perform_squeeze(Jobs, CRef, Current, History, QMax, Options, ConOpts) -> - ok = ep_job:set_concurrency(hd(Jobs), Current), - [QPS] = perform_benchmark(CRef, Options), +perform_squeeze(Jobs, Handles, Current, History, QMax, Options, ConOpts) -> + ok = erlperf_job:set_concurrency(hd(Jobs), Current), + [QPS] = perform_benchmark(Jobs, Handles, Options), NewHistory = [{QPS, Current} | History], case maxed(QPS, Current, QMax, maps:get(threshold, ConOpts, 3)) of true -> @@ -582,7 +372,7 @@ perform_squeeze(Jobs, CRef, Current, History, QMax, Options, ConOpts) -> report_squeeze(QMax, NewHistory, Options); NewQMax -> % need more workers - perform_squeeze(Jobs, CRef, Current + 1, NewHistory, NewQMax, Options, ConOpts) + perform_squeeze(Jobs, Handles, Current + 1, NewHistory, NewQMax, Options, ConOpts) end. report_squeeze(QMax, History, Options) -> @@ -600,9 +390,18 @@ maxed(_, Current, {_, W}, Count) when Current - W > Count -> maxed(_, _, QMax, _) -> QMax. + +multicall_result([], Acc) -> + lists:reverse(Acc); +multicall_result([{Pid, Ref} | Proxies], Acc) -> + receive + {'DOWN', Ref, process, Pid, Result} -> + multicall_result(Proxies, [Result | Acc]) + end. + %%%=================================================================== %%% Tracer process, uses heap to store tracing information. -tracer() -> +rec_tracer() -> process_flag(message_queue_data, off_heap), tracer_loop([]). @@ -614,4 +413,4 @@ tracer_loop(Trace) -> {stop, Control} -> Control ! {data, Trace}, ok - end. + end. \ No newline at end of file diff --git a/src/erlperf_app.erl b/src/erlperf_app.erl index cdc8706..4a62d45 100644 --- a/src/erlperf_app.erl +++ b/src/erlperf_app.erl @@ -1,11 +1,9 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% Benchmarking application +%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @doc +%%% Continuous benchmarking application behaviour. %%% @end -%%%------------------------------------------------------------------- - -module(erlperf_app). +-author("maximfca@gmail.com"). -behaviour(application). @@ -13,7 +11,8 @@ -spec start(application:start_type(), term()) -> {ok, pid()}. start(_StartType, _StartArgs) -> - erlperf_sup:start_link(). + {ok, Sup} = erlperf_sup:start_link(), + {ok, Sup}. -spec stop(term()) -> ok. stop(_State) -> diff --git a/src/erlperf_cli.erl b/src/erlperf_cli.erl new file mode 100644 index 0000000..a6967ee --- /dev/null +++ b/src/erlperf_cli.erl @@ -0,0 +1,298 @@ +%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @doc +%%% Command line interface adapter. +%%% @end +-module(erlperf_cli). +-author("maximfca@gmail.com"). + +%% Public API: escript +-export([ + main/1 +]). + +%% @doc Simple command-line benchmarking interface. +%% Example: erlperf 'rand:uniform().' +-spec main([string()]) -> no_return(). 
+main(Args) ->
+ Prog = #{progname => "erlperf"},
+ try
+ RunOpts0 = argparse:parse(Args, arguments(), Prog),
+
+ %% turn off logger unless verbose output is requested
+ maps:get(verbose, RunOpts0, false) =:= false andalso
+ begin
+ logger:add_primary_filter(suppress_sasl, {
+ fun(#{meta := #{error_logger := #{tag := Tag}}}, _) when Tag =:= error; Tag =:= error_report ->
+ stop;
+ (_, _) ->
+ ignore
+ end, ok})
+ end,
+
+ %% low-overhead benchmarking is not compatible with many options; the loop count may be written as 100K or 100M
+ %% TODO: implement mutually exclusive groups in argparse
+ RunOpts =
+ case maps:find(loop, RunOpts0) of
+ error ->
+ RunOpts0;
+ {ok, Str} ->
+ [erlang:error({loop, Option}) || Option <-
+ [concurrency, sample_duration, samples, warmup, cv], is_map_key(Option, RunOpts0)],
+ RunOpts0#{loop => parse_loop(Str)}
+ end,
+
+ %% add code paths
+ [case code:add_path(P) of true -> ok; {error, Error} -> erlang:error({add_path, {P, Error}}) end
+ || P <- maps:get(code_path, RunOpts, [])],
+ %% find all runners
+ Code0 = [parse_code(C) || C <- maps:get(code, RunOpts)],
+ %% find associated init, init_runner, done
+ {_, Code} = lists:foldl(fun callable/2, {RunOpts, Code0}, [init, init_runner, done]),
+ %% figure out whether concurrency run is requested
+ COpts = case maps:find(squeeze, RunOpts) of
+ {ok, true} ->
+ maps:with([min, max, threshold], RunOpts);
+ error ->
+ #{}
+ end,
+ ROpts = maps:with([loop, concurrency, samples, sample_duration, cv, isolation, warmup, verbose],
+ RunOpts),
+ %% when isolation is requested, the node must be distributed
+ is_map_key(isolation, RunOpts) andalso (not erlang:is_alive())
+ andalso net_kernel:start([list_to_atom(lists:concat(
+ ["erlperf-", erlang:unique_integer([positive]), "-", os:getpid()])), shortnames]),
+ %% do the actual run
+ main_impl(ROpts, COpts, Code)
+ catch
+ error:{argparse, Reason} ->
+ Fmt = argparse:format_error(Reason, arguments(), Prog),
+ format(info, "Error: ~s", [Fmt]);
+ throw:{parse, FunName, Other} ->
+ format(error, "Unable to read file named '~s' (expected to contain call chain recording)~nReason: ~p~n"
+ "Did you forget to end your function with a period (dot)?~n", [FunName, Other]);
+ error:{add_path, {Path, Error}} ->
+ format(error, "Error adding code path ~s: ~p~n", [Path, Error]);
+ error:{generic, Error} ->
+ format(error, "Error: ~s~n", [Error]);
+ error:{loop, Option} ->
+ format(error, "Low-overhead benchmarking is not compatible with ~s~n", [Option]);
+ error:{generate, {parse, FunName, Error}} ->
+ format(error, "Parse error for ~s: ~s~n", [FunName, lists:flatten(Error)]);
+ error:{compile, Errors, Warnings} ->
+ Errors =/= [] andalso format(error, "Compile error: ~s~n", [compile_errors(Errors)]),
+ Warnings =/= [] andalso format(warning, "Warning: ~s~n", [compile_errors(Warnings)]);
+ error:{benchmark, {'EXIT', Job, Error}} ->
+ node(Job) =/= node() andalso format(error, "~s reported an error:~n", [node(Job)]),
+ format(error, "~p~n", [Error]);
+ Cls:Rsn:Stack ->
+ format(error, "Unhandled exception: ~ts:~p~n~p~n", [Cls, Rsn, Stack])
+ after
+ logger:remove_primary_filter(suppress_sasl)
+ end.
+
+%% formats compiler errors/warnings
+compile_errors([]) -> "";
+compile_errors([{_, []} | Tail]) ->
+ compile_errors(Tail);
+compile_errors([{L, [{_Anno, Mod, Err} | T1]} | Tail]) ->
+ lists:flatten(Mod:format_error(Err)) ++ compile_errors([{L, T1} | Tail]).
+
+callable(Type, {Args, Acc}) ->
+ {Args, merge_callable(Type, maps:get(Type, Args, []), Acc, [])}.
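+
+%% Example of the positional merge above (hypothetical command line):
+%% erlperf 'code1.' 'code2.' --init 'i1.' --init 'i2.' --done 'd1.'
+%% pairs i1/d1 with code1, and i2 with code2, in the order given.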
+
+merge_callable(_Type, [], Acc, Merged) ->
+ lists:reverse(Merged) ++ Acc;
+merge_callable(_Type, _, [], Merged) ->
+ lists:reverse(Merged);
+merge_callable(Type, [[H] | T], [HA | Acc], Merged) ->
+ merge_callable(Type, T, Acc, [HA#{Type => H} | Merged]).
+
+parse_code(Code) ->
+ case lists:last(Code) of
+ $. ->
+ #{runner => Code};
+ $} when hd(Code) =:= ${ ->
+ % parse MFA tuple with added "."
+ #{runner => parse_mfa_tuple(Code)};
+ _ ->
+ case file:read_file(Code) of
+ {ok, Bin} ->
+ #{runner => parse_call_record(Bin)};
+ Other ->
+ erlang:throw({parse, Code, Other})
+ end
+ end.
+
+parse_mfa_tuple(Code) ->
+ {ok, Scan, _} = erl_scan:string(Code ++ "."),
+ {ok, Term} = erl_parse:parse_term(Scan),
+ Term.
+
+parse_call_record(Bin) ->
+ binary_to_term(Bin).
+
+parse_loop(Loop) ->
+ case string:to_integer(Loop) of
+ {Int, "M"} -> Int * 1000000;
+ {Int, "K"} -> Int * 1000;
+ {Int, []} -> Int;
+ {Int, "G"} -> Int * 1000000000;
+ _Other -> erlang:error({generic, "unsupported syntax for low-overhead count: " ++ Loop})
+ end.
+
+arguments() ->
+ #{help =>
+ "\nBenchmark timer:sleep(1):\n erlperf 'timer:sleep(1).'\n"
+ "Benchmark rand:uniform() vs crypto:strong_rand_bytes(2):\n erlperf 'rand:uniform().' 'crypto:strong_rand_bytes(2).' --samples 10 --warmup 1\n"
+ "Figure out concurrency limits:\n erlperf 'code:is_loaded(local_udp).' --init 'code:ensure_loaded(local_udp).' --squeeze\n"
+ "Benchmark pg join/leave operations:\n erlperf 'pg:join(s, foo, self()), pg:leave(s, foo, self()).' --init 'pg:start_link(s).'\n"
+ "Low-overhead benchmark for a single BIF:\n erlperf 'erlang:unique_integer().' -l 1000000\n",
+ arguments => [
+ #{name => concurrency, short => $c, long => "-concurrency",
+ help => "number of concurrently executed runner processes",
+ type => {int, [{min, 1}, {max, 1024 * 1024 * 1024}]}},
+ #{name => sample_duration, short => $d, long => "-duration",
+ help => "single sample duration, milliseconds (1000)",
+ type => {int, [{min, 1}]}},
+ #{name => samples, short => $s, long => "-samples",
+ help => "minimum number of samples to collect (3)",
+ type => {int, [{min, 1}]}},
+ #{name => loop, short => $l, long => "-loop",
+ help => "low overhead mode count, e.g. 100K, 200M, 3G"},
+ #{name => warmup, short => $w, long => "-warmup",
+ help => "number of samples to skip (0)",
+ type => {int, [{min, 0}]}},
+ #{name => cv, long => "-cv",
+ help => "coefficient of variation",
+ type => {float, [{min, 0.0}]}},
+ #{name => verbose, short => $v, long => "-verbose",
+ type => boolean, help => "print monitoring statistics"},
+ #{name => code_path, long => "pa", type => string,
+ action => append, help => "extra code path, see -pa erl documentation"},
+ #{name => isolation, short => $i, long => "-isolated", type => boolean,
+ help => "run benchmarks in an isolated environment (peer node)"},
+ #{name => squeeze, short => $q, long => "-squeeze", type => boolean,
+ help => "run concurrency test"},
+ #{name => min, long => "-min",
+ help => "start with this amount of processes (1)",
+ type => {int, [{min, 1}]}},
+ #{name => max, long => "-max",
+ help => "do not exceed this number of processes",
+ type => {int, [{max, erlang:system_info(process_limit) - 1000}]}},
+ #{name => threshold, short => $t, long => "-threshold",
+ help => "how many additional workers to try without improvement before stopping (3)", default => 3,
+ type => {int, [{min, 1}]}},
+ #{name => init, long => "-init",
+ help => "init code", nargs => 1, action => append},
+ #{name => done, long => "-done",
+ help => "done code", nargs => 1, action => append},
+ #{name => init_runner, long => "-init_runner",
+ help => "init_runner code", nargs => 1, action => append},
+ #{name => code,
+ help => "code to test", nargs => nonempty_list, action => extend}
+ ]}.
+
+%%-------------------------------------------------------------------
+%% Color output
+
+-spec format(error | warning | info, string(), [term()]) -> ok.
+format(Level, Format, Terms) ->
+ io:format(color(Level, Format), Terms).
+
+-define(RED, "\e[31m").
+-define(MAGENTA, "\e[35m").
+-define(END, "\e[0m~n").
+
+color(error, Text) -> ?RED ++ Text ++ ?END;
+color(warning, Text) -> ?MAGENTA ++ Text ++ ?END;
+color(info, Text) -> Text.
+
+% wrong usage
+main_impl(_RunOpts, SqueezeOps, [_, _ | _]) when map_size(SqueezeOps) > 0 ->
+ io:format("Multiple concurrency tests are not supported, run them one by one~n");
+
+main_impl(RunOpts, SqueezeOpts, Codes) ->
+ % verbose?
+ {Pg, Monitor, Logger} =
+ case maps:get(verbose, RunOpts, false) of
+ true ->
+ {ok, P} = pg:start_link(erlperf),
+ {ok, Mon} = erlperf_monitor:start_link(),
+ {ok, Log} = erlperf_file_log:start_link(group_leader()),
+ {P, Mon, Log};
+ false ->
+ {undefined, undefined, undefined}
+ end,
+ try
+ run_main(RunOpts, SqueezeOpts, Codes)
+ after
+ Logger =/= undefined andalso gen:stop(Logger),
+ Monitor =/= undefined andalso gen:stop(Monitor),
+ Pg =/= undefined andalso gen:stop(Pg)
+ end.
+
+%% low overhead mode
+run_main(#{loop := Loop}, #{}, Codes) ->
+ TimeUs = erlperf:benchmark(Codes, #{samples => Loop, sample_duration => undefined}, undefined),
+ %% for presentation purposes, convert time to QPS
+ %% Duration is fixed to 1 second here
+ QPS = [Loop * 1000000 div T || T <- TimeUs],
+ format_result(Codes, 1, QPS, [T * 1000 div Loop || T <- TimeUs]);
+
+%% squeeze test: do not print "Relative" column as it's always 100%
+% Code                 Concurrency   Throughput
+% pg2:create(foo).              14      9540 Ki
+run_main(RunOpts, SqueezeOps, [Code]) when map_size(SqueezeOps) > 0 ->
+ Duration = maps:get(sample_duration, RunOpts, 1000),
+ {QPS, Con} = erlperf:run(Code, RunOpts, SqueezeOps),
+ Timing = Duration * 1000000 div QPS * Con,
+ format_result([Code], Con, [QPS], [Timing]);
+
+%% benchmark: don't print "Relative" column for a single sample
+% erlperf 'timer:sleep(1).'
+% Code                          Concurrency   Throughput
+% timer:sleep(1).                         1          498
+% --
+% Code                          Concurrency   Throughput   Relative
+% rand:uniform().                         1      4303 Ki       100%
+% crypto:strong_rand_bytes(2).            1      1485 Ki        35%
+
+run_main(RunOpts, _, Execs) ->
+ Concurrency = maps:get(concurrency, RunOpts, 1),
+ Duration = maps:get(sample_duration, RunOpts, 1000),
+ Throughput = erlperf:benchmark(Execs, RunOpts, undefined),
+ Timings = [Duration * 1000000 div T * Concurrency || T <- Throughput],
+ format_result(Execs, Concurrency, Throughput, Timings).
+
+format_result(Execs, Concurrency, Throughput, Timings) ->
+ MaxQPS = lists:max(Throughput),
+ Codes = [maps:get(runner, Code) || Code <- Execs],
+ Zipped = lists:keysort(3, lists:zip3(Codes, Throughput, Timings)),
+ %% Columns: Code | Concurrency | Throughput | Time | Relative
+ %% Code takes all the remaining width.
+ MaxColumns = case io:columns() of {ok, C} -> C; _ -> 80 end,
+ %% Space taken by all columns except code
+ case Throughput of
+ [_] ->
+ %% omit "Rel" when there is one result
+ MaxCodeLen = min(code_length(hd(Codes)) + 4, MaxColumns - 31),
+ io:format("~*s || QPS Time~n", [-MaxCodeLen, "Code"]),
+ io:format("~*s ~6b ~10s ~10s~n", [-MaxCodeLen, format_code(hd(Codes)), Concurrency,
+ erlperf_file_log:format_number(hd(Throughput)), erlperf_file_log:format_duration(hd(Timings))]);
+ [_|_] ->
+ MaxCodeLen = min(lists:max([code_length(Code) || Code <- Codes]) + 4, MaxColumns - 37),
+ io:format("~*s || QPS Time Rel~n", [-MaxCodeLen, "Code"]),
+ [io:format("~*s ~6b ~10s ~10s ~6b%~n", [-MaxCodeLen, format_code(Code), Concurrency,
+ erlperf_file_log:format_number(QPS), erlperf_file_log:format_duration(Time), QPS * 100 div MaxQPS])
+ || {Code, QPS, Time} <- Zipped]
+ end.
+
+format_code(Code) when is_tuple(Code) ->
+ lists:flatten(io_lib:format("~tp", [Code]));
+format_code(Code) when is_tuple(hd(Code)) ->
+ lists:flatten(io_lib:format("[~tp, ...]", [hd(Code)]));
+format_code(Code) ->
+ Code.
+
+code_length(Code) ->
+ length(format_code(Code)).
diff --git a/src/erlperf_cluster_monitor.erl b/src/erlperf_cluster_monitor.erl
new file mode 100644
index 0000000..74dada9
--- /dev/null
+++ b/src/erlperf_cluster_monitor.erl
@@ -0,0 +1,156 @@
+%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @doc
+%%% Logs monitoring events for the entire cluster, to file or device.
+%%% Requires the erlperf_history service to be running; fails otherwise.
+%%% Uses a different approach from erlperf_monitor: instead of waiting
+%%% for new samples to arrive, the cluster monitor periodically prints
+%%% the samples already collected.
+%%% @end
+-module(erlperf_cluster_monitor).
+-author("maximfca@gmail.com").
+
+-behaviour(gen_server).
+
+%% API
+-export([
+ start_link/2
+]).
+
+%% gen_server callbacks
+-export([
+ init/1,
+ handle_call/3,
+ handle_cast/2,
+ handle_info/2
+]).
+
+%% Handler: similar to a gen_event handler. If an actual gen_event
+%% handler is needed, wrap it in an MFA.
+-type handler() :: {module(), atom(), term()} | file:filename_all() | {fd, io:device()} | io:device().
+
+%% @doc
+%% Starts a cluster-wide monitor with the specified handler, and links it to the caller.
+%% Fields must be a subset of the erlperf_monitor:monitor_sample() map keys.
+-spec start_link(handler(), [atom()]) -> {ok, Pid :: pid()} | {error, Reason :: term()}. +start_link(Handler, Fields) -> + gen_server:start_link(?MODULE, [Handler, Fields], []). + +%%%=================================================================== +%%% gen_server callbacks + +%% Take a sample every second +-define(SAMPLING_RATE, 1000). + +%% System monitor state +-record(state, { + %% next tick + next :: integer(), + handler :: handler(), + fields :: [atom()], + %% previously printed header, elements are node() | field name | job PID + %% if the new header is different from the previous one, it gets printed + header = [] :: [atom() | {jobs, [pid()]} | {node, node()}] +}). + +%% gen_server init +init([Handler, Fields]) -> + %% precise (abs) timer + Next = erlang:monotonic_time(millisecond) + ?SAMPLING_RATE, + {ok, handle_tick(#state{next = Next, handler = make_handler(Handler), fields = Fields})}. + +handle_call(_Request, _From, _State) -> + erlang:error(notsup). + +handle_cast(_Request, _State) -> + erlang:error(notsup). + +handle_info({timeout, _, tick}, State) -> + {noreply, handle_tick(State)}. + +%%%=================================================================== +%%% Internal functions + +handle_tick(#state{next = Next, fields = Fields, handler = Handler, header = Header} = State) -> + Next1 = Next + ?SAMPLING_RATE, + %% if we supply negative timer, we crash - and restart with no messages in the queue + %% this could happen if handler is too slow + erlang:start_timer(Next1, self(), tick, [{abs, true}]), + %% fetch all updates from cluster history + Samples = erlperf_history:get(Next - ?SAMPLING_RATE + erlang:time_offset(millisecond)), + %% now invoke the handler + {NewHandler, NewHeader} = run_handler(Handler, Fields, Header, lists:keysort(1, Samples)), + State#state{next = Next1, handler = NewHandler, header = NewHeader}. + +make_handler({_M, _F, _A} = MFA) -> + MFA; +make_handler(IoDevice) when is_pid(IoDevice); is_atom(IoDevice) -> + {fd, IoDevice}; +make_handler(Filename) when is_list(Filename); is_binary(Filename) -> + {ok, Fd} = file:open(Filename, [raw, append]), + {fd, Fd}. + +run_handler(Handler, _Fields, Header, []) -> + {Handler, Header}; + +%% handler: MFA callback +run_handler({M, F, A}, Fields, Header, Samples) -> + Filtered = [{Node, maps:with(Fields, Sample)} || {Node, Sample} <- Samples], + {{M, F, M:F(Filtered, A)}, Header}; + +%% built-in handler: file/console output +run_handler({fd, IoDevice}, Fields, Header, Samples) -> + {NewHeader, Filtered} = lists:foldl( + fun ({Node, Sample}, {Hdr, Acc}) -> + OneNode = + lists:join(" ", + [io_lib:format("~s", [Node]) | + [formatter(F, maps:get(F, Sample)) || F <- Fields]]), + %% special case for Jobs: replace the field with the {jobs, [Job]} + FieldsWithJobs = [ + case F of + jobs -> {Pids, _} = lists:unzip(maps:get(jobs, Sample)), {jobs, Pids}; + F -> F + end || F <- Fields], + {[{node, Node} | FieldsWithJobs] ++ Hdr, [OneNode | Acc]} + end, + {[], []}, Samples), + %% check if header has changed and print if it has + NewHeader =/= Header andalso + begin + FmtHdr = iolist_to_binary(lists:join(" ", [header(S) || S <- NewHeader])), + ok = file:write(IoDevice, FmtHdr) + end, + %% print the actual line + Data = lists:join(" ", Filtered) ++ "\n", + Formatted = iolist_to_binary(Data), + ok = file:write(IoDevice, Formatted), + {{fd, IoDevice}, NewHeader}. 
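+
+%% Example (sketch): write time, scheduler utilisation and per-job throughput
+%% to a log file, sampling the cluster history once a second:
+%% {ok, _} = erlperf_cluster_monitor:start_link("cluster.log", [time, sched_util, jobs]).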
+ +header(time) -> " date time TZ"; +header(sched_util) -> "%sched"; +header(dcpu) -> " %dcpu"; +header(dio) -> " %dio"; +header(processes) -> " procs"; +header(ports) -> " ports"; +header(ets) -> " ets"; +header(memory_total) -> "mem_total"; +header(memory_processes) -> " mem_proc"; +header(memory_binary) -> " mem_bin"; +header(memory_ets) -> " mem_ets"; +header({jobs, Jobs}) -> + lists:flatten([io_lib:format("~14s", [pid_to_list(Pid)]) || Pid <- Jobs]); +header({node, Node}) -> + atom_to_list(Node). + +formatter(time, Time) -> + calendar:system_time_to_rfc3339(Time div 1000); +formatter(Percent, Num) when Percent =:= sched_util; Percent =:= dcpu; Percent =:= dio -> + io_lib:format("~6.2f", [Num]); +formatter(Number, Num) when Number =:= processes; Number =:= ports -> + io_lib:format("~7b", [Num]); +formatter(ets, Num) -> + io_lib:format("~5b", [Num]); +formatter(Size, Num) when Size =:= memory_total; Size =:= memory_processes; Size =:= memory_binary; Size =:= memory_ets -> + io_lib:format("~9s", [erlperf_file_log:format_size(Num)]); +formatter(jobs, Jobs) -> + lists:flatten([io_lib:format("~14s", [erlperf_file_log:format_number(Num)]) || {_Pid, Num} <- Jobs]). diff --git a/src/erlperf_file_log.erl b/src/erlperf_file_log.erl new file mode 100644 index 0000000..739bbef --- /dev/null +++ b/src/erlperf_file_log.erl @@ -0,0 +1,138 @@ +%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% @doc +%%% Writes monitoring events to I/O device. +%%% @end +-module(erlperf_file_log). +-author("maximfca@gmail.com"). + +-behaviour(gen_server). + +%% API +-export([ + start_link/1, + %% leaky API... + format_number/1, + format_size/1, + format_duration/1 +]). + +%% gen_server callbacks +-export([ + init/1, + handle_call/3, + handle_cast/2, + handle_info/2 +]). + +%%-------------------------------------------------------------------- +%% @doc +%% Starts the server +-spec(start_link(Filename :: string() | file:io_device()) -> {ok, Pid :: pid()} | {error, Reason :: term()}). +start_link(Filename) -> + gen_server:start_link(?MODULE, [Filename], []). + + +%%%=================================================================== +%%% gen_server callbacks + +%% Repeat header every 30 lines (by default) +-define(LOG_REPEAT_HEADER, 30). + +%% System monitor state +-record(state, { + % file logger counter + log_counter = ?LOG_REPEAT_HEADER :: non_neg_integer(), + % when to print the header once again + log_limit = ?LOG_REPEAT_HEADER :: pos_integer(), + % file descriptor + log_file :: file:io_device(), + % current format line + format = "" :: string(), + % saved list of job IDs executed previously + jobs = [] :: [erlperf_monitor:job_sample()] +}). + +%% gen_server init +init([Target]) -> + % subscribe to monitor events + ok = pg:join(erlperf, {erlperf_monitor, node()}, self()), + WriteTo = if is_list(Target) -> {ok, LogFile} = file:open(Target, [raw, append]), LogFile; true -> Target end, + {ok, #state{log_file = WriteTo}}. + +handle_call(_Request, _From, _State) -> + erlang:error(notsup). + +handle_cast(_Request, _State) -> + erlang:error(notsup). 
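+
+%% Example (sketch): erlperf_cli starts this logger for verbose runs as
+%% erlperf_file_log:start_link(group_leader()); any I/O device works,
+%% provided the erlperf pg scope and erlperf_monitor are running, e.g.:
+%% {ok, _} = erlperf_file_log:start_link(user).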
+ +handle_info(#{jobs := Jobs, time := Time, sched_util := SchedUtil, dcpu := DCPU, dio := DIO, processes := Processes, + ports := Ports, ets := Ets, memory_total := MemoryTotal, memory_processes := MemoryProcesses, + memory_binary := MemoryBinary, memory_ets := MemoryEts}, #state{log_file = File} = State) -> + {JobIds, Ts} = lists:unzip(Jobs), + State1 = maybe_write_header(JobIds, State), + % actual line + TimeFormat = calendar:system_time_to_rfc3339(Time div 1000), + Formatted = iolist_to_binary(io_lib:format(State1#state.format, [ + TimeFormat, SchedUtil, DCPU, DIO, Processes, + Ports, Ets, + format_size(MemoryTotal), + format_size(MemoryProcesses), + format_size(MemoryBinary), + format_size(MemoryEts) + ] ++ [format_number(T) || T <- Ts])), + ok = file:write(File, Formatted), + {noreply, State1}. + +%%%=================================================================== +%%% Internal functions + +maybe_write_header(Jobs, #state{log_counter = LC, log_limit = LL, jobs = Prev} = State) when LC >= LL; Jobs =/= Prev -> + State#state{format = write_header(State#state.log_file, Jobs), log_counter = 0, jobs = Jobs}; +maybe_write_header(_, State) -> + State#state{log_counter = State#state.log_counter + 1}. + +write_header(File, Jobs) -> + JobCount = length(Jobs), + Format = "~s ~6.2f ~6.2f ~6.2f ~8b ~8b ~7b ~9s ~9s ~9s ~9s" ++ + lists:concat(lists:duplicate(JobCount, "~11s")) ++ "~n", + JobIds = list_to_binary(lists:flatten([io_lib:format(" ~10s", [pid_to_list(J)]) || J <- Jobs])), + Header = <<"\nYYYY-MM-DDTHH:MM:SS-oo:oo Sched DCPU DIO Procs Ports ETS Mem Total Mem Proc Mem Bin Mem ETS", JobIds/binary, "\n">>, + ok = file:write(File, Header), + Format. + +%% @doc Formats size (bytes) rounded to 3 digits. +%% Unlike @see format_number, used 1024 as a base, +%% so 200 * 1024 is 200 Kb. +-spec format_size(non_neg_integer()) -> string(). +format_size(Num) when Num > 1024*1024*1024 * 100 -> + integer_to_list(round(Num / (1024*1024*1024))) ++ " Gb"; +format_size(Num) when Num > 1024*1024*100 -> + integer_to_list(round(Num / (1024 * 1024))) ++ " Mb"; +format_size(Num) when Num > 1024 * 100 -> + integer_to_list(round(Num / 1024)) ++ " Kb"; +format_size(Num) -> + integer_to_list(Num). + +%% @doc Formats number rounded to 3 digits. +%% Example: 88 -> 88, 880000 -> 880 Ki, 100501 -> 101 Ki +-spec format_number(non_neg_integer()) -> string(). +format_number(Num) when Num > 100000000000 -> + integer_to_list(round(Num / 1000000000)) ++ " Gi"; +format_number(Num) when Num > 100000000 -> + integer_to_list(round(Num / 1000000)) ++ " Mi"; +format_number(Num) when Num > 100000 -> + integer_to_list(round(Num / 1000)) ++ " Ki"; +format_number(Num) -> + integer_to_list(Num). + +%% @doc Formats time duration, from nanoseconds to seconds +%% Example: 88 -> 88 ns, 88000 -> 88 us, 10000000 -> 10 ms +-spec format_duration(non_neg_integer()) -> string(). +format_duration(Num) when Num > 100000000000 -> + integer_to_list(round(Num / 1000000000)) ++ " s"; +format_duration(Num) when Num > 100000000 -> + integer_to_list(round(Num / 1000000)) ++ " ms"; +format_duration(Num) when Num > 100000 -> + integer_to_list(round(Num / 1000)) ++ " us"; +format_duration(Num) -> + integer_to_list(Num) ++ " ns". 
\ No newline at end of file diff --git a/src/ep_cluster_history.erl b/src/erlperf_history.erl similarity index 58% rename from src/ep_cluster_history.erl rename to src/erlperf_history.erl index 8db5f95..a4dd707 100644 --- a/src/ep_cluster_history.erl +++ b/src/erlperf_history.erl @@ -1,8 +1,11 @@ -%%% @author Maxim Fedorov, +%%% @copyright (C) 2019-2022, Maxim Fedorov %%% @doc -%%% Collects, accumulates & filters cluster-wide metrics. +%%% Collects, accumulates & filters cluster-wide monitoring events. +%%% Essentially a simple in-memory database for quick cluster overview. +%%% Started only when the application is configured for running in a +%%% primary node. %%% @end --module(ep_cluster_history). +-module(erlperf_history). -author("maximfca@gmail.com"). -behaviour(gen_server). @@ -22,8 +25,6 @@ handle_info/2 ]). --include_lib("erlperf/include/monitor.hrl"). - -define(TABLE, ?MODULE). %%-------------------------------------------------------------------- @@ -37,14 +38,14 @@ start_link() -> %% @doc %% Returns cluster history from time From (all fields), sorted %% by time. --spec get(From :: integer()) -> [{node(), monitor_sample()}]. +-spec get(From :: integer()) -> [{node(), erlperf_monitor:monitor_sample()}]. get(From) -> get(From, erlang:system_time(millisecond)). %% @doc %% Returns records between From and To (inclusive) %% Both From and To are Erlang system time in milliseconds. --spec get(From :: integer(), To :: integer()) -> [{node(), monitor_sample()}]. +-spec get(From :: integer(), To :: integer()) -> [{node(), erlperf_monitor:monitor_sample()}]. get(From, To) -> % ets:fun2ms(fun ({T, _} = R) when T =< To, T >= From -> R end => % [{{'$1','$2'},[{'=<','$1', To},{'>=','$1', From}],['$_']}] @@ -67,40 +68,20 @@ get(From, To) -> %%%=================================================================== %%% gen_server callbacks -%% Suppress dialyzer warning for OTP compatibility: erlperf runs on OTP20 -%% that does not support pg, and has pg2 instead. --dialyzer({no_missing_calls, init/1}). --compile({nowarn_deprecated_function, [{pg2, create, 1}, {pg2, join, 2}]}). --compile({nowarn_removed, [{pg2, create, 1}, {pg2, join, 2}]}). init([]) -> - try - ok = pg:join(?HISTORY_PROCESS_GROUP, self()) - catch - error:undef -> - ok = pg2:create(?HISTORY_PROCESS_GROUP), - ok = pg2:join(?HISTORY_PROCESS_GROUP, self()) - end, + ok = pg:join(erlperf, cluster_monitor, self()), ?TABLE = ets:new(?TABLE, [protected, ordered_set, named_table]), - % initial fetch (subject to race condition) - Now = erlang:system_time(millisecond), - From = Now - ?DEFAULT_HISTORY_DURATION, - {Replies, _BadNodes} = gen_server:multi_call([node() | nodes()], history, {get, From, Now}, ?RPC_TIMEOUT), - % update ETS table with all responses - [ets:insert(?TABLE, {Time, {Node, Sample}}) || {Node, #monitor_sample{time = Time}= Sample} <- Replies], {ok, #state{}}. handle_call(_Request, _From, _State) -> - error(badarg). + erlang:error(notsup). handle_cast(_Request, _State) -> - error(badarg). + erlang:error(notsup). -handle_info({Node, #monitor_sample{time = Time} = Sample}, State) -> +handle_info({Node, #{time := Time} = Sample}, State) -> ets:insert(?TABLE, {Time, {Node, Sample}}), - {noreply, maybe_clean(State)}; - -handle_info(_Info, _State) -> - error(badarg). + {noreply, maybe_clean(State)}. 
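+
+%% Example (sketch): fetch all samples collected over the last minute:
+%% erlperf_history:get(erlang:system_time(millisecond) - 60000).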
 %%%===================================================================
 %%% Internal functions
diff --git a/src/erlperf_job.erl b/src/erlperf_job.erl
new file mode 100644
index 0000000..e38e948
--- /dev/null
+++ b/src/erlperf_job.erl
@@ -0,0 +1,526 @@
+%%% @copyright (C) 2019-2022, Maxim Fedorov
+%%% @doc
+%%% Job runner, taking care of init/done, workers added and removed.
+%%% Works just like a simple_one_for_one supervisor (children are
+%%% temporary runners).
+%%% There are two benchmarking modes: continuous (activated by
+%%% setting non-zero concurrency), and sample-based (activated
+%%% manually, and deactivated after the runner performs the requested
+%%% number of iterations).
+%%%
+%%% A job is defined by up to 4 functions (the code map):
+%%% <ul>
+%%%   <li>init/0 - called once when starting the job</li>
+%%%   <li>init_runner/0 - called when starting a runner process, or
+%%%   init_runner/1 that accepts the return value of init/0. It is
+%%%   an error to omit init/0 if init_runner/1 is defined, but it is
+%%%   not an error to have init_runner/0 when init/0 exists</li>
+%%%   <li>runner/0 - ignores init_runner/0,1</li>
+%%%   <li>runner/1 - requires init_runner/0,1 return value as initial state,
+%%%   passes it as state for the next invocation (accumulator)</li>
+%%%   <li>runner/2 - requires init_runner/0,1 return value as initial state,
+%%%   passes the accumulator as a second argument</li>
+%%%   <li>done/1 - requires init/0 (and accepts its return value)</li>
+%%% </ul>
+%%% @end
+-module(erlperf_job).
+-author("maximfca@gmail.com").
+
+-behaviour(gen_server).
+
+%% Job API
+-export([
+ start/1,
+ start_link/1,
+ concurrency/1,
+ set_concurrency/2,
+ measure/2,
+ sample/1,
+ handle/1,
+ source/1
+]).
+
+%% gen_server callbacks
+-export([
+ init/1,
+ handle_call/3,
+ handle_cast/2,
+ handle_info/2,
+ terminate/2
+]).
+
+%% MFArgs: module, function, arguments.
+-type mfargs() :: {module(), atom(), [term()]}.
+
+%% Callable: one or more MFArgs, or a function object, or source code
+-type callable() ::
+ mfargs() |
+ [mfargs()] |
+ fun() |
+ fun((term()) -> term()) |
+ string().
+
+%% Benchmark code: init, init_runner, runner, done.
+-type code_map() :: #{
+ runner := callable(),
+ init => callable(),
+ init_runner => callable(),
+ done => callable()
+}.
+
+%% Internal (opaque) type, please do not use
+-type handle() :: {module(), non_neg_integer()}.
+
+%% Temporary type until OTP25 is everywhere
+-type server_ref() :: gen_server:server_ref().
+
+-export_type([callable/0, code_map/0]).
+
+%% @doc
+%% Starts the benchmark instance.
+%% Job starts with no workers, use set_concurrency/2 to start some.
+-spec start(code_map()) -> {ok, pid()} | {error, term()}.
+start(#{runner := _MustHave} = Code) ->
+ gen_server:start(?MODULE, generate(Code), []).
+
+%% @doc
+%% Starts the benchmark instance and links it to caller.
+%% Job starts with no workers, use set_concurrency/2 to start some.
+-spec start_link(code_map()) -> {ok, pid()} | {error, term()}.
+start_link(#{runner := _MustHave} = Code) ->
+ gen_server:start_link(?MODULE, generate(Code), []).
+
+%% @doc
+%% Returns the number of workers currently running this job.
+-spec concurrency(server_ref()) -> Concurrency :: non_neg_integer().
+concurrency(JobId) ->
+ gen_server:call(JobId, concurrency).
+
+%% @doc
+%% Change concurrency setting for this job.
+%% Does not reset counting.
+-spec set_concurrency(server_ref(), non_neg_integer()) -> ok.
+set_concurrency(JobId, Concurrency) ->
+ gen_server:call(JobId, {set_concurrency, Concurrency}).
+
+%% @doc
+%% Executes the runner SampleCount times, returns time in microseconds it
+%% took to execute. Similar to `timer:tc'. Has less overhead compared to
+%% continuous benchmarking, therefore can be used even for very fast functions.
+-spec measure(server_ref(), SampleCount :: non_neg_integer()) ->
+ TimeUs :: non_neg_integer() | already_started.
+measure(JobId, SampleCount) ->
+ gen_server:call(JobId, {measure, SampleCount}, infinity).
+
+%% @doc
+%% Returns the sampling handle for the job.
+-spec handle(server_ref()) -> handle().
+handle(JobId) ->
+ gen_server:call(JobId, handle).
+
+%% @doc
+%% Returns the current sample for the job, or undefined if the job has stopped.
+-spec sample(handle()) -> non_neg_integer() | undefined.
+sample({Module, Arity}) ->
+ {call_count, Count} = erlang:trace_info({Module, Module, Arity}, call_count),
+ Count.
+
+%% @doc
+%% Returns the source code that was generated for this job.
+-spec source(server_ref()) -> [string()].
+source(JobId) ->
+ gen_server:call(JobId, source).
+
+
+%%--------------------------------------------------------------------
+%% Internal definitions
+
+-include_lib("kernel/include/logger.hrl").
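+
+%% Example (sketch): time a million calls, then benchmark continuously:
+%% {ok, Job} = erlperf_job:start_link(#{runner => {rand, uniform, []}}),
+%% TimeUs = erlperf_job:measure(Job, 1000000),
+%% ok = erlperf_job:set_concurrency(Job, 2),
+%% Handle = erlperf_job:handle(Job),
+%% Calls = erlperf_job:sample(Handle). %% grows while workers run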
+ +-record(exec, { + name :: atom(), %% generated module name (must be generated for tracing to work) + source :: [string()], %% module source code + binary :: binary(), %% generated bytecode + init :: fun(() -> term()), %% init function + init_runner :: fun((term()) -> term()), %% must accept 1 argument + runner :: {fun((term()) -> term()), non_neg_integer()}, + sample_runner :: {fun((non_neg_integer(), term()) -> term()), non_neg_integer()}, + done :: fun((term()) -> term()) %% must accept 1 argument +}). + +-type exec() :: #exec{}. + +-record(erlperf_job_state, { + %% original spec + exec :: exec(), + %% return value of init/1 + init_result :: term(), + %% continuous workers + workers = [] :: [pid()], + %% temporary workers (for sample_count call) + sample_workers = #{} :: #{pid() => {pid(), reference()}} +}). + +-type state() :: #erlperf_job_state{}. + +%%%=================================================================== +%%% gen_server callbacks + +init(#exec{name = Mod, binary = Bin, init = Init, runner = {_Fun, Arity}} = Exec) -> + %% need to trap exits to avoid crashing and not cleaning up the loaded module + erlang:process_flag(trap_exit, true), + {module, Mod} = code:load_binary(Mod, Mod, Bin), + %% run the init/0 if defined + InitRet = + try Init() + catch + Class:Reason:Stack -> + %% clean up loaded module before crashing + code:purge(Mod), + code:delete(Mod), + erlang:raise(Class, Reason, Stack) + end, + %% register in the monitor + ok = erlperf_monitor:register(self(), {Mod, Arity}, 0), + %% start tracing this module runner function + 1 = erlang:trace_pattern({Mod, Mod, Arity}, true, [local, call_count]), + {ok, #erlperf_job_state{exec = Exec, init_result = InitRet}}. + +-spec handle_call(term(), {pid(), reference()}, state()) -> {reply, term(), state()}. +handle_call(handle, _From, #erlperf_job_state{exec = #exec{name = Name, runner = {_Fun, Arity}}} = State) -> + {reply, {Name, Arity}, State}; + +handle_call(concurrency, _From, #erlperf_job_state{workers = Workers} = State) -> + {reply, length(Workers), State}; + +handle_call({measure, SampleCount}, From, #erlperf_job_state{sample_workers = SampleWorkers, + exec = #exec{init_runner = InitRunner, sample_runner = SampleRunner}, + init_result = IR} = State) when SampleWorkers =:= #{} -> + {noreply, State#erlperf_job_state{sample_workers = + start_sample_count(SampleCount, From, InitRunner, IR, SampleRunner)}}; + +handle_call({measure, _SampleCount}, _From, #erlperf_job_state{} = State) -> + {reply, already_started, State}; + +handle_call(source, _From, #erlperf_job_state{exec = #exec{source = Source}} = State) -> + {reply, Source, State}; + +handle_call({set_concurrency, Concurrency}, _From, #erlperf_job_state{workers = Workers} = State) -> + {reply, ok, State#erlperf_job_state{workers = set_concurrency_impl(length(Workers), Concurrency, State)}}. + +handle_cast(_Request, _State) -> + error(notsup). + +-spec handle_info(term(), state()) -> {noreply, state()}. 
+handle_info({'EXIT', SampleWorker, Reason},
+ #erlperf_job_state{sample_workers = SampleWorkers} = State) when is_map_key(SampleWorker, SampleWorkers) ->
+ {ReplyTo, MoreSW} = maps:take(SampleWorker, SampleWorkers),
+ gen:reply(ReplyTo, Reason),
+ {noreply, State#erlperf_job_state{sample_workers = MoreSW}};
+
+handle_info({'EXIT', Worker, Reason}, #erlperf_job_state{workers = Workers} = State) when Reason =:= shutdown ->
+ {noreply, State#erlperf_job_state{workers = lists:delete(Worker, Workers)}};
+handle_info({'EXIT', Worker, Reason}, #erlperf_job_state{workers = Workers} = State) ->
+ {stop, Reason, State#erlperf_job_state{workers = lists:delete(Worker, Workers)}}.
+
+-spec terminate(term(), state()) -> ok.
+terminate(_Reason, #erlperf_job_state{init_result = IR, workers = Workers, exec = #exec{name = Mod, done = Done}} = State) ->
+ %% terminate all workers first
+ set_concurrency_impl(length(Workers), 0, State),
+ %% call "done" for cleanup
+ try Done(IR)
+ catch
+ Class:Reason:Stack ->
+ %% duly note, but do not crash, it is pointless at this moment
+ ?LOG_ERROR("Exception while executing 'done': ~s:~0p~n~0p", [Class, Reason, Stack])
+ after
+ _ = code:purge(Mod),
+ true = code:delete(Mod)
+ end.
+
+%%%===================================================================
+%%% Internal: runner implementation
+
+%% Single run
+start_sample_count(SampleCount, ReplyTo, InitRunner, InitRet, {SampleRunner, _}) ->
+ Child = erlang:spawn_link(
+ fun() ->
+ %% the worker is linked: if init_runner or the runner crashes,
+ %% the 'EXIT' reason is forwarded to the caller as the reply
+ %% (see the first handle_info clause above)
+ IRR = InitRunner(InitRet),
+ T1 = erlang:monotonic_time(),
+ SampleRunner(SampleCount, IRR),
+ T2 = erlang:monotonic_time(),
+ Time = erlang:convert_time_unit(T2 - T1, native, microsecond),
+ exit(Time)
+ end
+ ),
+ #{Child => ReplyTo}.
+
+set_concurrency_impl(OldConcurrency, Concurrency, #erlperf_job_state{workers = Workers, init_result = IR, exec = Exec}) ->
+ case Concurrency - OldConcurrency of
+ 0 ->
+ Workers;
+ NeedMore when NeedMore > 0 ->
+ Workers ++ add_workers(NeedMore, Exec, IR, []);
+ NeedLess ->
+ remove_workers(NeedLess, Workers)
+ end.
+
+add_workers(0, _ExecMap, _InitRet, NewWorkers) ->
+ %% ensure all new workers completed their InitRunner routine
+ [receive {Worker, init_runner} -> ok end || Worker <- NewWorkers],
+ NewWorkers;
+add_workers(More, #exec{init_runner = InitRunner, runner = {Runner, _RunnerArity}} = Exec, InitRet, NewWorkers) ->
+ Control = self(),
+ %% spawn all processes, and then wait until they complete InitRunner
+ Worker = erlang:spawn_link(
+ fun () ->
+ %% need to send a message even if init_runner fails, hence 'after'
+ IRR = try InitRunner(InitRet) after Control ! {self(), init_runner} end,
+ Runner(IRR)
+ end),
+ add_workers(More - 1, Exec, InitRet, [Worker | NewWorkers]).
+
+remove_workers(0, Workers) ->
+ Workers;
+remove_workers(Less, [Worker | Workers]) ->
+ exit(Worker, kill),
+ receive
+ {'EXIT', Worker, _Reason} ->
+ remove_workers(Less + 1, Workers)
+ end.
+
+%%%===================================================================
+%%% Internal: code generation
+
+%% @doc Creates an Erlang module (text) based on the code map passed.
+%% Returns module name (may be generated), runner arity (for tracing purposes),
+%% and module source code (text).
+%% Exception: raises error with Reason = {generate, {FunName, Arity, ...}}
+%%
+%% Important: early erlperf versions were generating AST (forms) instead
+%% of source code, which isn't exactly supported: the AST is an internal
+%% format that can change over time.
+-spec generate(code_map()) -> exec(). +generate(Code) -> + Name = list_to_atom(lists:concat(["job_", os:getpid(), "_", erlang:unique_integer([positive])])), + generate(Name, Code). + +generate(Name, #{runner := Runner} = Code) -> + {InitFun, InitArity, InitExport, InitText} = generate_init(Name, maps:get(init, Code, error)), + {IRFun, IRArity, IRExport, IRText} = generate_one(Name, init_runner, maps:get(init_runner, Code, error)), + {DoneFun, DoneArity, DoneExport, DoneText} = generate_one(Name, done, maps:get(done, Code, error)), + + %% RunnerArity: how many arguments _original_ runner wants to accept. + %% Example: run(State) is 1, and run() is 0. + %% Pass two function names: one that is for sample_count, and one for continuous + ContName = atom_to_list(Name), + SampleCountName = list_to_atom(ContName ++ "_" ++ integer_to_list(erlang:unique_integer([positive]))), + {RunnerFun, SampleRunnerFun, RunnerArity, RunArity, RunnerText} = generate_runner(Name, SampleCountName, Runner), + RunnerExports = [{Name, RunArity}, {SampleCountName, RunArity + 1}], + + %% verify compatibility between 4 pieces of code + %% 1. done/1 requires init/0 return value + DoneArity =:= 1 andalso InitArity =:= undefined andalso erlang:error({generate, {done, 1, requires, init}}), + %% 2. init_runner/1 requires init/0,1 + IRArity =:= 1 andalso InitArity =:= undefined andalso erlang:error({generate, {init_runner, 1, requires, init}}), + %% 3. runner/1,2 requires init/0,1 + RunnerArity > 0 andalso IRArity =:= undefined andalso erlang:error({generate, {runner, RunnerArity, requires, init_runner}}), + %% 4. runner/[3+] is not allowed + RunnerArity > 2 andalso erlang:error({generate, {runner, RunnerArity, not_supported}}), + %% 5. TODO: Verify there are no name clashes + + %% + Exports = lists:concat(lists:join(", ", [io_lib:format("~s/~b", [F, Arity]) || {F, Arity} <- + [InitExport, IRExport, DoneExport | RunnerExports], Arity =/= undefined])), + + Texts = [Text || Text <- [InitText, IRText, DoneText | RunnerText], Text =/= ""], + + Source = ["-module(" ++ atom_to_list(Name) ++ ").", "-export([" ++ Exports ++ "])." | Texts], + #exec{name = Name, binary = compile(Name, Source), init = InitFun, init_runner = IRFun, source = Source, + runner = {RunnerFun, RunArity}, sample_runner = {SampleRunnerFun, RunArity}, done = DoneFun}. + +%% generates init/0 code +generate_init(_Mod, Fun) when is_function(Fun, 0) -> + {Fun, 0, {[], undefined}, ""}; +generate_init(_Mod, {M, F, Args}) when is_atom(M), is_atom(F), is_list(Args) -> + {fun () -> erlang:apply(M, F, Args) end, 0, {[], undefined}, ""}; +generate_init(_Mod, [{M, F, Args} | _Tail] = MFAList) when is_atom(M), is_atom(F), is_list(Args) -> + [erlang:error({generate, {init, 0, invalid}}) || + {M1, F1, A} <- MFAList, not is_atom(M1) orelse not is_atom(F1) orelse not is_list(A)], + {fun () -> [erlang:apply(M1, F1, A) || {M1, F1, A} <- MFAList] end, 0, {[], undefined}, ""}; +generate_init(Mod, Text) when is_list(Text) -> + case generate_text(init, Text, false) of + {0, NewName, FullText} -> + {fun () -> Mod:NewName() end, 0, {NewName, 0}, FullText}; + {WrongArity, NewName, _} -> + erlang:error({generate, {init, NewName, WrongArity}}) + end; +generate_init(_Mod, error) -> + {fun () -> undefined end, undefined, undefined, ""}. 
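+
+%% Illustration (module and function names are unique per job): for
+%% #{runner => "rand:uniform()."} the generated source is roughly
+%% -module(job_123_1).
+%% -export([job_123_1/0, job_123_1_2/1]).
+%% job_123_1() -> rand:uniform(), job_123_1().
+%% job_123_1_2(0) -> ok; job_123_1_2(Count) -> rand:uniform(), job_123_1_2(Count - 1).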
+ +%% generates init_runner/1 or done/1 +generate_one(_Mod, _FunName, error) -> + {fun (_Ignore) -> undefined end, undefined, {[], undefined}, ""}; +generate_one(_Mod, _FunName, Fun) when is_function(Fun, 1) -> + {Fun, 1, {[], undefined}, ""}; +generate_one(_Mod, _FunName, Fun) when is_function(Fun, 0) -> + {fun (_Ignore) -> Fun() end, 0, {[], undefined}, ""}; +generate_one(_Mod, _FunName, {M, F, Args}) when is_atom(M), is_atom(F), is_list(Args) -> + {fun (_Ignore) -> erlang:apply(M, F, Args) end, 0, {[], undefined}, ""}; +generate_one(_Mod, FunName, [{M, F, Args} | _Tail] = MFAList) when is_atom(M), is_atom(F), is_list(Args) -> + [erlang:error({generate, {FunName, 1, invalid, {M1, F1, A}}}) || + {M1, F1, A} <- MFAList, not is_atom(M1) orelse not is_atom(F1) orelse not is_list(A)], + {fun (_Ignore) -> [erlang:apply(M1, F1, A) || {M1, F1, A} <- MFAList] end, 0, {[], undefined}, ""}; +generate_one(Mod, FunName, Text) when is_list(Text) -> + case generate_text(FunName, Text, false) of + {0, NewName, FullText} -> + {fun (_Ignore) -> Mod:NewName() end, 0, {NewName, 0}, FullText}; + {1, NewName, FullText} -> + {fun (Arg) -> Mod:NewName(Arg) end, 1, {NewName, 1}, FullText}; + {WrongArity, NewName, _} -> + erlang:error({generate, {FunName, WrongArity, NewName}}) + end. + +%% runner wrapper: +%% Generates at least 2 functions, one for continuous, and one for +%% sample-count benchmarking. +generate_runner(Mod, SampleCountName, Fun) when is_function(Fun, 0) -> + { + fun (_Ignore) -> Mod:Mod(Fun) end, + fun (SampleCount, _Ignore) -> Mod:SampleCountName(SampleCount, Fun) end, + 0, 1, + [lists:concat([Mod, "(Fun) -> Fun(), ", Mod, "(Fun)."]), + lists:concat([SampleCountName, "(0, _Fun) -> ok; ", SampleCountName, "(Count, Fun) -> Fun(), ", + SampleCountName, "(Count - 1, Fun)."])] + }; +generate_runner(Mod, SampleCountName, Fun) when is_function(Fun, 1) -> + { + fun (Init) -> Mod:Mod(Init, Fun) end, + fun (SampleCount, Init) -> Mod:SampleCountName(SampleCount, Init, Fun) end, + 1, 2, + [lists:concat([Mod, "(Init, Fun) -> Fun(Init), ", Mod, "(Init, Fun)."]), + lists:concat([SampleCountName, "(0, _Init, _Fun) -> ok; ", SampleCountName, "(Count, Init, Fun) -> Fun(Init), ", + SampleCountName, "(Count - 1, Init, Fun)."])] + }; +generate_runner(Mod, SampleCountName, Fun) when is_function(Fun, 2) -> + { + fun (Init) -> Mod:Mod(Init, Init, Fun) end, + fun (SampleCount, Init) -> Mod:SampleCountName(SampleCount, Init, Init, Fun) end, + 2, 3, + [lists:concat([Mod, "(Init, State, Fun) -> ", Mod, "(Init, Fun(Init, State), Fun)."]), + lists:concat([SampleCountName, "(0, _Init, _State, _Fun) -> ok; ", SampleCountName, "(Count, Init, State, Fun) -> ", + SampleCountName, "(Count - 1, Init, Fun(Init, State), Fun)."])] + }; + +%% runner wrapper: MFA +generate_runner(Mod, SampleCountName, {M, F, Args}) when is_atom(M), is_atom(F), is_list(Args) -> + { + fun (_Ignore) -> Mod:Mod(M, F, Args) end, + fun (SampleCount, _Ignore) -> Mod:SampleCountName(SampleCount, M, F, Args) end, + 0, 3, + [lists:concat([Mod, "(M, F, A) -> erlang:apply(M, F, A), ", Mod, "(M, F, A)."]), + lists:concat([SampleCountName, "(0, _M, _F, _A) -> ok; ", SampleCountName, + "(Count, M, F, A) -> erlang:apply(M, F, A), ", SampleCountName, "(Count - 1, M, F, A)."])] + }; + +%% runner wrapper: MFAList +generate_runner(Mod, SampleCountName, [{M, F, Args} | _Tail] = MFAList) when is_atom(M), is_atom(F), is_list(Args) -> + [erlang:error({generate, {runner, 0, invalid, {M1, F1, A}}}) || + {M1, F1, A} <- MFAList, not is_atom(M1) orelse not is_atom(F1) orelse not 
is_list(A)], + { + fun (_Ignore) -> Mod:Mod(MFAList) end, + fun (SampleCount, _Ignore) -> Mod:SampleCountName(SampleCount, MFAList) end, + 0, 1, + [lists:concat([Mod, "(MFAList) -> [erlang:apply(M, F, A) || {M, F, A} <- MFAList], ", Mod, "(MFAList)."]), + lists:concat([SampleCountName, "(0, _MFAList) -> ok; ", SampleCountName, + "(Count, MFAList) -> [erlang:apply(M, F, A) || {M, F, A} <- MFAList], ", SampleCountName, "(Count - 1, MFAList)."])] + }; + +generate_runner(Mod, SampleCountName, Text) when is_list(Text) -> + case generate_text(runner, Text, true) of + {0, NoDotText} -> + %% very special case: embedding the text directly, without creating a new function + %% at all. + { + fun (_Ignore) -> Mod:Mod() end, + fun (SampleCount, _Ignore) -> Mod:SampleCountName(SampleCount) end, + 0, 0, + [lists:concat([Mod, "() -> ", NoDotText, ", ", Mod, "()."]), + lists:concat([SampleCountName, "(0) -> ok;", SampleCountName, "(Count) -> ", + NoDotText, ", ", SampleCountName, "(Count - 1)."]), + ""] + }; + {0, NewName, FullText} -> + { + fun (_Ignore) -> Mod:Mod() end, + fun (SampleCount, _Ignore) -> Mod:SampleCountName(SampleCount) end, + 0, 0, + [lists:concat([Mod, "() -> ", NewName, "(), ", Mod, "()."]), + lists:concat([SampleCountName, "(0) -> ok;", SampleCountName, "(Count) -> ", + NewName, "(), ", SampleCountName, "(Count - 1)."]), + FullText] + }; + {1, NewName, FullText} -> + { + fun (Init) -> Mod:Mod(Init) end, + fun (SampleCount, Init) -> Mod:SampleCountName(SampleCount, Init) end, + 1, 1, + [lists:concat([Mod, "(Init) -> ", NewName, "(Init), ", Mod, "(Init)."]), + lists:concat([SampleCountName, "(0, _Init) -> ok;", SampleCountName, "(Count, Init) -> ", + NewName, "(Init), ", SampleCountName, "(Count - 1, Init)."]), + FullText] + }; + {2, NewName, FullText} -> + { + fun (Init) -> Mod:Mod(Init, Init) end, + fun (SampleCount, Init) -> Mod:SampleCountName(SampleCount, Init, Init) end, + 2, 2, + [lists:concat([Mod, "(Init, State) -> ", Mod, "(Init, ", NewName, "(Init, State))."]), + lists:concat([SampleCountName, "(0, _Init, _State) -> ok;", SampleCountName, "(Count, Init, State) -> ", + SampleCountName, "(Count - 1, Init, ", NewName, "(Init, State))."]), + FullText] + } + end; + +generate_runner(_Mod, _SampleCountName, Any) -> + erlang:error({generate, {parse, runner, Any}}). + +%% generates function text +generate_text(Name, Text, AllowRaw) when is_list(Text) -> + case erl_scan:string(Text) of + {ok, Scan, _} -> + case erl_parse:parse_form(Scan) of + {ok, {function, _, AnyName, Arity, _}} -> + {Arity, AnyName, Text}; + {error, _} -> + % try if it's an expr + case erl_parse:parse_exprs(Scan) of + {ok, _Clauses} when AllowRaw -> + {0, lists:droplast(Text)}; + {ok, _Clauses} -> + %% just wrap it in fun_name/0 + {0, Name, lists:concat([Name, "() -> ", Text])}; + {error, {_Line, ParseMod, Es}} -> + Errors = ParseMod:format_error(Es), + erlang:error({generate, {parse, Name, Errors}}) + end + end; + {error, ErrorInfo, ErrorLocation} -> + error({generate, {scan, Name, ErrorInfo, ErrorLocation}}) + end. + +%% @doc Compiles text string into a binary module ready for code loading. +compile(Name, Lines) -> + %% might not be the best way, but OTP simply does not have file:compile(Source, ...) + %% Original design was to write the actual source file to temporary disk location, + %% but for diskless or write-protected hosts it was less convenient. 
+ Tokens = [begin {ok, T, _} = erl_scan:string(Line), T end || Line <- Lines], + Forms = [begin {ok, F} = erl_parse:parse_form(T), F end || T <- Tokens], + + case compile:forms(Forms, [no_spawn_compiler_process, binary, return]) of + {ok, Name, Bin} -> + Bin; + {ok, Name, Bin, _Warnings} -> + Bin; + {error, Errors, Warnings} -> + erlang:error({compile, Errors, Warnings}) + end. diff --git a/src/ep_job_sup.erl b/src/erlperf_job_sup.erl similarity index 52% rename from src/ep_job_sup.erl rename to src/erlperf_job_sup.erl index 490caf3..2ff9ff1 100644 --- a/src/ep_job_sup.erl +++ b/src/erlperf_job_sup.erl @@ -1,11 +1,8 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov -%%% Supervisor for all jobs running on this node. +%%% @copyright (C) 2019-2022, Maxim Fedorov +%%% Supervises statically started jobs. %%% @end -%%%------------------------------------------------------------------- - --module(ep_job_sup). +-module(erlperf_job_sup). +-author("maximfca@gmail.com"). -behaviour(supervisor). @@ -26,8 +23,8 @@ init([]) -> period => 60}, [ #{ - id => ep_job, - start => {ep_job, start_link, []}, - modules => [ep_job] + id => erlperf_job, + start => {erlperf_job, start_link, []}, + modules => [erlperf_job] } ]}}. diff --git a/src/ep_monitor.erl b/src/erlperf_monitor.erl similarity index 52% rename from src/ep_monitor.erl rename to src/erlperf_monitor.erl index 7d52778..e6da348 100644 --- a/src/ep_monitor.erl +++ b/src/erlperf_monitor.erl @@ -1,18 +1,19 @@ %%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov +%%% @copyright (C) 2019-2022, Maxim Fedorov %%% @doc %%% System monitor: scheduler, RAM, and benchmarks throughput -%%% sampled. +%%% samples. %%% @end --module(ep_monitor). +-module(erlperf_monitor). -author("maximfca@gmail.com"). -behaviour(gen_server). %% API -export([ - start_link/0 + start/0, + start_link/0, + register/3 ]). %% gen_server callbacks @@ -20,27 +21,52 @@ init/1, handle_call/3, handle_cast/2, - handle_info/2, - terminate/2 + handle_info/2 ]). -include_lib("kernel/include/logger.hrl"). --include("monitor.hrl"). - --define(SERVER, ?MODULE). - -define(DEFAULT_TICK_INTERVAL_MS, 1000). +%% Monitoring sampling structure +-type monitor_sample() :: #{ + time => integer(), + sched_util => float(), + dcpu => float(), + dio => float(), + processes => integer(), + ports => integer(), + ets => integer(), + memory_total => non_neg_integer(), + memory_processes => non_neg_integer(), + memory_binary => non_neg_integer(), + memory_ets => non_neg_integer(), + jobs => [{Job :: pid(), Cycles :: non_neg_integer()}] +}. + +-export_type([monitor_sample/0]). %%-------------------------------------------------------------------- +%% @doc +%% Starts the server (unlinked, not supervised, used only for +%% isolated BEAM runs) +-spec(start() -> {ok, Pid :: pid()} | {error, Reason :: term()}). +start() -> + gen_server:start({local, ?MODULE}, ?MODULE, [], []). + %% @doc %% Starts the server --spec(start_link() -> - {ok, Pid :: pid()} | ignore | {error, Reason :: term()}). +-spec(start_link() -> {ok, Pid :: pid()} | {error, Reason :: term()}). start_link() -> - gen_server:start_link({local, ?SERVER}, ?MODULE, [], []). + gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). + +%% @doc +%% Registers job to monitor (ignoring failures, as monitor may not be +%% running). +-spec register(pid(), term(), non_neg_integer()) -> ok. 
+register(Job, Handle, Initial) -> + gen_server:cast(?MODULE, {register, Job, Handle, Initial}). %%%=================================================================== %%% gen_server callbacks @@ -48,7 +74,7 @@ start_link() -> %% System monitor state -record(state, { % bi-map of job processes to counters - jobs = [] :: [{pid(), CRef :: reference(), Prev :: integer()}], + jobs :: [{pid(), Handle :: erlperf_job:handle(), Prev :: integer()}] , % scheduler data saved from last call sched_data :: [{pos_integer(), integer(), integer()}], % number of normal schedulers @@ -61,21 +87,18 @@ start_link() -> }). init([]) -> - % subscribe to job events - erlang:process_flag(trap_exit, true), - ep_event_handler:subscribe(?JOB_EVENT, job), - % + %% subscribe to jobs starting up + %% TODO: figure out if there is a way to find jobs after restart. + %% ask a supervisor? but not all jobs are supervised... + Jobs = [], + %% Jobs = [{Pid, erlperf_job:handle(Pid), 0} || + %% {_, Pid, _, _} <- try supervisor:which_children(erlperf_job_sup) catch exit:{noproc, _} -> [] end], + %% [monitor(process, Pid) || {Pid, _, _} <- Jobs], + %% enable scheduler utilisation calculation erlang:system_flag(scheduler_wall_time, true), Tick = ?DEFAULT_TICK_INTERVAL_MS, - % start timing right now, and try to align for the middle of the interval - % to simplify collecting cluster-wide data - % or not? Let's decide later. - %Next = erlang:monotonic_time(millisecond) + Tick - erlang:system_time(millisecond) rem Tick + Tick div 2, Next = erlang:monotonic_time(millisecond) + Tick, erlang:start_timer(Next, self(), tick, [{abs, true}]), - %% TODO: is it possible to avoid hacking over supervisor here? - Jobs = [{Pid, ep_job:get_counters(Pid), 0} || {_, Pid, _, _} <- supervisor:which_children(ep_job_sup)], - % init done {ok, #state{ tick = Tick, jobs = Jobs, @@ -86,33 +109,17 @@ init([]) -> }. handle_call(_Request, _From, _State) -> - error(badarg). + erlang:error(notsup). -handle_cast(_Request, _State) -> - error(badarg). +handle_cast({register, Job, Handle, Initial}, #state{jobs = Jobs} = State) -> + monitor(process, Job), + {noreply, State#state{jobs = [{Job, Handle, Initial} | Jobs]}}. -terminate(_Reason, _State) -> - ep_event_handler:unsubscribe(?JOB_EVENT, job). - -%%-------------------------------------------------------------------- -%% @doc -%% Starts the server - -handle_info({job, {started, Pid, _Code, CRef}}, #state{jobs = Jobs} = State) -> - monitor(process, Pid), - % emit system event for anyone willing to record this (e.g. benchmark_store process) - {noreply, State#state{jobs = [{Pid, CRef, 0} | Jobs]}}; - -handle_info({'DOWN', _MRef, process, Pid, Reason}, #state{jobs = Jobs} = State) -> - Reason =/= normal andalso Reason =/= shutdown andalso - ?LOG_NOTICE("Job ~p exited with ~p", [Pid, Reason]), +handle_info({'DOWN', _MRef, process, Pid, _Reason}, #state{jobs = Jobs} = State) -> {noreply, State#state{jobs = lists:keydelete(Pid, 1, Jobs)}}; handle_info({timeout, _, tick}, State) -> - {noreply, handle_tick(State)}; - -handle_info(_Info, _State) -> - error(badarg). + {noreply, handle_tick(State)}. 
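With the gen_event bus removed, jobs now register with the monitor explicitly via `erlperf_monitor:register/3`, and every tick is delivered as a plain `monitor_sample()` map to `pg` group members (see `handle_tick/1` below). A minimal subscriber sketch, assuming the `erlperf` pg scope is already running; `print_samples/1` and `sample_loop/1` are hypothetical helper names, not part of the API:

```erlang
%% Minimal subscriber sketch, assuming the `erlperf' pg scope is up.
%% print_samples/1 and sample_loop/1 are hypothetical names for illustration.
print_samples(N) ->
    ok = pg:join(erlperf, {erlperf_monitor, node()}, self()),
    sample_loop(N),
    pg:leave(erlperf, {erlperf_monitor, node()}, self()).

sample_loop(0) -> ok;
sample_loop(N) ->
    receive
        #{time := Time, sched_util := Sched, jobs := Jobs} ->
            io:format("~b: ~.2f% sched, jobs: ~p~n", [Time, Sched, Jobs]),
            sample_loop(N - 1)
    end.
```

Sending plain messages to pg members removes the supervision coupling that the old `ep_event_handler` subscribe/unsubscribe cycle required.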
%%%=================================================================== %%% Internal functions @@ -122,26 +129,33 @@ handle_tick(#state{sched_data = Data, normal = Normal, dcpu = Dcpu} = State) -> {NU, DU, DioU} = fold_normal(Data, NewSched, Normal, Dcpu, 0, 0), % add benchmarking info {Jobs, UpdatedJobs} = lists:foldl( - fun ({Pid, CRef, Prev}, {J, Save}) -> - Cycles = atomics:get(CRef, 1), - {[{Pid, Cycles - Prev} | J], [{Pid, CRef, Cycles} | Save]} + fun ({Pid, Handle, Prev}, {J, Save}) -> + Cycles = + case erlperf_job:sample(Handle) of + C when is_integer(C) -> C; + undefined -> Prev %% job is stopped, race condition here + end, + {[{Pid, Cycles - Prev} | J], [{Pid, Handle, Cycles} | Save]} end, {[], []}, State#state.jobs), % - Sample = #monitor_sample{ - time = erlang:system_time(millisecond), - memory_total = erlang:memory(total), - memory_processes = erlang:memory(processes), - memory_binary = erlang:memory(binary), - memory_ets = erlang:memory(ets), - sched_util = NU * 100, - dcpu = DU * 100, - dio = DioU * 100, - processes = erlang:system_info(process_count), - ports = erlang:system_info(port_count), - ets = erlang:system_info(ets_count), - jobs = Jobs}, - % notify subscribers - gen_event:notify(?SYSTEM_EVENT, Sample), + Sample = #{ + time => erlang:system_time(millisecond), + memory_total => erlang:memory(total), + memory_processes => erlang:memory(processes), + memory_binary => erlang:memory(binary), + memory_ets => erlang:memory(ets), + sched_util => NU * 100, + dcpu => DU * 100, + dio => DioU * 100, + processes => erlang:system_info(process_count), + ports => erlang:system_info(port_count), + ets => erlang:system_info(ets_count), + jobs => Jobs}, + % notify local subscribers + [Pid ! Sample || Pid <- pg:get_members(erlperf, {erlperf_monitor, node()})], + % notify global subscribers + [Pid ! {node(), Sample} || Pid <- pg:get_members(erlperf, cluster_monitor)], + %% NextTick = State#state.next_tick + State#state.tick, erlang:start_timer(NextTick, self(), tick, [{abs, true}]), State#state{sched_data = NewSched, next_tick = NextTick, jobs = lists:reverse(UpdatedJobs)}. diff --git a/src/erlperf_sup.erl b/src/erlperf_sup.erl index 2e0e14c..64be90f 100644 --- a/src/erlperf_sup.erl +++ b/src/erlperf_sup.erl @@ -1,12 +1,11 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (C) 2019, Maxim Fedorov +%%% @copyright (C) 2019-2022, Maxim Fedorov %%% @doc -%%% Top-level supervisor, spawns job supervisor and monitor. +%%% Top-level supervisor. Always starts process group scope +%%% for `erlperf'. Depending on the configuration starts +%%% a number of jobs or a cluster-wide monitoring solution. %%% @end -%%%------------------------------------------------------------------- - -module(erlperf_sup). +-author("maximfca@gmail.com"). -behaviour(supervisor). @@ -15,46 +14,37 @@ init/1 ]). --include("monitor.hrl"). - -spec start_link() -> supervisor:startlink_ret(). start_link() -> supervisor:start_link({local, ?MODULE}, ?MODULE, []). -spec init([]) -> {ok, {supervisor:sup_flags(), [supervisor:child_spec()]}}. 
init([]) -> - SupFlags = #{strategy => rest_for_one, - intensity => 2, - period => 60}, - %% OTP 24 support: pg2 module was removed, replaced with pg - %% However `pg` is not started by default, so run it as a part - %% of erlperf supervision tree - PGSpec = - try - pg:module_info(), - undefined = whereis(pg), - [#{id => pg, start => {pg, start_link, []}}] - catch - error:undef -> []; - error:{badmatch, Pid} when is_pid(Pid) -> [] - end, + SupFlags = #{strategy => rest_for_one, intensity => 2, period => 60}, ChildSpecs = [ - % event bus for job-related changes, started-stopped jobs - #{id => ?JOB_EVENT, - start => {gen_event, start_link, [{local, ?JOB_EVENT}]}, - modules => dynamic}, - - % supervisor for all concurrently running jobs - #{id => ep_job_sup, - start => {ep_job_sup, start_link, []}, + %% start own pg scope, needed for cluster-wide operations + %% even if the node-wide monitoring is not running, the scope + %% needs to be up to send "job started" events for the cluster + #{ + id => pg, + start => {pg, start_link, [erlperf]}, + modules => [pg] + }, + + %% monitoring + #{ + id => erlperf_monitor, + start => {erlperf_monitor, start_link, []}, + modules => [erlperf_monitor] + }, + + %% supervisor for statically started jobs + #{ + id => erlperf_job_sup, + start => {erlperf_job_sup, start_link, []}, type => supervisor, - modules => [ep_job_sup]}, + modules => [erlperf_job_sup] + }], - % supervisor for node & cluster monitoring - #{id => ep_monitor_sup, - start => {ep_monitor_sup, start_link, []}, - type => supervisor, - modules => [ep_monitor_sup]} - ], - {ok, {SupFlags, PGSpec ++ ChildSpecs}}. + {ok, {SupFlags, ChildSpecs}}. diff --git a/test/ep_cluster_history_SUITE.erl b/test/ep_cluster_history_SUITE.erl deleted file mode 100644 index 92cb202..0000000 --- a/test/ep_cluster_history_SUITE.erl +++ /dev/null @@ -1,140 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov -%%% @doc -%%% Tests cluster monitor -%%% @end - --module(ep_cluster_history_SUITE). --author("maximfca@gmail.com"). - -%% Common Test headers - --include_lib("stdlib/include/assert.hrl"). - --include_lib("erlperf/include/monitor.hrl"). - -%% Test server callbacks --export([ - suite/0, - all/0, - init_per_suite/1, end_per_suite/1, - init_per_testcase/2, end_per_testcase/2 -]). - -%% Test cases --export([ - errors/0, errors/1, - monitor_cluster/0, monitor_cluster/1 -]). - --export([ - handle_update/2 -]). - -suite() -> - [{timetrap, {seconds, 10}}]. - -init_per_suite(Config) -> - % for this one, need distributed node + full app running - test_helpers:ensure_started(erlperf, test_helpers:ensure_distributed(Config)). - -end_per_suite(Config) -> - test_helpers:maybe_undistribute(test_helpers:ensure_stopped(Config)). - -init_per_testcase(_TestCase, Config) -> - Config. - -end_per_testcase(_TestCase, _Config) -> - ok. - -all() -> - [errors, monitor_cluster]. - -%%-------------------------------------------------------------------- -%% TEST CASES - -errors() -> - [{doc, "Test that we do not handle unknown stuff"}]. 
- -errors(_Config) -> - ?assertException(error, badarg, ep_cluster_history:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_cluster_history:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_cluster_history:handle_info(undefined, undefined)), - % - ?assertException(error, badarg, ep_cluster_monitor:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_cluster_monitor:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_cluster_monitor:handle_info(undefined, undefined)), - % - ?assertException(error, badarg, ep_job:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_job:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_job:handle_info(undefined, undefined)). - - -monitor_cluster() -> - [{doc, "Tests cluster-wide history, also piggy-back for cluster_monitor here"}]. - -%% TODO: remove piggy-backing - -monitor_cluster(Config) -> - Started = erlang:system_time(millisecond), - Control = self(), - % start cluster monitor for [time, sched_util, jobs] only - {ok, Pid} = ep_cluster_monitor:start({?MODULE, handle_update, [Control]}, [time, sched_util, jobs]), - % - ok = ct:capture_start(), - {ok, ClusterMonPid} = ep_cluster_monitor:start(), - LogFile = filename:join(proplists:get_value(priv_dir, Config), "cluster_log.txt"), - {ok, ClusterFilePid} = ep_cluster_monitor:start(LogFile, [time, jobs]), - % start a benchmark on the different node, and monitor progress - % link the process to kill it if timetrap fires - Remote = spawn_link( - fun () -> - erlperf:run({timer, sleep, [10]}, #{concurrency => 2, isolation => #{}}), - unlink(Control) - end), - % start a job locally - {ok, Job} = ep_job:start({timer, sleep, [1]}), - % get cluster history samples for 2 nodes - {Total, Samples} = poll_history([], erlang:system_time(millisecond), 3), - ok = ep_job:stop(Job), - unlink(Remote), - exit(Remote, kill), - % some history must be there as well - History = ep_cluster_history:get(Started), - % - ?assertNotEqual([], History), - % - ct:capture_stop(), - Console = ct:capture_get(), - % - ok = ep_cluster_monitor:stop(ClusterFilePid), - ok = ep_cluster_monitor:stop(ClusterMonPid), - ok = ep_cluster_monitor:stop(Pid), - % - ?assertNotEqual([], Console), - % - ?assertNotEqual([], Total), - ?assertNotEqual([], Samples). - -handle_update(Sample, [Control]) -> - Control ! {monitor, Sample}, - [Control]. - --define(INTERVAL, 1000). - -poll_history(Events, From, 0) when is_integer(From) -> - poll_history(Events, [], 0); -poll_history(Events, Samples, 0) -> - % collect cluster_monitor events too - receive - {monitor, Sample} -> - poll_history(Events, [Sample | Samples], 0) - after - 1 -> - {Events, Samples} - end; -poll_history(Events, From, Count) -> - timer:sleep(?INTERVAL), - History = ep_cluster_history:get(From, From + ?INTERVAL), - poll_history([History | Events], From + ?INTERVAL, Count - 1). diff --git a/test/ep_file_log_SUITE.erl b/test/ep_file_log_SUITE.erl deleted file mode 100644 index 52d8521..0000000 --- a/test/ep_file_log_SUITE.erl +++ /dev/null @@ -1,83 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov -%%% @doc -%%% Tests file_log -%%% @end -%%% ------------------------------------------------------------------- - --module(ep_file_log_SUITE). - -%% Include stdlib header to enable ?assert() for readable output --include_lib("stdlib/include/assert.hrl"). 
- -%% Test server callbacks --export([ - suite/0, - all/0, - groups/0, - init_per_suite/1, end_per_suite/1, - init_per_testcase/2, end_per_testcase/2 -]). - -%% Test cases --export([ - console/0, console/1, - errors/1, - manual_start/0, manual_start/1 -]). - -suite() -> - [{timetrap, {seconds, 10}}]. - -init_per_suite(Config) -> - test_helpers:ensure_started(erlperf, Config). - -end_per_suite(Config) -> - test_helpers:ensure_stopped(Config). - -init_per_testcase(basic, Config) -> - Config; -init_per_testcase(_TestCase, Config) -> - Config. - -end_per_testcase(_TestCase, Config) -> - Config. - -groups() -> - [{all, [parallel], [console, errors, manual_start]}]. - -%% cannot run basic & manual_start in parallel, due to specific "init" for basic. -all() -> - [{group, all}]. - -console() -> - [{doc, "Tests console logging"}]. - -console(_Config) -> - ok = ct:capture_start(), - {ok, Pid} = ep_file_log:start(erlang:group_leader()), - erlperf:run({timer, sleep, [1]}), - ep_file_log:stop(Pid), - ok = ct:capture_stop(), - Console = ct:capture_get(), - ?assertNotEqual([], Console). - -% test that we do *not* handle unknown stuff -errors(_Config) -> - ?assertException(error, badarg, ep_file_log:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_file_log:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_file_log:handle_info(undefined, undefined)), - %% piggy-backing... - ?assertException(error, badarg, ep_event_handler:handle_call(undefined, undefined)). - -manual_start() -> - [{doc, "Tests manual startup of file log"}]. - -manual_start(Config) -> - Filename = filename:join(proplists:get_value(priv_dir, Config), "file_log_manual.txt"), - {ok, Pid} = ep_file_log:start(Filename), - erlperf:run({timer, sleep, [1]}), - ep_file_log:stop(Pid), - {ok, Logs} = file:read_file(Filename), - ?assert(byte_size(Logs) > 10). \ No newline at end of file diff --git a/test/ep_history_SUITE.erl b/test/ep_history_SUITE.erl deleted file mode 100644 index e2d8d43..0000000 --- a/test/ep_history_SUITE.erl +++ /dev/null @@ -1,82 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov -%%% @doc -%%% Tests monitor -%%% @end - --module(ep_history_SUITE). - --include_lib("stdlib/include/assert.hrl"). - --include("monitor.hrl"). - -%% Test server callbacks --export([ - suite/0, - all/0, - init_per_suite/1, end_per_suite/1, - init_per_testcase/2, end_per_testcase/2 -]). - -%% Test cases --export([ - errors/0, errors/1, - get_last/0, get_last/1, - pg_clash/0, pg_clash/1 -]). - -suite() -> - [{timetrap, {seconds, 10}}]. - -init_per_suite(Config) -> - test_helpers:ensure_started(erlperf, Config). - -end_per_suite(Config) -> - test_helpers:ensure_stopped(Config). - -init_per_testcase(_TestCase, Config) -> - Config. - -end_per_testcase(_TestCase, _Config) -> - ok. - -all() -> - [errors, get_last, pg_clash]. - -%%-------------------------------------------------------------------- -%% TEST CASES - -errors() -> - [{doc, "Test that we do not handle unknown stuff"}]. 
- -errors(_Config) -> - ?assertException(error, badarg, ep_history:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_history:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_history:handle_info(undefined, undefined)), - % piggy-backing: not a good idea, but adding entire test suite for monitor_pg2_proxy is overkill - ?assertException(error, badarg, ep_monitor_proxy:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_monitor_proxy:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_monitor_proxy:handle_info(undefined, undefined)). - -get_last() -> - [{doc, "Test that getting last 5 seconds yield something"}]. - -get_last(_Config) -> - timer:sleep(1000), - Last = ep_history:get_last(5), - ?assertNotEqual([], Last). - -pg_clash() -> - [{doc, "Tests that pg does not clash if started by kernel/another app"}]. - -pg_clash(Config) when is_list(Config) -> - case code:which(pg) of - non_existing -> - {skip, {otp_version, "pg is not supported"}}; - _ -> - ok = application:stop(erlperf), - {ok, Pg} = pg:start_link(), - ok = application:start(erlperf), - gen_server:stop(Pg) - end. diff --git a/test/ep_monitor_SUITE.erl b/test/ep_monitor_SUITE.erl deleted file mode 100644 index e4a6bef..0000000 --- a/test/ep_monitor_SUITE.erl +++ /dev/null @@ -1,85 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov -%%% @doc -%%% Tests monitor -%%% @end - --module(ep_monitor_SUITE). - --include_lib("stdlib/include/assert.hrl"). - --include("monitor.hrl"). - -%% Test server callbacks --export([ - suite/0, - all/0, - init_per_suite/1, end_per_suite/1, - init_per_testcase/2, end_per_testcase/2 -]). - -%% Test cases --export([ - errors/0, errors/1, - subscribe/0, subscribe/1 -]). - -suite() -> - [{timetrap, {seconds, 10}}]. - -init_per_suite(Config) -> - test_helpers:ensure_started(erlperf, Config). - -end_per_suite(Config) -> - test_helpers:ensure_stopped(Config). - -init_per_testcase(_TestCase, Config) -> - ok = ep_event_handler:subscribe(?SYSTEM_EVENT), - Config. - -end_per_testcase(_TestCase, _Config) -> - ep_event_handler:unsubscribe(?SYSTEM_EVENT), - ok. - -all() -> - [subscribe, errors]. - -%%-------------------------------------------------------------------- -%% TEST CASES - -subscribe() -> - [{doc, "Tests monitoring subscription"}]. - -subscribe(_Config) -> - % start a benchmark and see it running on 1 scheduler - Code = {timer, sleep, [10]}, - {ok, Job} = ep_job:start(Code), - ok = ep_job:set_concurrency(Job, 4), - ?assertEqual({ok, Code, 4}, ep_job:info(Job)), - % wait for 3 seconds, receive updates - First = receive_updates(Job, 0, 2), - ok = ep_job:set_concurrency(Job, 2), - Second = receive_updates(Job, 0, 1), - ok = gen_server:stop(Job), - ?assert(First > 0), - ?assert(Second > 0), - ?assert(First > Second). - -receive_updates(_, Total, 0) -> - Total; -receive_updates(Job, Total, Count) -> - receive - #monitor_sample{jobs = [{Job, Cycles}]} -> - receive_updates(Job, Total + Cycles, Count - 1); - Other -> - ?assertEqual([], Other) - end. - -errors() -> - [{doc, "Test that we do not handle unknown stuff"}]. - -errors(_Config) -> - ?assertException(error, badarg, ep_monitor:handle_cast(undefined, undefined)), - ?assertException(error, badarg, ep_monitor:handle_call(undefined, self(), undefined)), - ?assertException(error, badarg, ep_monitor:handle_info(undefined, undefined)). 
diff --git a/test/ep_prof_SUITE.erl b/test/ep_prof_SUITE.erl deleted file mode 100644 index 04a7115..0000000 --- a/test/ep_prof_SUITE.erl +++ /dev/null @@ -1,92 +0,0 @@ -%%%------------------------------------------------------------------- -%%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov -%%% @doc -%%% Tests ctp -%%% @end -%%% ------------------------------------------------------------------- - --module(ep_prof_SUITE). - --include_lib("stdlib/include/assert.hrl"). - --export([all/0, suite/0]). --export([time/1, trace/1, record/1, extended/1, more/1]). - -suite() -> - [{timetrap, {seconds,30}}]. - -all() -> - [time, trace, record, extended, more]. - -time(_Config) -> - ok. - -trace(_Config) -> - ok. - -record(Config) when is_list(Config) -> - spawn(fun () -> do_anything(1000, 1000) end), - Trace = ep_prof:sample(all, 500, [{?MODULE, '_', '_'}], silent), - Trace2 = ep_prof:record(500, {?MODULE, '_', '_'}), - %?assert(ct:pal("~p", [Trace]), - Data = ep_prof:format_callgrind(ep_prof:trace(500)), - %ct:pal("~p", [Data]), - ?assert(is_map(Trace)), - ?assert(is_map(Trace2)), - ?assert(is_binary(Data)). - -extended(Config) when is_list(Config) -> - spawn(fun () -> do_anything(1000, 1000) end), - Data = ep_prof:time(500), - ?assert(is_list(Data)). - -more(_Config) -> - %SampleSet = ctp:sample(all, 500, [{ctp_SUITE, '_', '_'}], spawn(fun progress_printer/0)), - %length(maps:get(undefined, SampleSet)) > 10, - % ctp:run(all, 50), - {ok, _} = ep_prof:start(), - % start rand() module - otherwise we must handle on_load - rand:module_info(), - % - ok = ep_prof:start_trace(#{sample => [{'_', '_', '_'}], arity => false}), - %ok = ctp:start_trace(#{arity => true}), - % inline 'timer:sleep()' - do_anything(10000, 10000), - do_other(10000, 10000), - % - ok = ep_prof:stop_trace(), - ok = ep_prof:collect(#{progress => spawn(fun progress_printer/0)}), - {ok, Grind, Trace} = ep_prof:format(#{format => callgrind}), - %io:format("Trace: ~s~n", [binary:part(Grind, 1, 300)]), - file:write_file("/tmp/callgrind.001", Grind), - file:write_file("/tmp/data.001", term_to_binary(Trace)), - %ctp:replay(Trace), - %{ok, Grind, Trace} = ctp:format(#{format => none, sort => call_time}), - %io:format("~p~n", [lists:sublist(Grind, 1, 5)]), - %io:format("Trace: ~p~n", [Trace]), - %file:write_file("/tmp/trace.001", io_lib:format("~p", [Trace])), - ok = ep_prof:stop(), - ok. - -progress_printer() -> - receive - {Step, 1, _, _} -> - io:format("~s started ", [Step]), - progress_printer(); - {_Step, Done, Total, _} when Done rem 20 == 0-> - io:format("~b/~b ", [Done, Total]), - progress_printer(); - {_Step, Done, Done, _} -> - io:format(" complete.~n"); - _ -> - progress_printer() - end. - -do_anything(0, _) -> ok; -do_anything(C, N) when C rem 1000 == 0 -> do_anything(C-1, rand:uniform(C) + N); -do_anything(C, N) -> do_anything(C-1, N). - -do_other(0, _) -> ok; -do_other(C, N) when C rem 10 == 0 -> io_lib:format(".~b.", [N]); -do_other(C, N) -> do_other(C-1, N). diff --git a/test/erlperf_SUITE.erl b/test/erlperf_SUITE.erl index 20a6617..0fff8b2 100644 --- a/test/erlperf_SUITE.erl +++ b/test/erlperf_SUITE.erl @@ -1,6 +1,6 @@ %%%------------------------------------------------------------------- %%% @author Maxim Fedorov -%%% @copyright (c) 2019 Maxim Fedorov +%%% @copyright (c) 2019-2022 Maxim Fedorov %%% @doc %%% Tests benchmark module, machine-readable output for benchmarks. %%% @end @@ -10,14 +10,7 @@ -include_lib("stdlib/include/assert.hrl"). 
--export([ - suite/0, all/0, groups/0, - init_per_testcase/2, - end_per_testcase/2, - init_per_group/2, - end_per_group/2, - init_per_suite/1, - end_per_suite/1]). +-export([suite/0, all/0, groups/0]). -export([ start_link/0, @@ -33,13 +26,13 @@ code_gen_server/1, mfa_concurrency/1, mfa_no_concurrency/1, code_extra_node/1, mixed/0, mixed/1, crasher/0, crasher/1, undefer/0, undefer/1, compare/1, - errors/0, errors/1, formatters/1]). + errors/0, errors/1]). -export([ cmd_line_simple/1, cmd_line_verbose/1, cmd_line_compare/1, cmd_line_squeeze/1, cmd_line_usage/1, cmd_line_init/1, cmd_line_double/1, cmd_line_triple/1, cmd_line_pg/1, cmd_line_mfa/1, - cmd_line_recorded/1, cmd_line_profile/1, + cmd_line_recorded/1, cmd_line_squeeze/0 ]). @@ -53,30 +46,6 @@ suite() -> [{timetrap, {seconds, 10}}]. -init_per_suite(Config) -> - Config. - -end_per_suite(Config) -> - test_helpers:maybe_undistribute(Config). - -init_per_group(cmdline, Config) -> - Config; -init_per_group(_, Config) -> - test_helpers:ensure_started(erlperf, Config). - -end_per_group(cmdline, Config) -> - Config; -end_per_group(_, Config) -> - test_helpers:ensure_stopped(Config). - -init_per_testcase(code_extra_node, Config) -> - test_helpers:ensure_distributed(Config); -init_per_testcase(_TestCase, Config) -> - Config. - -end_per_testcase(_TestCase, _Config) -> - ok. - groups() -> [ {benchmark, [parallel], [ @@ -98,8 +67,7 @@ groups() -> mixed, crasher, undefer, compare, - errors, - formatters + errors ]}, {cmdline, [sequential], [ cmd_line_simple, @@ -112,8 +80,7 @@ groups() -> cmd_line_triple, cmd_line_pg, cmd_line_mfa, - cmd_line_recorded, - cmd_line_profile + cmd_line_recorded ]}, {squeeze, [], [ mfa_squeeze @@ -136,7 +103,7 @@ handle_call({sleep, Num}, _From, State) -> {reply, ok, State}. handle_cast(_Req, _State) -> - error(badarg). + erlang:error(notsup). start_link() -> {ok, Pid} = gen_server:start_link(?MODULE, [], []), @@ -162,11 +129,13 @@ capture_io(Fun) -> %% -warmup %% -interval -mfa(_Config) -> - C = erlperf:run({timer, sleep, [1]}), - ?assert(C > 250 andalso C < 1101). +mfa(Config) when is_list(Config) -> + C = erlperf:run(timer, sleep, [1]), + ?assert(C > 250 andalso C < 1101), + Time = erlperf:time({timer, sleep, [1]}, 100), + ?assert(Time > 100000 andalso Time < 300000). %% between 100 and 300 ms -mfa_with_cv(_Config) -> +mfa_with_cv(Config) when is_list(Config) -> C = erlperf:run({timer, sleep, [1]}, #{cv => 0.05}), ?assert(C > 250 andalso C < 1101). @@ -174,51 +143,53 @@ mfa_with_tiny_cv() -> [{doc, "Tests benchmarking with very small coefficient of variation, potentially long"}, {timetrap, {seconds, 60}}]. -mfa_with_tiny_cv(_Config) -> +mfa_with_tiny_cv(Config) when is_list(Config) -> C = erlperf:run({timer, sleep, [1]}, #{samples => 2, interval => 100, cv => 0.002}), ?assert(C > 250 andalso C < 1101). -mfa_list(_Config) -> +mfa_list(Config) when is_list(Config) -> C = erlperf:run([{rand, seed, [exrop]}, {timer, sleep, [1]}, {rand, uniform, [20]}, {timer, sleep, [1]}]), - ?assert(C > 200 andalso C < 450). + ?assert(C > 200 andalso C < 450, {out_of_range, C, 200, 450}). -mfa_fun(_Config) -> +mfa_fun(Config) when is_list(Config) -> C = erlperf:run(fun () -> timer:sleep(1) end), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -mfa_fun1(_Config) -> - C = erlperf:run(fun (undefined) -> timer:sleep(1); (ok) -> timer:sleep(1) end), - ?assert(C > 250 andalso C < 1101). 
+mfa_fun1(Config) when is_list(Config) -> + C = erlperf:run(#{runner => fun (undefined) -> timer:sleep(1); (ok) -> timer:sleep(1) end, + init_runner => fun() -> undefined end}), + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -code(_Config) -> +code(Config) when is_list(Config) -> C = erlperf:run("timer:sleep(1)."), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -code_fun(_Config) -> +code_fun(Config) when is_list(Config) -> C = erlperf:run("runner() -> timer:sleep(1)."), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -code_fun1(_Config) -> - C = erlperf:run("runner(undefined) -> timer:sleep(1)."), - ?assert(C > 250 andalso C < 1101). +code_fun1(Config) when is_list(Config) -> + C = erlperf:run(#{runner => "runner(undefined) -> timer:sleep(1), undefined.", + init_runner => "undefined."}), + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -mfa_init(_Config) -> +mfa_init(Config) when is_list(Config) -> C = erlperf:run(#{ runner => fun (1) -> timer:sleep(1) end, init => [{rand, seed, [exrop]}, {rand, uniform, [100]}], init_runner => {erlang, abs, [-1]} }), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -mfa_fun_init(_Config) -> +mfa_fun_init(Config) when is_list(Config) -> C = erlperf:run(#{ - runner => {timer, sleep, []}, + runner => fun(Timeout) -> timer:sleep(Timeout) end, init => fun () -> ok end, init_runner => fun (ok) -> 1 end }), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -code_gen_server(_Config) -> +code_gen_server(Config) when is_list(Config) -> C = erlperf:run(#{ runner => "run(Pid) -> gen_server:call(Pid, {sleep, 1}).", init => "Pid = " ++ atom_to_list(?MODULE) ++ ":start_link(), register(server, Pid), Pid.", @@ -226,13 +197,13 @@ code_gen_server(_Config) -> done => "stop(Pid) -> gen_server:stop(Pid)." }), ?assertEqual(undefined, whereis(server)), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -mfa_concurrency(_Config) -> +mfa_concurrency(Config) when is_list(Config) -> C = erlperf:run({timer, sleep, [1]}, #{concurrency => 2}), - ?assert(C > 500 andalso C < 2202). + ?assert(C > 500 andalso C < 2202, {out_of_range, C, 500, 2202}). -mfa_no_concurrency(_Config) -> +mfa_no_concurrency(Config) when is_list(Config) -> C = erlperf:run( #{ runner => fun (Pid) -> gen_server:call(Pid, {sleep, 1}) end, @@ -241,62 +212,66 @@ mfa_no_concurrency(_Config) -> done => {gen_server, stop, []} }, #{concurrency => 4}), - ?assert(C > 250 andalso C < 1101). + ?assert(C > 250 andalso C < 1101, {out_of_range, C, 250, 1101}). -code_extra_node(_Config) -> +code_extra_node(Config) when is_list(Config) -> C = erlperf:run(#{ - runner => "{ok, Timer} = application:get_env(kernel, test), timer:sleep(Timer).", + runner => "{ok, 1} = application:get_env(kernel, test), timer:sleep(1).", init => "application:set_env(kernel, test, 1)." }, #{concurrency => 2, sample_duration => 100, isolation => #{}}), ?assertEqual(undefined, application:get_env(kernel, test)), ct:pal("~p", [C]), - ?assert(C > 50 andalso C < 220). + ?assert(C > 50 andalso C < 220, {out_of_range, C, 50, 220}). crasher() -> [{doc, "Tests job that crashes"}]. -crasher(_Config) -> - C = erlperf:run({erlang, throw, [ball]}, #{concurrency => 2}), - ?assertEqual(0, C). 
+crasher(Config) when is_list(Config) -> + ?assertException(error, {benchmark, {'EXIT', _, _}}, + erlperf:run({erlang, throw, [ball]}, #{concurrency => 2})). mixed() -> [{doc, "Tests mixed approach when code co-exists with MFAs"}]. -mixed(_Config) -> +mixed(Config) when is_list(Config) -> C = erlperf:run(#{ runner => [{timer, sleep, [1]}, {timer, sleep, [2]}], init => "rand:uniform().", init_runner => fun (Int) -> Int end }), - ?assert(C > 100 andalso C < 335). + ?assert(C > 100 andalso C < 335, {out_of_range, C, 100, 335}). undefer() -> [{doc, "Tests job undefs - e.g. wrong module name"}]. -undefer(_Config) -> - ?assertException(error, {badmatch, {error, {{module_not_found, '$cannot_be_this'}, _}}}, +undefer(Config) when is_list(Config) -> + ?assertException(error, {benchmark, {'EXIT', _, {undef, _}}}, erlperf:run({'$cannot_be_this', throw, []}, #{concurrency => 2})). -compare(_Config) -> +compare(Config) when is_list(Config) -> [C1, C2] = erlperf:compare(["timer:sleep(1).", "timer:sleep(2)."], #{sample_duration => 100}), - ?assert(C1 > C2). + ?assert(C1 > C2), + %% low-overhead comparison benchmark + [T1, T2] = erlperf:benchmark([#{runner => {timer, sleep, [1]}}, #{runner => "timer:sleep(2)."}], + #{sample_duration => undefined, samples => 50}, undefined), + ?assert(T1 < T2). errors() -> [{doc, "Tests various error conditions"}]. -errors(_Config) -> - ?assertException(error, {badmatch, {error, {"empty callable", _}}}, +errors(Config) when is_list(Config) -> + ?assertException(error, {generate, {parse, init, _}}, erlperf:run(#{runner => {erlang, node, []}, init => []})), - ?assertException(error, {badmatch, {error, {"empty callable", _}}}, + ?assertException(error, {generate, {parse, runner, _}}, erlperf:run(#{runner => []})), - ?assertException(error, {badmatch, {error, {"empty callable", _}}}, + ?assertException(error, {generate, {parse, runner, _}}, erlperf:run(#{runner => {[]}})). mfa_squeeze() -> [{timetrap, {seconds, 120}}]. -mfa_squeeze(_Config) -> +mfa_squeeze(Config) when is_list(Config) -> case erlang:system_info(schedulers_online) of LowCPU when LowCPU < 3 -> skip; @@ -310,113 +285,148 @@ mfa_squeeze(_Config) -> %%-------------------------------------------------------------------- %% command-line testing +parse_qps(QPST, "") -> list_to_integer(QPST); +parse_qps(QPST, "Ki") -> list_to_integer(QPST) * 1000; +parse_qps(QPST, "Mi") -> list_to_integer(QPST) * 1000000; +parse_qps(QPST, "Gi") -> list_to_integer(QPST) * 1000000000. + +parse_duration(TT, "ns") -> list_to_integer(TT) div 1000; +parse_duration(TT, "us") -> list_to_integer(TT); +parse_duration(TT, "ms") -> list_to_integer(TT) * 1000; +parse_duration(TT, "s") -> list_to_integer(TT) * 1000000. + +filtersplit(Str, Sep) -> + [L || L <- string:split(Str, Sep, all), L =/= ""]. 
+ +parse_out(Out) -> + [Header | Lines] = filtersplit(Out, "\n"), + case filtersplit(Header, " ") of + ["Code", "||", "QPS", "Time"] -> + [begin + case filtersplit(Ln, " ") of + [Code, ConcT, QPST, TT, TTU] -> + {Code, list_to_integer(ConcT), parse_qps(QPST, ""), parse_duration(TT, TTU)}; + [Code, ConcT, QPST, QU, TT, TTU] -> + {Code, list_to_integer(ConcT), parse_qps(QPST, QU), parse_duration(TT, TTU)} + end + end || Ln <- Lines]; + ["Code", "||", "QPS", "Time", "Rel"] -> + [begin + case filtersplit(Ln, " ") of + [Code, ConcT, QPST, TT, TTU, Rel] -> + {Code, list_to_integer(ConcT), parse_qps(QPST, ""), parse_duration(TT, TTU), + list_to_integer(lists:droplast(Rel))}; + [Code, ConcT, QPST, QU, TT, TTU, Rel] -> + {Code, list_to_integer(ConcT), parse_qps(QPST, QU), parse_duration(TT, TTU), + list_to_integer(lists:droplast(Rel))} + end + end || Ln <- Lines]; + _BadRet -> + ct:pal(Out), + ?assert(false) + end. + % erlperf 'timer:sleep(1). -d 100' -cmd_line_simple(_Config) -> +cmd_line_simple(Config) when is_list(Config) -> Code = "timer:sleep(1).", - Out = capture_io(fun() -> erlperf:main([Code, "-d", "100"]) end), - [LN1, LN2] = string:split(Out, "\n"), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - ?assertMatch([Code, "1", _, "100%\n"], string:lexemes(LN2, " ")), - ok. + Out = capture_io(fun() -> erlperf_cli:main([Code, "-d", "100"]) end), + [{Code, 1, C, T}] = parse_out(Out), + ?assert(C > 25 andalso C < 110, {qps, C}), + ?assert(T > 1000 andalso T < 3000, {time, T}). % erlperf 'timer:sleep(1). -v' -cmd_line_verbose(_Config) -> +cmd_line_verbose(Config) when is_list(Config) -> Code = "timer:sleep(1).", - Out = capture_io(fun () -> erlperf:main([Code, "-v"]) end), - Lines = string:lexemes(Out, "\n"), + Out = capture_io(fun () -> erlperf_cli:main([Code, "-v"]) end), + Lines = filtersplit(Out, "\n"), + %% TODO: actually verify that stuff printed is monitoring stuff ?assert(length(Lines) > 3), - ok. + %% parse last 2 lines + [{Code, 1, C, T}] = parse_out(lists:join("\n", lists:sublist(Lines, length(Lines) - 1, 2))), + ?assert(C > 250 andalso C < 1101, {qps, C}), + ?assert(T > 1000 andalso T < 3000, {time, T}). -% erlperf 'rand:uniform().' 'crypto:strong_rand_bytes(2).' -d 100 -s 5 -w 1 -c 2 -cmd_line_compare(_Config) -> +% erlperf 'timer:sleep(1).' 'timer:sleep(2).' -d 100 -s 5 -w 1 -c 2 +cmd_line_compare(Config) when is_list(Config) -> Out = capture_io( - fun () -> erlperf:main(["timer:sleep(1).", "timer:sleep(2).", "-s", "5", "-d", "100", "-w", "1", "-c", "2"]) end), - ?assertNotEqual([], Out), - % Code Concurrency Throughput Relative - % timer:sleep(). 2 950 100% - % timer:sleep(2). 2 475 50% - ok. + fun () -> erlperf_cli:main(["timer:sleep(1).", "timer:sleep(2).", "-s", "5", "-d", "100", "-w", "1", "-c", "2"]) end), + % Code Concurrency Throughput Time Rel + % timer:sleep(). 2 950 100 ns 100% + % timer:sleep(2). 2 475 200 ns 50% + [{_Code, 2, C, T, R}, {_Code2, 2, C2, T2, R2}] = parse_out(Out), + ?assert(C > 66 andalso C < 220, {qps, C}), + ?assert(C2 > 50 andalso C2 < 110, {qps, C2}), + ?assert(T < T2), + ?assert(R > R2). cmd_line_squeeze() -> [{doc, "Tests concurrency test via command line"}, {timetrap, {seconds, 30}}]. % erlperf 'timer:sleep(1).' 
--sample_duration 50 --squeeze --min 2 --max 4 --threshold 2 -cmd_line_squeeze(_Config) -> +cmd_line_squeeze(Config) when is_list(Config) -> Out = capture_io( - fun () -> erlperf:main(["timer:sleep(1).", "--sample_duration", "50", "--squeeze", "--min", "2", "--max", "4", "--threshold", "2"]) end), - ?assertNotEqual([], Out), - ok. + fun () -> erlperf_cli:main(["timer:sleep(1).", "--duration", "50", "--squeeze", "--min", "2", "--max", "4", "--threshold", "2"]) end), + [{_Code, 4, C, T}] = parse_out(Out), + ?assert(C > 50 andalso C < 220, {qps, C}), + ?assert(T > 1000 andalso T < 3000, {time, T}). % erlperf -q -cmd_line_usage(_Config) -> - Out = capture_io(fun () -> erlperf:main(["-q"]) end), - Line1 = "error: erlperf: required argument missing: code", +cmd_line_usage(Config) when is_list(Config) -> + Out = capture_io(fun () -> erlperf_cli:main(["-q"]) end), + Line1 = "Error: erlperf: required argument missing: code", ?assertEqual(Line1, lists:sublist(Out, length(Line1))), - Out2 = capture_io(fun () -> erlperf:main(["--un code"]) end), - ?assertEqual("error: erlperf: unrecognised argument: --un code", lists:sublist(Out2, 48)), + Out2 = capture_io(fun () -> erlperf_cli:main(["--un code"]) end), + ?assertEqual("Error: erlperf: unrecognised argument: --un code", lists:sublist(Out2, 48)), ok. -% erlperf '{file,_}=code:is_loaded(slave).' --init 'code:ensure_loaded(slave).' --done 'code:purge(slave), code:delete(slave).' -cmd_line_init(_Config) -> - Code = "{file,_}=code:is_loaded(slave).", - Out = capture_io(fun () -> erlperf:main( - [Code, "--init", "code:ensure_loaded(slave).", "--done", "code:purge(slave), code:delete(slave)."]) +% erlperf '{file,_}=code:is_loaded(pool).' --init 'code:ensure_loaded(pool).' --done 'code:purge(pool), code:delete(pool).' +cmd_line_init(Config) when is_list(Config) -> + Code = "{file,_}=code:is_loaded(pool).", + Out = capture_io(fun () -> erlperf_cli:main( + [Code, "--init", "code:ensure_loaded(pool).", "--done", "code:purge(pool), code:delete(pool)."]) end), % verify 'done' was done - ?assertEqual(false, code:is_loaded(slave)), + ?assertEqual(false, code:is_loaded(pool)), % verify output - [LN1, LN2] = string:split(Out, "\n"), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - ?assertMatch([Code, "1", _, _, "100%\n"], string:lexemes(LN2, " ")), - ok. + [{_Code, 1, C, T}] = parse_out(Out), + ?assert(C > 50, {qps, C}), + ?assert(T > 0, {time, T}). % erlperf 'runner(X) -> timer:sleep(X).' --init '1.' 'runner(Y) -> timer:sleep(Y).' --init '2.' -s 2 --duration 100 -cmd_line_double(_Config) -> +cmd_line_double(Config) when is_list(Config) -> Code = "runner(X)->timer:sleep(X).", - Out = capture_io(fun () -> erlperf:main([Code, "--init", "1.", Code, "--init", "2.", "-s", "2", + Out = capture_io(fun () -> erlperf_cli:main([Code, "--init_runner", "1.", Code, "--init_runner", "2.", "-s", "2", "--duration", "100"]) end), - [LN1, LN2, _LN3 | _] = string:split(Out, "\n", all), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - ?assertMatch([Code, "1", _, "100%"], string:lexemes(LN2, " ")), - ok. - -cmd_line_triple(_Config) -> - Out = capture_io(fun () -> erlperf:main(["timer:sleep(1).", "-s", "2", "--duration", "100", + [{Code, 1, C, T, R}, {Code, 1, C2, T2, R2}] = parse_out(Out), + ?assert(C > 25 andalso C < 110, {qps, C}), + ?assert(C2 > 25 andalso C2 < 55, {qps, C2}), + ?assert(T < T2), + ?assert(R > R2). 
+ +cmd_line_triple(Config) when is_list(Config) -> + Out = capture_io(fun () -> erlperf_cli:main(["timer:sleep(1).", "-s", "2", "--duration", "100", "timer:sleep(2).", "timer:sleep(3)."]) end), - [LN1, _LN2, _LN3, LN4 | _] = string:split(Out, "\n", all), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - [_, "1", TS3QPSS, TS3PercentS] = string:lexemes(LN4, " "), - TS3QPS = list_to_integer(TS3QPSS), - TS3Percent = list_to_integer(lists:droplast(TS3PercentS)), - ?assert(TS3QPS > 20 andalso TS3QPS < 30, {"expected between 20 and 30, got", TS3QPS}), - ?assert(TS3Percent > 40 andalso TS3Percent < 60, {"expected between 40 and 60, got", TS3QPS}), + [_, _, {_, 1, C3, _T3, R3}] = parse_out(Out), + ?assert(C3 > 20 andalso C3 < 30, {"expected between 20 and 30, got", C3}), + ?assert(R3 > 40 andalso R3 < 60, {"expected between 40 and 60, got", R3}), ok. % erlperf 'runner(Arg) -> ok = pg:join(Arg, self()), ok = pg:leave(Arg, self()).' --init_runner 'pg:create(self()), self().' -cmd_line_pg(_Config) -> - case code:which(pg) of - non_existing -> - {skip, {otp_version, "pg is not supported"}}; - _ -> - ?assertEqual(undefined, whereis(scope)), %% ensure scope is not left - Code = "runner(S)->ok=pg:join(S,self()),ok=pg:leave(S,self()).", - Out = capture_io(fun () -> erlperf:main( - [Code, "--init_runner", "{ok,Scope}=pg:start_link(scope),Scope."]) - end), - [LN1, LN2] = string:split(Out, "\n"), - ?assertEqual(undefined, whereis(scope)), %% ensure runner exited - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - ?assertMatch([Code, "1", _, _, "100%\n"], string:lexemes(LN2, " ")) - end. +cmd_line_pg(Config) when is_list(Config) -> + ?assertEqual(undefined, whereis(scope)), %% ensure scope is not left + Code = "runner(S)->pg:join(S,g,self()),pg:leave(S,g,self()).", + Out = capture_io(fun () -> erlperf_cli:main( + [Code, "--init_runner", "{ok,Scope}=pg:start_link(scope),Scope."]) + end), + ?assertEqual(undefined, whereis(scope)), %% ensure runner exited + [{_Code, 1, C, _T}] = parse_out(Out), + ?assert(C > 100, {qps, C}). -% erlperf '{rand, uniform, [100]}' -cmd_line_mfa(_Config) -> +% erlperf '{rand, uniform, [4]}' +cmd_line_mfa(Config) when is_list(Config) -> Code = "{rand,uniform,[4]}", - Out = capture_io(fun () -> erlperf:main([Code]) end), - [LN1, LN2] = string:split(Out, "\n"), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), - ?assertMatch([Code, "1" | _], string:lexemes(LN2, " ")), - ok. + Out = capture_io(fun () -> erlperf_cli:main([Code]) end), + [{Code, 1, _C, _T}] = parse_out(Out). % erlperf 'runner(Arg) -> ok = pg2:join(Arg, self()), ok = pg2:leave(Arg, self()).' --init 'ets:file2tab("pg2.tab").' cmd_line_recorded(Config) -> @@ -435,42 +445,14 @@ cmd_line_recorded(Config) -> {ets, delete, [test_ets_tab, 100]} ])), % - Out = capture_io(fun () -> erlperf:main( + Out = capture_io(fun () -> erlperf_cli:main( [RecFile, "--init", "ets:file2tab(\"" ++ EtsFile ++ "\")."]) end), [LN1, LN2] = string:split(Out, "\n"), - ?assertEqual(["Code", "||", "QPS", "Rel"], string:lexemes(LN1, " ")), + ?assertEqual(["Code", "||", "QPS", "Time"], string:lexemes(LN1, " ")), ?assertMatch(["[{ets,insert,[test_ets_tab,{100,40}]},", "...]", "1" | _], string:lexemes(LN2, " ")), ok. 
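For reference, the parsing helpers above assume `erlperf_cli` output shaped like the sample below (numbers are illustrative): `parse_out/1` turns each data row into `{Code, Concurrency, QPS, TimeUs, Rel}`, expanding `Ki`/`Mi`/`Gi` QPS suffixes and normalising `ns`/`us`/`ms`/`s` times to microseconds; the `Rel` column is present only when several runners are compared.

```
Code                 ||        QPS       Time   Rel
timer:sleep(1).       2       1942    1029 us  100%
timer:sleep(2).       2        975    2051 us   50%
```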
-% profiler test -cmd_line_profile(_Config) -> - case code:which(pg) of - non_existing -> - {skip, {otp_version, "pg is not supported"}}; - _ -> - ?assertEqual(undefined, whereis(scope)), %% ensure runner is not left from the last run - Code = "runner(Arg)->ok=pg:join(Arg,1,self()),ok=pg:leave(Arg,1,self()).", - Out = capture_io(fun () -> erlperf:main( - [Code, "--init_runner", "element(2, pg:start_link(scope)).", "--profile"]) - end), - ?assertEqual(undefined, whereis(scope)), %% ensure runner exited - [LN1 | _] = string:split(Out, "\n"), - ?assertEqual("Reading trace data...", LN1) - end. - -formatters(_Config) -> - ?assertEqual("88", erlperf:format_size(88)), - ?assertEqual("88000", erlperf:format_number(88000)), - ?assertEqual("881 Mb", erlperf:format_size(881 * 1024 * 1024)), - ?assertEqual("881 Mb", erlperf:format_size(881 * 1024 * 1024)), - ?assertEqual("123 Gb", erlperf:format_size(123 * 1024 * 1024 * 1024)), - % rounding - ?assertEqual("42", erlperf:format_number(42)), - ?assertEqual("432 Ki", erlperf:format_number(431992)), - ?assertEqual("333 Mi", erlperf:format_number(333000000)), - ?assertEqual("999 Gi", erlperf:format_number(998500431992)). - %%-------------------------------------------------------------------- %% record-replay diff --git a/test/erlperf_file_log_SUITE.erl b/test/erlperf_file_log_SUITE.erl new file mode 100644 index 0000000..4e85132 --- /dev/null +++ b/test/erlperf_file_log_SUITE.erl @@ -0,0 +1,66 @@ +%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @doc +%%% Tests erlperf_file_log +%%% @end +-module(erlperf_file_log_SUITE). +-author("maximfca@gmail.com"). + +%% Include stdlib header to enable ?assert() for readable output +-include_lib("stdlib/include/assert.hrl"). + +%% Test server callbacks +-export([ + suite/0, + all/0 +]). + +%% Test cases +-export([ + file_log/0, file_log/1, + formatters/0, formatters/1 +]). + +suite() -> + [{timetrap, {seconds, 10}}]. + +all() -> + [formatters, file_log]. + +formatters() -> + [{doc, "Basic tests for formatters like Ki (Kilo-calls) and Kb (Kilo-bytes)"}]. + +formatters(Config) when is_list(Config) -> + ?assertEqual("88", erlperf_file_log:format_size(88)), + ?assertEqual("88000", erlperf_file_log:format_number(88000)), + ?assertEqual("881 Mb", erlperf_file_log:format_size(881 * 1024 * 1024)), + ?assertEqual("881 Mb", erlperf_file_log:format_size(881 * 1024 * 1024)), + ?assertEqual("123 Gb", erlperf_file_log:format_size(123 * 1024 * 1024 * 1024)), + % rounding + ?assertEqual("42", erlperf_file_log:format_number(42)), + ?assertEqual("432 Ki", erlperf_file_log:format_number(431992)), + ?assertEqual("333 Mi", erlperf_file_log:format_number(333000000)), + ?assertEqual("999 Gi", erlperf_file_log:format_number(998500431992)). + +file_log() -> + [{doc, "Tests console and file logging sanity and equality"}]. 
+ +file_log(Config) when is_list(Config) -> + {ok, Pg} = pg:start_link(erlperf), + {ok, Mon} = erlperf_monitor:start_link(), + Filename = filename:join(proplists:get_value(priv_dir, Config), "file_log_manual.txt"), + ok = ct:capture_start(), + {ok, FileLog} = erlperf_file_log:start_link(Filename), + {ok, ConsoleLog} = erlperf_file_log:start_link(erlang:group_leader()), + erlperf:run(timer, sleep, [1]), + ok = ct:capture_stop(), + [gen:stop(Srv) || Srv <- [ConsoleLog, FileLog, Mon, Pg]], + ConsoleLines = ct:capture_get(), + Console = list_to_binary(lists:concat(ConsoleLines)), + {ok, Logs} = file:read_file(Filename), + ?assertEqual(Logs, Console), + ?assert(length(ConsoleLines) > 3, {"at least header and 3 samples are expected to be printed", ConsoleLines}), + %% header must contain the job + [Hdr, S1, S2, S3 | _] = [string:trim(lists:last(string:lexemes(Line, " "))) || Line <- ConsoleLines], + ?assert(is_pid(list_to_pid(Hdr))), + Samples = [list_to_integer(S) || S <- [S1, S2, S3]], + [?assert(Sample > 10 andalso Sample < 1000) || Sample <- Samples]. diff --git a/test/erlperf_history_SUITE.erl b/test/erlperf_history_SUITE.erl new file mode 100644 index 0000000..31b729f --- /dev/null +++ b/test/erlperf_history_SUITE.erl @@ -0,0 +1,147 @@ +%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @doc +%%% Tests combination of erlperf_monitor, erlperf_cluster_monitor, +%%% erlperf_history and erlperf_job. This is an integration test +%%% for the entire cluster monitoring subsystem. +%%% @end +-module(erlperf_history_SUITE). +-author("maximfca@gmail.com"). + +%% Common Test headers + +-include_lib("stdlib/include/assert.hrl"). + +%% Test server callbacks +-export([suite/0, all/0]). + +%% Test cases +-export([ + monitor_cluster/0, monitor_cluster/1 +]). + +-export([ + handle_update/2 +]). + +suite() -> + [{timetrap, {seconds, 10}}]. + +all() -> + [monitor_cluster]. + +%%-------------------------------------------------------------------- +%% TEST CASES + +monitor_cluster() -> + [{doc, "Tests cluster-wide history, also piggy-back for cluster_monitor here"}]. + +%% TODO: remove piggy-backing + +monitor_cluster(Config) -> + {ok, AppSup} = erlperf_sup:start_link(), + ?assertNotEqual(undefined, whereis(erlperf_monitor)), + {ok, HistPid} = erlperf_history:start_link(), + + Control = self(), + LogFile = filename:join(proplists:get_value(priv_dir, Config), "cluster_log.txt"), + ok = ct:capture_start(), + + AllFields = [time, sched_util, dcpu, dio, processes, ports, ets, memory_total, memory_processes, memory_ets, jobs], + + %% start cluster monitor + {ok, ClusterHandlePid} = erlperf_cluster_monitor:start_link({?MODULE, handle_update, [Control]}, AllFields), + %% start another cluster monitor (now printing to console) + {ok, ClusterMonPid} = erlperf_cluster_monitor:start_link(erlang:group_leader(), AllFields), + %% start 3rd cluster monitor printing to a file + {ok, ClusterFilePid} = erlperf_cluster_monitor:start_link(LogFile, AllFields), + + Started = erlang:system_time(millisecond), + %% start a benchmark on the different node, and monitor progress + %% link the process to kill it if timetrap fires + Remote = spawn_link( + fun () -> + try + erlperf:run({timer, sleep, [10]}, #{concurrency => 2, isolation => #{}}), + Control ! {done, self()} + catch Class:Reason:Stack -> + Control ! 
{error, self(), Class, Reason, Stack} + end + end), + + %% start a job locally + {ok, Job} = erlperf_job:start(#{runner => {timer, sleep, [2]}}), + {ok, Job2} = erlperf_job:start(#{runner => {timer, sleep, [10]}}), + %% set concurrency + erlperf_job:set_concurrency(Job, 1), + erlperf_job:set_concurrency(Job2, 1), + + %% get cluster history samples for 2 nodes + Samples = poll_history([], 3), + History = erlperf_history:get(Started), + + %% capture text output + ct:capture_stop(), + Console = ct:capture_get(), + + %% stop local jobs + ok = gen:stop(Job2), + ok = gen:stop(Job), + + %% wait for remote job to stop + receive + {done, Remote} -> + ok; + {error, Remote, C, R, S} -> + erlang:error({remote, C, R, S}) + after 5000 -> + erlang:error({timeout, stopping, Remote}) + end, + + {ok, FileBin} = file:read_file(LogFile), + [ok = gen:stop(Pid) || Pid <- [ClusterFilePid, ClusterMonPid, ClusterHandlePid, HistPid, AppSup]], + %% verify historical samples are present in the history server + ?assertNotEqual([], History), + SmallHistory = [{N, maps:with(AllFields, Evt)} || {N, Evt} <- History], + ?assertEqual([], Samples -- SmallHistory, {missing_events, Samples, SmallHistory}), + ?assertEqual([], SmallHistory -- Samples, {extra_events, SmallHistory, Samples}), + + %% Samples from the local node must contain 2 jobs with non-zero count + %% Remote samples have 1 job + [begin + Node =:= node() andalso ?assertEqual(2, length(Jobs), {jobs, Node, Jobs}), + [?assert(Count > 0, {count, Node, Jobs}) || {_Job, Count} <- Jobs] + end || {Node, #{jobs := Jobs}} <- Samples], + + %% verify console output vs. file output + ?assertNotEqual([], Console), + ConsoleBin = list_to_binary(lists:flatten(Console)), + + %% parse file and compare with Samples + FileSamples = binary_to_samples(FileBin), + ConsoleSamples = binary_to_samples(ConsoleBin), + ?assertEqual([], FileSamples -- Samples, {missing_events, FileSamples -- Samples}), + ?assertEqual([], ConsoleSamples -- Samples, {missing_events, ConsoleSamples -- Samples}). + +binary_to_samples(Bin) -> + Split = binary:split(Bin, <<"\n">>, [global]), + Parsed = [[Col || Col <- binary:split(Line, <<" ">>, [global]), Col =/= <<>>] || Line <- Split, Line =/= <<>>], + [{binary_to_atom(Node), #{time => calendar:rfc3339_to_system_time(binary_to_list(DateTime)), + sched_util => binary_to_float(Sched), jobs => Jobs}} + || [Node, DateTime, Sched, Jobs] <- Parsed]. + +handle_update(Sample, [Control]) -> + Control ! {monitor, Sample}, + [Control]. + +-define(INTERVAL, 1000). + +poll_history(Events, 0) -> + lists:reverse(Events); +poll_history(Events, Count) -> + % collect cluster_monitor events too + receive + {monitor, Sample} -> + poll_history(Sample ++ Events, Count - 1) + after 5000 -> + erlang:error(timeout) + end. diff --git a/test/erlperf_job_SUITE.erl b/test/erlperf_job_SUITE.erl new file mode 100644 index 0000000..062b0db --- /dev/null +++ b/test/erlperf_job_SUITE.erl @@ -0,0 +1,175 @@ +%%% @copyright (c) 2019-2022 Maxim Fedorov +%%% @doc +%%% Tests all combinations of code maps accepted by erlperf_job +%%% @end +-module(erlperf_job_SUITE). +-author("maximfca@gmail.com"). + +%% Include stdlib header to enable ?assert() for readable output +-include_lib("stdlib/include/assert.hrl"). + +%% Test server callbacks +-export([ + suite/0, + all/0, + groups/0 +]). 
+
+%% Test cases
+-export([
+    overhead/0, overhead/1,
+    runner_code/1, runner_mfa/1, runner_fun/1, runner_mod_fun/1,
+    runner_fun1/1, runner_fun2/1, runner_code_fun/1, runner_code_fun1/1,
+    runner_code_fun2/1, runner_code_name2/1,
+    %% internal export
+    sleep/1
+]).
+
+suite() ->
+    [{timetrap, {minutes, 1}}].
+
+groups() ->
+    [{variants, [parallel], [runner_code, runner_mfa, runner_fun, runner_mod_fun,
+        runner_fun1, runner_fun2, runner_code_fun, runner_code_fun1,
+        runner_code_fun2, runner_code_name2]}].
+
+all() ->
+    [{group, variants}, overhead].
+
+sleep(Time) ->
+    timer:sleep(Time), Time.
+
+overhead() ->
+    [{doc, "Measures a bounded number of iterations instead of a specific duration"}].
+
+overhead(Config) when is_list(Config) ->
+    SampleCount = 10000000,
+    %% special case: code with no init/state/MFA/fun/... can be optimised
+    %% to a simple "for-loop" and executed with the least overhead
+    {ok, Job} = erlperf_job:start_link(#{runner => "rand:uniform(1000)."}),
+    Sampler = erlperf_job:handle(Job),
+    TimeUs = erlperf_job:measure(Job, SampleCount),
+    %% measure the same thing now with continuous benchmark
+    ok = erlperf_job:set_concurrency(Job, 1),
+    %% fetch a sample
+    Start = erlperf_job:sample(Sampler),
+    timer:sleep(1000),
+    Finish = erlperf_job:sample(Sampler),
+    gen:stop(Job),
+    ContinuousQPS = Finish - Start,
+    TimeProratedQPS = SampleCount * 1000000 div TimeUs,
+    Effy = ContinuousQPS * 100 div TimeProratedQPS,
+    ct:pal("Continuous benchmarking efficiency: ~b% (~b us for ~b iterations, ~b continuous)~n",
+        [Effy, TimeUs, SampleCount, ContinuousQPS]),
+    ?assert(Effy > 50, {efficiency, Effy}).
+
+%% The code below is a simple hack to run the variants with some parallelism.
+%% The original code just had RunnerVariants as a list comprehension.
+
+runner_code(Config) when is_list(Config) ->
+    variants("timer:sleep(1).", ?FUNCTION_NAME).
+
+runner_mfa(Config) when is_list(Config) ->
+    variants({timer, sleep, [1]}, ?FUNCTION_NAME).
+
+runner_fun(Config) when is_list(Config) ->
+    variants(fun () -> timer:sleep(1) end, ?FUNCTION_NAME).
+
+runner_mod_fun(Config) when is_list(Config) ->
+    variants(fun ?MODULE:sleep/1, ?FUNCTION_NAME).
+
+runner_fun1(Config) when is_list(Config) ->
+    variants(fun (1) -> timer:sleep(1), 1 end, ?FUNCTION_NAME).
+
+runner_fun2(Config) when is_list(Config) ->
+    variants(fun (1, 1) -> timer:sleep(1), 1 end, ?FUNCTION_NAME).
+
+runner_code_fun(Config) when is_list(Config) ->
+    variants("runner() -> timer:sleep(1).", ?FUNCTION_NAME).
+
+runner_code_fun1(Config) when is_list(Config) ->
+    variants("runner(1) -> timer:sleep(1), 1.", ?FUNCTION_NAME).
+
+runner_code_fun2(Config) when is_list(Config) ->
+    variants("runner(1, 1) -> timer:sleep(1), 1.", ?FUNCTION_NAME).
+
+runner_code_name2(Config) when is_list(Config) ->
+    variants("baz(1, 1) -> timer:sleep(1), 1.", ?FUNCTION_NAME).
+
+variants() ->
+    [{doc, "Tests various representations of the code mapping to resulting Erlang"}].
+
+variants(Runner, ProcName) ->
+    ProcStr = atom_to_list(ProcName),
+    Sep = io_lib:format("~n ", []),
+    %% register this process to send messages from init/init_runner/done
+    register(ProcName, self()),
+    %%
+    InitVariants = [
+        undefined,
+        {erlang, send, [ProcName, init]},
+        fun () -> erlang:send(ProcName, init) end,
+        lists:concat(["erlang:send(" ++ ProcStr ++ ", init)."]),
+        lists:concat(["init() -> erlang:send(" ++ ProcStr ++ ", init)."]),
+        lists:concat(["foo() -> erlang:send(" ++ ProcStr ++ ", init)."])
+    ],
+    InitRunnerVariants = [
+        undefined,
+        {erlang, send, [ProcName, 1]},
+        fun () -> erlang:send(ProcName, 1) end,
+        fun (init) -> erlang:send(ProcName, 1) end,
+        "erlang:send(" ++ ProcStr ++ ", 1).",
+        lists:concat(["init_runner() ->", Sep, "erlang:send(" ++ ProcStr ++ ", 1)."]),
+        lists:concat(["init_runner(init) ->", Sep, "erlang:send(" ++ ProcStr ++ ", 1)."]),
+        lists:concat(["bar(init) ->", Sep, "erlang:send(" ++ ProcStr ++ ", 1)."])
+    ],
+    DoneVariants = [
+        undefined,
+        {erlang, send, [ProcName, done]},
+        fun () -> erlang:send(ProcName, done) end,
+        fun (init) -> erlang:send(ProcName, done) end,
+        lists:concat(["erlang:send(" ++ ProcStr ++ ", done)."]),
+        lists:concat(["done() -> erlang:send(" ++ ProcStr ++ ", done)."]),
+        lists:concat(["done(init) -> erlang:send(" ++ ProcStr ++ ", done)."]),
+        lists:concat(["buzz(init) -> erlang:send(" ++ ProcStr ++ ", done)."])
+    ],
+    %% try all variants
+    Variants = [#{init => Init, init_runner => InitRunner, runner => Runner, done => Done}
+        || Init <- InitVariants, InitRunner <- InitRunnerVariants, Done <- DoneVariants],
+    %% filter "undefined" entries from the map
+    Maps = [maps:filter(fun (_Key, Value) -> Value =/= undefined end, Variant)
+        || Variant <- Variants],
+
+    %% generate code for each variant and measure performance
+    [measure_variant(Variant) || Variant <- Maps].
+
+measure_variant(Code) ->
+    try
+        ct:log("Source: ~200p~n", [Code]),
+        {ok, Job} = erlperf_job:start_link(Code),
+        %% must not start the /1 runners
+        %%
+        Sampler = erlperf_job:handle(Job),
+        is_map_key(init, Code) andalso receive init -> ok after 1000 -> throw({init, timeout}) end,
+        %% ensure it does not crash attempting to do a single measurement
+        TimeUs = erlperf_job:measure(Job, 10),
+        ?assert(TimeUs > 0),
+        %%
+        ok = erlperf_job:set_concurrency(Job, 1),
+        %% wait for 1 job to get started
+        is_map_key(init_runner, Code) andalso receive 1 -> ok after 1000 -> throw({init_runner, timeout}) end,
+        %% fetch a single 50 ms sample
+        timer:sleep(50),
+        Sample = erlperf_job:sample(Sampler),
+        %% stop the job
+        gen:stop(Job),
+        is_map_key(done, Code) andalso receive done -> ok after 1000 -> throw({done, timeout}) end,
+        %% timer:sleep(1) must have run more than 10 and fewer than 51 times in the 50 ms window
+        ?assert(Sample > 10 andalso Sample < 51, {sample, Sample})
+    catch error:{generate, {What, Arity, requires, Dependency}} ->
+        %% verify this combination is indeed invalid
+        ?assertNot(is_map_key(Dependency, Code)),
+        ?assert((What =:= init_runner andalso Arity =:= 1) orelse (What =:= runner andalso Arity > 0)
+            orelse (What =:= done andalso Arity =:= 1))
+        %% io:format(user, "Invalid combination: ~s/~b requires ~s~n~n", [What, Arity, Dependency])
+    end.
diff --git a/test/erlperf_monitor_SUITE.erl b/test/erlperf_monitor_SUITE.erl
new file mode 100644
index 0000000..029bed2
--- /dev/null
+++ b/test/erlperf_monitor_SUITE.erl
@@ -0,0 +1,61 @@
+%%%-------------------------------------------------------------------
+%%% @author Maxim Fedorov
+%%% @copyright (c) 2019-2022 Maxim Fedorov
+%%% @doc
+%%% Tests the monitor
+%%% @end
+
+-module(erlperf_monitor_SUITE).
+
+-include_lib("stdlib/include/assert.hrl").
+
+%% Test server callbacks
+-export([
+    suite/0,
+    all/0
+]).
+
+%% Test cases
+-export([
+    subscribe/0, subscribe/1
+]).
+
+suite() ->
+    [{timetrap, {seconds, 10}}].
+
+all() ->
+    [subscribe].
+
+%%--------------------------------------------------------------------
+%% TEST CASES
+
+subscribe() ->
+    [{doc, "Tests monitoring subscription"}].
+
+subscribe(_Config) ->
+    {ok, Mon} = erlperf_sup:start_link(), %% instead of starting the app
+    ok = pg:join(erlperf, {erlperf_monitor, node()}, self()),
+    % start a benchmark job running on 4 concurrent processes
+    {ok, Job} = erlperf_job:start_link(#{runner => {timer, sleep, [10]}}),
+    ok = erlperf_job:set_concurrency(Job, 4),
+    ?assertEqual(4, erlperf_job:concurrency(Job)),
+    % receive 2 monitor updates (roughly 2 seconds)
+    First = receive_updates(Job, 0, 2),
+    ok = erlperf_job:set_concurrency(Job, 2),
+    Second = receive_updates(Job, 0, 1),
+    ok = gen_server:stop(Job),
+    ?assert(First > 0),
+    ?assert(Second > 0),
+    ?assert(First > Second),
+    pg:leave(erlperf, {erlperf_monitor, node()}, self()),
+    gen:stop(Mon).
+
+receive_updates(_, Total, 0) ->
+    Total;
+receive_updates(Job, Total, Count) ->
+    receive
+        #{jobs := [{Job, Cycles}]} ->
+            receive_updates(Job, Total + Cycles, Count - 1);
+        Other ->
+            ?assertEqual([], Other)
+    end.
diff --git a/test/test_helpers.erl b/test/test_helpers.erl
deleted file mode 100644
index 3a85572..0000000
--- a/test/test_helpers.erl
+++ /dev/null
@@ -1,59 +0,0 @@
-%%%-------------------------------------------------------------------
-%%% @author Maxim Fedorov
-%%% @copyright (c) 2019 Maxim Fedorov
-%%% @doc
-%%% Helpers: cluster/app sentinels.
-%%% @end
-
--module(test_helpers).
--author("maximfca@gmail.com").
-
-%% API
--export([
-    ensure_distributed/1,
-    ensure_started/2,
-    ensure_stopped/1,
-    maybe_undistribute/1
-]).
-
-ensure_started(App, Config) ->
-    Apps = proplists:get_value(apps, Config, []),
-    case lists:member(App, Apps) of
-        true ->
-            Config;
-        false ->
-            {ok, New} = application:ensure_all_started(App),
-            lists:keystore(apps, 1, Config, {apps, New ++ Apps})
-    end.
-
-ensure_stopped(Config) ->
-    [ok = application:stop(App) || App <- proplists:get_value(apps, Config, [])],
-    proplists:delete(apps, Config).
-
-ensure_distributed(Config) ->
-    case proplists:get_value(distribution, Config) of
-        undefined ->
-            case erlang:is_alive() of
-                false ->
-                    % verify epmd running (otherwise next call fails)
-                    (erl_epmd:names("localhost") =:= {error, address}) andalso
-                        ([] = os:cmd("epmd -daemon")),
-                    % start machine@localhost (avoiding inet:gethostname() as it fails to resolve in DNS sometimes)
-                    NodeName = list_to_atom(atom_to_list(?MODULE) ++ "@localhost"),
-                    {ok, Pid} = net_kernel:start([NodeName, shortnames]),
-                    [{distribution, Pid} | Config];
-                true ->
-                    Config
-            end;
-        Pid when is_pid(Pid) ->
-            Config
-    end.
-
-maybe_undistribute(Config) ->
-    case proplists:get_value(distribution, Config) of
-        undefined ->
-            Config;
-        Pid when is_pid(Pid) ->
-            net_kernel:stop(),
-            proplists:delete(distribution, Config)
-    end.