diff --git a/ChangeLog b/ChangeLog index 89aedf2309..3806939a5d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -21,6 +21,7 @@ New features: * Add starpu_data_register_victim_selector to let schedulers select eviction victims. * Add bus performance model for HIP driver. + * New scheduler darts (Data-Aware Reactive Task Scheduling) Small features: * Add FXT option -use-task-color to propagate the specified task diff --git a/configure.ac b/configure.ac index e3dabe520f..bfb3e4524a 100644 --- a/configure.ac +++ b/configure.ac @@ -4160,6 +4160,39 @@ AC_SUBST(STARPU_LIB_PATH, $(eval echo ${prefix}/lib)) AC_SUBST(STARPU_MODULE_LIBS, "$module_libs") AC_SUBST(STARPU_OPTION_LIBS, "$option_libs") +############################################################################### +# # +# DARTS settings # +# # +############################################################################### + +AC_MSG_CHECKING(whether DARTS debug messages should be displayed) +AC_ARG_ENABLE(darts-verbose, [AS_HELP_STRING([--enable-darts-verbose], + [display DARTS verbose debug messages])], + enable_darts_verbose=$enableval, enable_darts_verbose=no) +AC_MSG_RESULT($enable_darts_verbose) +if test x$enable_darts_verbose = xyes; then + AC_DEFINE(STARPU_DARTS_VERBOSE, [1], [display DARTS verbose debug messages]) +fi + +AC_MSG_CHECKING(whether DARTS statistics should be enabled) +AC_ARG_ENABLE(darts-stats, [AS_HELP_STRING([--enable-darts-stats], + [enable DARTS statistics])], + enable_darts_stats=$enableval, enable_darts_stats=no) +AC_MSG_RESULT($enable_darts_stats) +if test x$enable_darts_stats = xyes; then + AC_DEFINE(STARPU_DARTS_STATS, [1], [enable DARTS statistics]) +fi + +AC_MSG_CHECKING(whether DARTS linear mutex should be used) +AC_ARG_ENABLE(darts-linear-mutex, [AS_HELP_STRING([--enable-darts-linear-mutex], + [enable DARTS linear mutex])], + enable_darts_linear_mutex=$enableval, enable_darts_linear_mutex=no) +AC_MSG_RESULT($enable_darts_linear_mutex) +if test x$enable_darts_linear_mutex = xyes; then + AC_DEFINE(STARPU_DARTS_LINEAR_MUTEX, [1], [enable DARTS linear mutex]) +fi + ############################################################################### # # # Final settings # diff --git a/doc/doxygen/Makefile.am b/doc/doxygen/Makefile.am index 2efa490fd7..d1b91cc9dd 100644 --- a/doc/doxygen/Makefile.am +++ b/doc/doxygen/Makefile.am @@ -78,6 +78,7 @@ chapters = \ chapters/starpu_extensions/socl_opencl_extensions.doxy \ chapters/starpu_extensions/bubble.doxy \ chapters/starpu_extensions/parallel_worker.doxy \ + chapters/starpu_extensions/darts.doxy \ chapters/starpu_extensions/interoperability.doxy \ chapters/starpu_extensions/scheduling_policy_definition.doxy \ chapters/starpu_extensions/simgrid.doxy \ diff --git a/doc/doxygen/chapters/starpu_extensions/darts.doxy b/doc/doxygen/chapters/starpu_extensions/darts.doxy new file mode 100644 index 0000000000..b230d88fcf --- /dev/null +++ b/doc/doxygen/chapters/starpu_extensions/darts.doxy @@ -0,0 +1,140 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2009-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*! \page DARTS Data-aware Scheduler and Visualization Tool
+
+\section DARTS_Scheduling_Policy Overview
+
+DARTS is a research scheduler designed to address memory constraints.
+A study of its results is available as a conference paper: https://ieeexplore.ieee.org/abstract/document/9820704
+A further study is available as a preprint: https://inria.hal.science/hal-04146714v1
+
+\subsection darts_purpose Purpose
+
+DARTS (for Data-Aware Reactive Task Scheduling) is a scheduling policy that aims to achieve good performance under memory constraints.
+DARTS looks for the "best" data, that is, the data with the smallest ratio of transfer time to the amount of computation it makes available without requiring any additional data load.
+DARTS then computes all the tasks that can be processed using this "best" data and the data already loaded into memory.
+If no data allows at least one task to be computed without an additional load, the highest-priority task is scheduled next.
+DARTS can be used with or without a memory constraint.
+
+\subsection darts_features Features
+
+DARTS has been tested on the outer product, GEMM, and the Cholesky and LU factorizations.
+These applications are typically launched as follows:
+\verbatim
+./examples/cholesky/cholesky_implicit -size $((block_size*N)) -nblocks $((N)) -niter 1
+./examples/mult/sgemm -xy $((block_size*N)) -nblocks $((N)) -iter 1
+./examples/mult/sgemm -xyz $((block_size*N)) -nblocks $((N)) -nblocksz $((N)) -iter 1
+./examples/lu/lu_implicit_example_float -size $((block_size*N)) -nblocks $((N)) -iter 1
+\endverbatim
+In theory, DARTS can be used with any task-based application.
+
+\section darts_best_practices Best Practices
+
+For best performance, it is highly recommended to use only GPUs, that is, to set the variables \ref STARPU_NOPENCL and \ref STARPU_NCPU to 0.
+
+If the application does not use dependencies (such as the outer product), use the following environment variables:
+\verbatim
+STARPU_DARTS_DEPENDANCES=0
+STARPU_DARTS_PRIO=0
+\endverbatim
+
+For example, a set of parameters that achieves the best performance with DARTS is:
+\verbatim
+STARPU_SCHED_READY=1 STARPU_SIMGRID_CUDA_MALLOC_COST=0 STARPU_EXPECTED_TRANSFER_TIME_WRITEBACK=0 STARPU_SCHED=darts STARPU_NTASKS_THRESHOLD=30 STARPU_CUDA_PIPELINE=5 STARPU_MINIMUM_CLEAN_BUFFERS=0 STARPU_TARGET_CLEAN_BUFFERS=0 STARPU_NCPU=0 STARPU_NCUDA=$((NGPU)) STARPU_NOPENCL=0 ${APPLICATION}
+\endverbatim
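+
+The environment variables above can also be set from the application itself, since StarPU reads them when starpu_init() is called. The sketch below is only illustrative: the 2048 MB limit passed to \c STARPU_LIMIT_CUDA_MEM is an arbitrary value used here to emulate a memory constraint, and setting the same variables on the command line, as in the example above, is equivalent.
+\code{.c}
+#include <stdlib.h>
+#include <starpu.h>
+
+int main(void)
+{
+	/* Illustrative values only: select the darts policy and cap each CUDA
+	 * memory node to 2048 MB before StarPU reads the environment. */
+	setenv("STARPU_SCHED", "darts", 1);
+	setenv("STARPU_LIMIT_CUDA_MEM", "2048", 1);
+
+	int ret = starpu_init(NULL);
+	if (ret != 0)
+		return 1;
+
+	/* ... register data and submit tasks as usual ... */
+
+	starpu_shutdown();
+	return 0;
+}
+\endcode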
+
+\section DARTS_Building_Visualizations Building Visualizations
+
+DARTS also comes with a visualization tool that plots the order in which each processing unit processes the task set of a matrix multiplication or a Cholesky factorization.
+The files that make up the visualization are located in the directory \c tools/darts/.
+The visualizations only work for GEMM, the outer product, and the Cholesky factorization when using only GPUs.
+
+\subsection darts_visu_Configuration Configuration
+
+The required configure options are \c --enable-darts-stats and \c --enable-darts-verbose.
+
+\subsection darts_visu_launch Launch Options
+
+Add the following environment variables when launching the application:
+
+
+
+If your target application is Cholesky, use -niter 1.
+If your target application is GEMM or the outer product, use -iter 1.
+
+An example of launch options for the outer product is:
+\verbatim
+STARPU_SCHED_OUTPUT=${OUTPUT_PATH} STARPU_SCHED=darts PRINT_IN_TERMINAL=1 PRINT_N=$((N)) STARPU_NTASKS_THRESHOLD=30 STARPU_CUDA_PIPELINE=5 STARPU_SIMGRID_CUDA_MALLOC_COST=0 STARPU_MINIMUM_CLEAN_BUFFERS=0 STARPU_TARGET_CLEAN_BUFFERS=0 STARPU_NCPU=0 STARPU_NCUDA=$((NGPU)) STARPU_NOPENCL=0 ./examples/mult/sgemm -xy $((block_size*N)) -nblocks $((N)) -iter 1
+python3 ./tools/darts/visualization_darts.py ${N} darts ${NGPU} Matrice_ligne 1 0 ${block_size} ${OUTPUT_PATH}
+\endverbatim
+
+A full example of the command used to build the visualization is available in \c tools/darts/example_script_visualization_darts.sh.
+
+The output visualization is stored in the current folder.
+
+\section More_Scheduler Other research schedulers for memory-aware scheduling
+
+Other memory-constrained schedulers are also available for experimental purposes. Note that they only work with GPUs, and only on GEMM and the outer product.
+
+
+
+*/
diff --git a/doc/doxygen/doxygen-config.cfg.in b/doc/doxygen/doxygen-config.cfg.in
index d036ed578b..367ee4f5d9 100644
--- a/doc/doxygen/doxygen-config.cfg.in
+++ b/doc/doxygen/doxygen-config.cfg.in
@@ -64,6 +64,7 @@ INPUT = @top_srcdir@/doc/doxygen/chapters/starpu_introduction/i
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/socl_opencl_extensions.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/bubble.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/parallel_worker.doxy \
+ @top_srcdir@/doc/doxygen/chapters/starpu_extensions/darts.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/interoperability.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/simgrid.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/debugging_tools.doxy \
diff --git a/doc/doxygen/refman.tex b/doc/doxygen/refman.tex
index af0124a78e..77129a084c 100644
--- a/doc/doxygen/refman.tex
+++ b/doc/doxygen/refman.tex
@@ -278,6 +278,11 @@ \chapter{Parallel Workers}
 \hypertarget{ParallelWorker}{}
 \input{ParallelWorker}
+\chapter{Data-aware Scheduler and Visualization Tool}
+\label{Darts}
+\hypertarget{Darts}{}
+\input{Darts}
+
 \chapter{Interoperability Support}
 \label{InteropSupport}
 \hypertarget{InteropSupport}{}
diff --git a/doc/doxygen_web_extensions/Makefile.am b/doc/doxygen_web_extensions/Makefile.am
index be2e9fa4e1..e7d01909a5 100644
--- a/doc/doxygen_web_extensions/Makefile.am
+++ b/doc/doxygen_web_extensions/Makefile.am
@@ -48,6 +48,7 @@ chapters = \
 ../doxygen/chapters/starpu_extensions/socl_opencl_extensions.doxy \
 ../doxygen/chapters/starpu_extensions/bubble.doxy \
 ../doxygen/chapters/starpu_extensions/parallel_worker.doxy \
+ ../doxygen/chapters/starpu_extensions/darts.doxy \
 ../doxygen/chapters/starpu_extensions/interoperability.doxy \
 ../doxygen/chapters/starpu_extensions/scheduling_policy_definition.doxy \
 ../doxygen/chapters/starpu_extensions/simgrid.doxy \
diff --git a/doc/doxygen_web_extensions/doxygen-config.cfg.in b/doc/doxygen_web_extensions/doxygen-config.cfg.in
index 04a478226d..66d8c96570 100644
--- a/doc/doxygen_web_extensions/doxygen-config.cfg.in
+++ b/doc/doxygen_web_extensions/doxygen-config.cfg.in
@@ -34,6 +34,7 @@ INPUT = @top_srcdir@/doc/doxygen/chapters/starpu_extensions/ext
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/socl_opencl_extensions.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/bubble.doxy \
 @top_srcdir@/doc/doxygen/chapters/starpu_extensions/parallel_worker.doxy \
+
@top_srcdir@/doc/doxygen/chapters/starpu_extensions/darts.doxy \ @top_srcdir@/doc/doxygen/chapters/starpu_extensions/interoperability.doxy \ @top_srcdir@/doc/doxygen/chapters/starpu_extensions/simgrid.doxy \ @top_srcdir@/doc/doxygen/chapters/starpu_extensions/debugging_tools.doxy \ diff --git a/doc/doxygen_web_extensions/refman.tex b/doc/doxygen_web_extensions/refman.tex index 6dcc507f31..2be91334be 100644 --- a/doc/doxygen_web_extensions/refman.tex +++ b/doc/doxygen_web_extensions/refman.tex @@ -113,6 +113,11 @@ \chapter{Parallel Workers} \hypertarget{ParallelWorker}{} \input{ParallelWorker} +\chapter{Data-aware Scheduler and Visualization Tool} +\label{Darts} +\hypertarget{Darts}{} +\input{Darts} + \chapter{Interoperability Support} \label{InteropSupport} \hypertarget{InteropSupport}{} diff --git a/src/Makefile.am b/src/Makefile.am index 226621b77d..d68ae3bad2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -164,7 +164,10 @@ noinst_HEADERS = \ util/starpu_task_insert_utils.h \ util/starpu_data_cpy.h \ sched_policies/prio_deque.h \ - sched_policies/sched_component.h + sched_policies/sched_component.h \ + sched_policies/darts.h \ + sched_policies/HFP.h \ + sched_policies/sched_visu.h libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = \ common/barrier.c \ @@ -314,6 +317,12 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = \ sched_policies/modular_eager_prio.c \ sched_policies/modular_eager_prefetching.c \ sched_policies/modular_gemm.c \ + sched_policies/random_order.c \ + sched_policies/mst_policy.c \ + sched_policies/HFP.c \ + sched_policies/sched_visu.c \ + sched_policies/darts.c \ + sched_policies/cuthillmckee_policy.c \ sched_policies/modular_prio.c \ sched_policies/modular_prio_prefetching.c \ sched_policies/modular_random.c \ diff --git a/src/core/sched_policy.c b/src/core/sched_policy.c index 59e9e6edef..a9f6f8ff6a 100644 --- a/src/core/sched_policy.c +++ b/src/core/sched_policy.c @@ -26,6 +26,7 @@ #include #include #include +#include #ifdef HAVE_DLOPEN #include @@ -45,6 +46,7 @@ static const char *sched_lib = NULL; void _starpu_sched_init(void) { + _starpu_visu_init(); _starpu_task_break_on_push = starpu_getenv_number_default("STARPU_TASK_BREAK_ON_PUSH", -1); _starpu_task_break_on_sched = starpu_getenv_number_default("STARPU_TASK_BREAK_ON_SCHED", -1); _starpu_task_break_on_pop = starpu_getenv_number_default("STARPU_TASK_BREAK_ON_POP", -1); @@ -101,6 +103,12 @@ static struct starpu_sched_policy *predefined_policies[] = static struct starpu_sched_policy *predefined_policies_non_default[] = { + &_starpu_sched_darts_policy, + &_starpu_sched_random_order_policy, + &_starpu_sched_HFP_policy, + &_starpu_sched_modular_heft_HFP_policy, + &_starpu_sched_mst_policy, + &_starpu_sched_cuthillmckee_policy, NULL }; @@ -1194,7 +1202,9 @@ struct starpu_task *_starpu_pop_task(struct _starpu_worker *worker) task->prologue_callback_pop_func(task->prologue_callback_pop_arg); _starpu_set_current_task(NULL); } - + + _sched_visu_pop_ready_task(task); + return task; } @@ -1202,6 +1212,9 @@ void _starpu_sched_pre_exec_hook(struct starpu_task *task) { unsigned sched_ctx_id = starpu_sched_ctx_get_ctx_for_task(task); struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id); + + _sched_visu_get_current_tasks_for_visualization(task, sched_ctx_id); + if (sched_ctx->sched_policy && sched_ctx->sched_policy->pre_exec_hook) { _STARPU_SCHED_BEGIN; diff --git a/src/core/sched_policy.h b/src/core/sched_policy.h index d29d77c730..17f8d9fa09 100644 --- a/src/core/sched_policy.h +++ 
b/src/core/sched_policy.h @@ -94,6 +94,12 @@ extern struct starpu_sched_policy _starpu_sched_modular_eager_policy; extern struct starpu_sched_policy _starpu_sched_modular_eager_prefetching_policy; extern struct starpu_sched_policy _starpu_sched_modular_eager_prio_policy; extern struct starpu_sched_policy _starpu_sched_modular_gemm_policy; +extern struct starpu_sched_policy _starpu_sched_darts_policy; +extern struct starpu_sched_policy _starpu_sched_random_order_policy; +extern struct starpu_sched_policy _starpu_sched_HFP_policy; +extern struct starpu_sched_policy _starpu_sched_modular_heft_HFP_policy; +extern struct starpu_sched_policy _starpu_sched_mst_policy; +extern struct starpu_sched_policy _starpu_sched_cuthillmckee_policy; extern struct starpu_sched_policy _starpu_sched_modular_prio_policy; extern struct starpu_sched_policy _starpu_sched_modular_prio_prefetching_policy; extern struct starpu_sched_policy _starpu_sched_modular_random_policy; diff --git a/src/sched_policies/HFP.c b/src/sched_policies/HFP.c new file mode 100644 index 0000000000..148809d9b8 --- /dev/null +++ b/src/sched_policies/HFP.c @@ -0,0 +1,4447 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int order_u; +static int multigpu; +static int modular_heft_hfp_mode; +static int task_stealing; +static int interlacing; +static int faster_first_iteration; +static starpu_pthread_mutex_t HFP_mutex; +static int belady; +static int hmetis_n; +static int number_task_out; + +int _starpu_HFP_hmetis; +static int sparse_matrix; + +static int _nb_gpus; +const char* _starpu_HFP_appli; +int _starpu_HFP_NT; +int _starpu_HFP_N; +static double EXPECTED_TIME; +starpu_ssize_t _starpu_HFP_GPU_RAM_M; +bool _starpu_HFP_do_schedule_done; + +/* Variables used for Belady in xgemm.c and HFP.c and mst.c */ +struct starpu_sched_policy _starpu_sched_HFP_policy; +struct starpu_sched_policy _starpu_sched_modular_heft_HFP_policy; + +/* Other environmment variable you should use with HFP: + * STARPU_NTASKS_THRESHOLD=30 ou 10 si on veut moins entrer dans victim_selector peut_être + * STARPU_MINIMUM_CLEAN_BUFFERS=0 + * STARPU_TARGET_CLEAN_BUFFERS=0 + * STARPU_CUDA_PIPELINE=4 + * STARPU_NCPU=0 + * STARPU_NOPENCL=0 + * RANDOM_TASK_ORDER + * RECURSIVE_MATRIX_LAYOUT + * RANDOM_DATA_ACCESS + * STARPU_SCHED_READY=1 + * STARPU_CUDA_PIPELINE=5 + * STARPU_NTASKS_THRESHOLD=10 + */ + +static int _get_number_GPU() +{ + int return_value = starpu_memory_nodes_get_count_by_kind(STARPU_CUDA_RAM); + + if (return_value == 0) /* We are not using GPUs so we are in an out-of-core case using CPUs. Need to return 1. 
If I want to deal with GPUs AND CPUs we need to adpt this function to return NGPU + 1 */ + { + return 1; + } + + return return_value; +} + +void _starpu_visu_init() +{ + _nb_gpus = _get_number_GPU(); + _sched_visu_init(_nb_gpus); + + _starpu_HFP_do_schedule_done = false; +} + +/* Used only for visualisation of non-HFP schedulers in python */ +void _starpu_HFP_initialize_global_variable(struct starpu_task *task) +{ + _starpu_HFP_N = _print_n; + _nb_gpus = _get_number_GPU(); + + /* Getting the total number of tasks */ + if(_print3d == 0) /* 2D */ + { + _starpu_HFP_NT = _starpu_HFP_N*_starpu_HFP_N; + } + else if(_print3d == 1) /* Z = 4 here */ + { + _starpu_HFP_NT = _starpu_HFP_N*_starpu_HFP_N*4; + } + else if(_print3d == 2) /* It means that Z = N */ + { + _starpu_HFP_NT = _starpu_HFP_N*_starpu_HFP_N*_starpu_HFP_N; + } + + _starpu_HFP_appli = starpu_task_get_name(task); +} + +/* Put a link at the beginning of the linked list */ +void _starpu_HFP_insertion(struct _starpu_HFP_paquets *a) +{ + struct _starpu_HFP_my_list *new = malloc(sizeof(*new)); /* Creation of a new link */ + starpu_task_list_init(&new->sub_list); + new->next = a->temp_pointer_1; + new->nb_task_in_sub_list = 0; + new->expected_time_pulled_out = 0; + new->expected_time = 0; + new->expected_package_computation_time = 0; + new->data_weight = 0; + new->data_to_evict_next = NULL; + starpu_task_list_init(&new->refused_fifo_list); + a->temp_pointer_1 = new; +} + +/* Delete all the empty packages */ +static struct _starpu_HFP_my_list* HFP_delete_link(struct _starpu_HFP_paquets* a) +{ + while (a->first_link != NULL && a->first_link->package_nb_data == 0) + { + a->temp_pointer_1 = a->first_link; + a->first_link = a->first_link->next; + free(a->temp_pointer_1); + } + if (a->first_link != NULL) + { + a->temp_pointer_2 = a->first_link; + a->temp_pointer_3 = a->first_link->next; + while (a->temp_pointer_3 != NULL) + { + while (a->temp_pointer_3 != NULL && a->temp_pointer_3->package_nb_data == 0) + { + a->temp_pointer_1 = a->temp_pointer_3; + a->temp_pointer_3 = a->temp_pointer_3->next; + a->temp_pointer_2->next = a->temp_pointer_3; + free(a->temp_pointer_1); + } + if (a->temp_pointer_3 != NULL) + { + a->temp_pointer_2 = a->temp_pointer_3; + a->temp_pointer_3 = a->temp_pointer_3->next; + } + } + } + return a->first_link; +} + +/* Reverse the order of task in a package for order U */ +static struct _starpu_HFP_my_list* HFP_reverse_sub_list(struct _starpu_HFP_my_list *a) +{ + struct starpu_task_list b; + starpu_task_list_init(&b); + while (!starpu_task_list_empty(&a->sub_list)) + { + starpu_task_list_push_front(&b,starpu_task_list_pop_front(&a->sub_list)); + } + while (!starpu_task_list_empty(&b)) + { + starpu_task_list_push_back(&a->sub_list,starpu_task_list_pop_front(&b)); + } + return a; +} + +/* Print for each GPU the order of processing of each data */ +//static void print_next_use_each_data(struct _starpu_HFP_paquets* a) +//{ +// a->temp_pointer_1 = a->first_link; +// struct starpu_task *task = NULL; +// int current_gpu = 0; +// struct _starpu_HFP_next_use_by_gpu *c = _starpu_HFP_next_use_by_gpu_new(); +// while (a->temp_pointer_1 != NULL) +// { +// printf("Pour le GPU %d.\n", current_gpu); +// for (task = starpu_task_list_begin(&a->temp_pointer_1->sub_list); task != starpu_task_list_end(&a->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) +// { +// printf("Task %p :", task); +// unsigned i; +// for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) +// { +// printf(" %p", STARPU_TASK_GET_HANDLE(task, i)); +// struct 
_starpu_HFP_next_use *b = STARPU_TASK_GET_HANDLE(task, i)->sched_data; +// for (c = _starpu_HFP_next_use_by_gpu_list_begin(b->next_use_tab[current_gpu]); c != _starpu_HFP_next_use_by_gpu_list_end(b->next_use_tab[current_gpu]); c = _starpu_HFP_next_use_by_gpu_list_next(c)) +// { +// printf("->%d", c->value_next_use); +// } +// printf(" |"); +// } +// printf("\n-----\n"); +// } +// a->temp_pointer_1 = a->temp_pointer_1->next; +// current_gpu++; +// } +//} + +/* TODO a suppr */ +//~ struct timeval time_start_getorderbelady; +//~ struct timeval time_end_getorderbelady; +//~ long long time_total_getorderbelady = 0; + +/* Utile pour printing mais surtout pour l'itération 1 plus rapide */ +static int iteration; + +/* Read the tasks's order and each time it se a data, it add a value of it's next use in the task list. + * Then in the post_exec_hook we pop the value of the handles of the task processed. In belady we just look at these value + * for each data on node and evict the one with the furthest first value. + * TODO : A noer/dire que si ready modifie l'ordre et bien les pop de valuers dans le post exec hook + * ne sont plus exacts. mis bon cela ne devrait pas trop impacter les performances. */ +static void get_ordre_utilisation_donnee(struct _starpu_HFP_paquets* a, int nb_gpu) +{ + //~ gettimeofday(&time_start_getorderbelady, NULL); + + struct starpu_task *task = NULL; + a->temp_pointer_1 = a->first_link; + int current_gpu = 0; + int j = 0; + int compteur = 0; + struct _starpu_HFP_next_use *b = NULL; + + while (a->temp_pointer_1 != NULL) + { + for (task = starpu_task_list_begin(&a->temp_pointer_1->sub_list); task != starpu_task_list_end(&a->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) + { + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + compteur++; + struct _starpu_HFP_next_use_by_gpu *c = _starpu_HFP_next_use_by_gpu_new(); + c->value_next_use = compteur; + if (STARPU_TASK_GET_HANDLE(task, i)->sched_data == NULL) /* If it's empty I create the list in the handle */ + { + /* J'initialise à vide la liste pour chaque case du tableau */ + b = malloc(sizeof(*b)); + b->next_use_tab = malloc(sizeof(*b->next_use_tab)); + for (j = 0; j < nb_gpu; j++) + { + b->next_use_tab[j] = _starpu_HFP_next_use_by_gpu_list_new(); + } + _starpu_HFP_next_use_by_gpu_list_push_back(b->next_use_tab[current_gpu], c); + STARPU_TASK_GET_HANDLE(task, i)->sched_data = b; + } + else /* Else I just add a new int */ + { + b = STARPU_TASK_GET_HANDLE(task, i)->sched_data; + _starpu_HFP_next_use_by_gpu_list_push_back(b->next_use_tab[current_gpu], c); + STARPU_TASK_GET_HANDLE(task, i)->sched_data = b; + } + } + } + current_gpu++; + a->temp_pointer_1 = a->temp_pointer_1->next; + compteur = 0; + } + //~ gettimeofday(&time_end_getorderbelady, NULL); + //~ time_total_getorderbelady += (time_end_getorderbelady.tv_sec - time_start_getorderbelady.tv_sec)*1000000LL + time_end_getorderbelady.tv_usec - time_start_getorderbelady.tv_usec; +} + +/* TODO a suppr */ +//~ struct timeval time_start_getcommondataorderu; +//~ struct timeval time_end_getcommondataorderu; +//~ long long time_total_getcommondataorderu = 0; + +/* For order U. 
Return the number of common data of each sub package when merging I and J */ +static int get_common_data_last_package(struct _starpu_HFP_my_list *I, struct _starpu_HFP_my_list *J, int evaluation_I, int evaluation_J, bool IJ_inferieur_GPU_RAM, starpu_ssize_t GPU_RAM_M) +{ + //~ gettimeofday(&time_start_getcommondataorderu, NULL); + + int split_ij = 0; + /* evaluation: 0 = tout, 1 = début, 2 = fin */ + struct starpu_task *task = NULL; + bool insertion_ok = false; + bool donnee_deja_presente = false; + int j = 0; + int common_data_last_package = 0; + long int poids_tache_en_cours = 0; + long int poids = 0; + int index_tab_donnee_I = 0; + int index_tab_donnee_J = 0; + int parcours_liste = 0; + int i_bis = 0; + starpu_data_handle_t * donnee_I = NULL; + starpu_data_handle_t * donnee_J = NULL; + + if (strcmp(_starpu_HFP_appli, "chol_model_11") == 0) + { + donnee_J = calloc((J->package_nb_data*1.5), sizeof(J->package_data[0])); + donnee_I = malloc((I->package_nb_data*1.5) * sizeof(I->package_data[0])); + } + else + { + donnee_J = calloc((J->package_nb_data), sizeof(J->package_data[0])); + donnee_I = malloc((I->package_nb_data) * sizeof(I->package_data[0])); + } + + if (evaluation_I == 0) + { + int i; + for (i = 0; i < I->package_nb_data; i++) + { + donnee_I[i] = I->package_data[i]; + } + index_tab_donnee_I = I->package_nb_data; + } + else if (evaluation_I == 1 && IJ_inferieur_GPU_RAM == false) + { + poids = 0; insertion_ok = false; + task = starpu_task_list_begin(&I->sub_list); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_I[i] = STARPU_TASK_GET_HANDLE(task, i); + poids += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task, i)); + } + index_tab_donnee_I = STARPU_TASK_GET_NBUFFERS(task); + while(1) + { + task = starpu_task_list_next(task); + if (task == NULL) + { + break; + } + poids_tache_en_cours = 0; + starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(I->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < I->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_I[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + poids_tache_en_cours += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task, i)); + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task, i); + } + } + if (poids + poids_tache_en_cours <= GPU_RAM_M) + { + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_I[index_tab_donnee_I] = tab_tache_en_cours[i]; + index_tab_donnee_I++; + insertion_ok = true; + } + } + if (insertion_ok == true) + { + poids += poids_tache_en_cours; + } + insertion_ok = false; + } + else + { + break; + } + } + } + else if (evaluation_I == 2 && IJ_inferieur_GPU_RAM == false) + { + poids = 0; + i_bis = 1; insertion_ok = false; + task = starpu_task_list_begin(&I->sub_list); + while(starpu_task_list_next(task) != NULL) + { + task = starpu_task_list_next(task); + } + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_I[i] = STARPU_TASK_GET_HANDLE(task,i); + poids += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + } + index_tab_donnee_I = STARPU_TASK_GET_NBUFFERS(task); + while(1) + { + i_bis++; + task = starpu_task_list_begin(&I->sub_list); + for (parcours_liste = I->nb_task_in_sub_list - i_bis; parcours_liste > 0; 
parcours_liste--) + { + task = starpu_task_list_next(task); + } + poids_tache_en_cours = 0; + starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(I->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < I->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_I[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + poids_tache_en_cours += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task,i); + } + } + if (poids + poids_tache_en_cours <= GPU_RAM_M) + { + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_I[index_tab_donnee_I] = tab_tache_en_cours[i]; + index_tab_donnee_I++; + insertion_ok = true; + } + } + if (insertion_ok == true) + { + poids += poids_tache_en_cours; + } + insertion_ok = false; + } + else + { + break; + } + } + } + else if (IJ_inferieur_GPU_RAM == true) + { + if (evaluation_I == 0) + { + printf("Error evaluation de I alors que I et J <= GPU_RAM\n"); + exit(0); + } + if (evaluation_I == 2) + { + split_ij = I->nb_task_in_sub_list - I->split_last_ij + 1; + } + else + { + split_ij = I->split_last_ij + 1; + } + task = starpu_task_list_begin(&I->sub_list); + if (evaluation_I == 2) + { + while(starpu_task_list_next(task) != NULL) + { + task = starpu_task_list_next(task); + } + } + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_I[i] = STARPU_TASK_GET_HANDLE(task,i); + } + index_tab_donnee_I = STARPU_TASK_GET_NBUFFERS(task); + for (i_bis = 2; i_bis < split_ij; i_bis++) + { + if (evaluation_I == 2) + { + task = starpu_task_list_begin(&I->sub_list); + for (parcours_liste = I->nb_task_in_sub_list - i_bis; parcours_liste > 0; parcours_liste--) + { + task = starpu_task_list_next(task); + } + } + else + { + task = starpu_task_list_next(task); + } + starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(I->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < I->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_I[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task,i); + } + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_I[index_tab_donnee_I] = tab_tache_en_cours[i]; + index_tab_donnee_I++; + } + } + } + } + + if (evaluation_J == 0) + { + int i; + for (i = 0; i < J->package_nb_data; i++) + { + donnee_J[i] = J->package_data[i]; + } + index_tab_donnee_J = J->package_nb_data; + } + else if (evaluation_J == 1 && IJ_inferieur_GPU_RAM == false) + { + poids = 0; + insertion_ok = false; + task = starpu_task_list_begin(&J->sub_list); + unsigned int i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_J[i] = STARPU_TASK_GET_HANDLE(task,i); + poids += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + } + index_tab_donnee_J = STARPU_TASK_GET_NBUFFERS(task); + while(1) + { + task = starpu_task_list_next(task); + if (task == NULL) + { + break; + } + poids_tache_en_cours = 0; + 
starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(J->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < J->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_J[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + poids_tache_en_cours += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task,i); + } + } + if (poids + poids_tache_en_cours <= GPU_RAM_M) + { + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_J[index_tab_donnee_J] = tab_tache_en_cours[i]; + index_tab_donnee_J++; + insertion_ok = true; + } + } + if (insertion_ok == true) + { + poids += poids_tache_en_cours; + } + insertion_ok = false; + } + else + { + break; + } + } + } + else if (evaluation_J == 2 && IJ_inferieur_GPU_RAM == false) + { + poids = 0; + i_bis = 1; insertion_ok = false; + /* Se placer sur la dernière tâche du paquet J */ + task = starpu_task_list_begin(&J->sub_list); + while(starpu_task_list_next(task) != NULL) + { + task = starpu_task_list_next(task); + } + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_J[i] = STARPU_TASK_GET_HANDLE(task,i); + poids += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + } + index_tab_donnee_J = STARPU_TASK_GET_NBUFFERS(task); + while(1) + { + i_bis++; + task = starpu_task_list_begin(&J->sub_list); + for (parcours_liste = J->nb_task_in_sub_list - i_bis; parcours_liste > 0; parcours_liste--) + { + task = starpu_task_list_next(task); + } + poids_tache_en_cours = 0; + starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(J->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < J->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_J[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + poids_tache_en_cours += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task,i); + } + } + if (poids + poids_tache_en_cours <= GPU_RAM_M) + { + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_J[index_tab_donnee_J] = tab_tache_en_cours[i]; + index_tab_donnee_J++; + insertion_ok = true; + } + } + if (insertion_ok == true) + { + poids += poids_tache_en_cours; + } + insertion_ok = false; + } + else + { + break; + } + } + } + else if (IJ_inferieur_GPU_RAM == true) + { + if (evaluation_J == 0) + { + printf("Error evaluation de J alors que I et J <= GPU_RAM\n"); + exit(0); + } + if (evaluation_J == 2) + { + split_ij = J->nb_task_in_sub_list - J->split_last_ij + 1; + } + else + { + split_ij = J->split_last_ij + 1; + } + task = starpu_task_list_begin(&J->sub_list); + if (evaluation_J == 2) + { + while(starpu_task_list_next(task) != NULL) + { + task = starpu_task_list_next(task); + } + } + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_J[i] = STARPU_TASK_GET_HANDLE(task,i); + } + index_tab_donnee_J = STARPU_TASK_GET_NBUFFERS(task); + for (i_bis = 2; i_bis < split_ij; i_bis++) + 
{ + if (evaluation_J == 2) + { + task = starpu_task_list_begin(&J->sub_list); + for (parcours_liste = J->nb_task_in_sub_list - i_bis; parcours_liste > 0; parcours_liste--) + { + task = starpu_task_list_next(task); + } + } + else + { + task = starpu_task_list_next(task); + } + starpu_data_handle_t * tab_tache_en_cours = malloc((STARPU_TASK_GET_NBUFFERS(task)) * sizeof(J->package_data[0])); + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + tab_tache_en_cours[i] = NULL; + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + donnee_deja_presente = false; + for (j = 0; j < J->package_nb_data; j++) + { + if (STARPU_TASK_GET_HANDLE(task,i) == donnee_J[j]) + { + donnee_deja_presente = true; + break; + } + } + if (donnee_deja_presente == false) + { + tab_tache_en_cours[i] = STARPU_TASK_GET_HANDLE(task,i); + } + } + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (tab_tache_en_cours[i] != NULL) + { + donnee_J[index_tab_donnee_J] = tab_tache_en_cours[i]; + index_tab_donnee_J++; + } + } + } + } + int i; + for (i = 0; i < index_tab_donnee_I; i++) + { + for (j = 0; j < index_tab_donnee_J; j++) + { + if (donnee_I[i] == donnee_J[j]) + { + common_data_last_package++; + break; + } + } + } + + //~ gettimeofday(&time_end_getcommondataorderu, NULL); + //~ time_total_getcommondataorderu += (time_end_getcommondataorderu.tv_sec - time_start_getcommondataorderu.tv_sec)*1000000LL + time_end_getcommondataorderu.tv_usec - time_start_getcommondataorderu.tv_usec; + + return common_data_last_package; +} + +/* Comparator used to sort the data of a packages to erase the duplicate in O(n) */ +static int HFP_pointeurComparator(const void *first, const void *second) +{ + return (*(int*)first - *(int*)second); +} + +//TODO : ne fonctionne plus en 3D car le fichier dans le quel j'écrit je met x y z gpu mainteannt et non x y gpu en 3D +//static void visualisation_tache_matrice_format_tex(char *algo) +//{ +// printf("debut visualisation, %d\n", _starpu_HFP_N); +// int i, j, red, green, blue, x, y, gpu, k, ZN; +// int processing_order[_nb_gpus]; /* One for each GPU */ +// for (i = 0; i < _nb_gpus; i++) { processing_order[i] = 0; } +// int size = strlen("Output_maxime/Data_coordinates_order_last_.tex") + strlen(algo) + 1; +// char *path = (char *)malloc(size); +// strcpy(path, "Output_maxime/Data_coordinates_order_last_"); +// strcat(path, algo); +// strcat(path, ".tex"); +// +// printf("coord : %s\n", path); +// +// FILE * fcoordinate_order_last = fopen(path, "w"); +// size = strlen("Output_maxime/Data_coordinates_order_last_.txt") + strlen(algo) + 1; +// path = (char *)malloc(size); +// strcpy(path, "Output_maxime/Data_coordinates_order_last_"); +// strcat(path, algo); +// strcat(path, ".txt"); +// +// printf("input : %s\n", path); +// +// FILE *f_input = fopen(path, "r"); +// fprintf(fcoordinate_order_last,"\\documentclass{article}\\usepackage{color}\\usepackage{fullpage}\\usepackage{colortbl}\\usepackage{caption}\\usepackage{subcaption}\\usepackage{float}\\usepackage{graphics}\n\n\\begin{document}\n\n\\begin{figure}[H]"); +// i = 0; k = 0; +// if (_print3d != 0) /* Printing a 3D matrix, we print 4 tabular because we have nblocksz 4 */ +// { +// if (_print3d == 2) +// { +// ZN = _starpu_HFP_N; +// } +// else +// { +// ZN = 4; +// } +// +// printf("ZN = %d\n", ZN); +// +// int tab_order_1[ZN][_starpu_HFP_N][_starpu_HFP_N]; +// for (k = 0; k < ZN; k++) +// { +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// for (j = 0; j < _starpu_HFP_N; j++) +// { +// tab_order_1[k][i][j] = -1; +// } +// } +// 
} +// int tab_gpu_1[ZN][_starpu_HFP_N][_starpu_HFP_N]; +// for (k = 0; k < ZN; k++) +// { +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// for (j = 0; j < _starpu_HFP_N; j++) +// { +// tab_gpu_1[k][i][j] = -1; +// } +// } +// } +// i = 0; +// if (f_input != NULL && fcoordinate_order_last != NULL) +// { +// while (!feof (f_input)) +// { +// if (fscanf(f_input, "%d %d %d", &x, &y, &gpu) != 3) +// { +// //~ perror("error fscanf in visualisation_tache_matrice_format_tex\n"); exit(EXIT_FAILURE); +// } +// if (tab_order_1[0][x][y] == -1) +// { +// tab_order_1[0][x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// } +// else if (tab_order_1[1][x][y] == -1) +// { +// tab_order_1[1][x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// } +// else if (tab_order_1[2][x][y] == -1) +// { +// tab_order_1[2][x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// } +// else +// { +// tab_order_1[3][x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// } +// if (tab_gpu_1[0][x][y] == -1) +// { +// tab_gpu_1[0][x][y] = gpu; +// } +// else if (tab_gpu_1[1][x][y] == -1) +// { +// tab_gpu_1[1][x][y] = gpu; +// } +// else if (tab_gpu_1[2][x][y] == -1) +// { +// tab_gpu_1[2][x][y] = gpu; +// } +// else +// { +// tab_gpu_1[3][x][y] = gpu; +// } +// i++; +// } +// } +// else +// { +// perror("Impossible d'ouvrir au moins 1 fichier dans visualisation_tache_matrice_format_tex() dans if 3D\n"); +// exit(EXIT_FAILURE); +// } +// printf("ok\n"); +// tab_order_1[3][x][y] = tab_order_1[3][x][y] - 1; +// for (k = 0; k < ZN; k++) +// { +// fprintf(fcoordinate_order_last, "\n\\begin{subfigure}{.5\\textwidth}\\centering\\begin{tabular}{|"); +// for (i = 0; i < _starpu_HFP_N - 1; i++) +// { +// fprintf(fcoordinate_order_last,"c|"); +// } +// fprintf(fcoordinate_order_last,"c|}\n\\hline"); +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// for (j = 0; j < _starpu_HFP_N - 1; j++) +// { +// if (tab_gpu_1[k][j][i] == 0) +// { +// red = 255; green = 255; blue = 255; +// } +// else if (tab_gpu_1[k][j][i] == 6) +// { +// red = 70; green = 130; blue = 180; +// } +// else +// { +// rgb(tab_gpu_1[k][j][i], &red, &green, &blue); +// } +// fprintf(fcoordinate_order_last,"\\cellcolor[RGB]{%d,%d,%d}%d&", red,green,blue, tab_order_1[k][j][i]); +// } +// if (tab_gpu_1[k][j][i] == 0) +// { +// red = 255; green = 255; blue = 255; +// } +// else if (tab_gpu_1[k][j][i] == 6) +// { +// red = 70; green = 130; blue = 180; +// } +// else +// { +// rgb(tab_gpu_1[k][j][i], &red, &green, &blue); +// } +// fprintf(fcoordinate_order_last,"\\cellcolor[RGB]{%d,%d,%d}%d",red,green,blue,tab_order_1[k][j][i]); +// fprintf(fcoordinate_order_last," \\\\"); fprintf(fcoordinate_order_last,"\\hline"); +// } +// fprintf(fcoordinate_order_last, "\\end{tabular}\\caption{Z = %d}\\end{subfigure}\n", k + 1); +// } +// fprintf(fcoordinate_order_last, "\n\\caption{Task's processing order on a 3D matrix}\\end{figure}\n\n\\end{document}"); +// } +// else /* Printing a 2D matrix so only one matrix */ +// { +// fprintf(fcoordinate_order_last, "\n\\centering\\begin{tabular}{|"); +// for (i = 0; i < _starpu_HFP_N - 1; i++) +// { +// fprintf(fcoordinate_order_last,"c|"); +// } +// i = 0; +// fprintf(fcoordinate_order_last,"c|}\n\\hline"); +// int tab_order[_starpu_HFP_N][_starpu_HFP_N]; +// int tab_gpu[_starpu_HFP_N][_starpu_HFP_N]; +// if (f_input != NULL && fcoordinate_order_last != NULL) +// { +// printf("reading, N = %d, NT = %d\n", _starpu_HFP_N, _starpu_HFP_NT); +// while (!feof (f_input)) +// { +// if (fscanf(f_input, "%d %d %d", 
&x, &y, &gpu) != 3) +// { +// //~ perror("error fscanf in visualisation_tache_matrice_format_tex_HEFT\n"); exit(EXIT_FAILURE); +// } +// tab_order[x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// tab_gpu[x][y] = gpu; +// i++; +// } +// } +// else +// { +// perror("Impossible d'ouvrir au moins 1 fichier dans visualisation_tache_matrice_format_tex()\n"); exit(EXIT_FAILURE); +// } +// tab_order[x][y] = tab_order[x][y] - 1; +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// for (j = 0; j < _starpu_HFP_N - 1; j++) +// { +// if (tab_gpu[j][i] == 0) +// { +// red = 255; green = 255; blue = 255; +// } +// else if (tab_gpu[j][i] == 6) +// { +// red = 70; green = 130; blue = 180; +// } +// else +// { +// rgb(tab_gpu[j][i], &red, &green, &blue); +// } +// fprintf(fcoordinate_order_last,"\\cellcolor[RGB]{%d,%d,%d}%d&", red,green,blue, tab_order[j][i]); +// } +// if (tab_gpu[j][i] == 0) +// { +// red = 255; green = 255; blue = 255; +// } +// else if (tab_gpu[j][i] == 6) +// { +// red = 70; green = 130; blue = 180; +// } +// else +// { +// rgb(tab_gpu[j][i], &red, &green, &blue); +// } +// fprintf(fcoordinate_order_last,"\\cellcolor[RGB]{%d,%d,%d}%d",red,green,blue,tab_order[j][i]); +// fprintf(fcoordinate_order_last," \\\\"); fprintf(fcoordinate_order_last,"\\hline"); +// } +// fprintf(fcoordinate_order_last, "\\end{tabular}\n\\caption{Task's processing order}\\end{figure}\n\n\\end{document}"); +// } +// fclose(fcoordinate_order_last); +// fclose(f_input); +// printf("fin visualisation\n"); +//} + +/* To print data order and number of data to load, only for 2D */ +//static void visualisation_tache_matrice_format_tex_with_data_2D() +//{ +// printf("Début de visualisation_tache_matrice_format_tex_with_data_2D()\n"); +// int i, j, red, green, blue, x, y, gpu, data_to_load, tikz_index; +// int processing_order[_nb_gpus]; for (i = 0; i < _nb_gpus; i++) { processing_order[i] = 0; } +// FILE * f_input_data_to_load = fopen("Output_maxime/Data_to_load_SCHEDULER.txt", "r"); +// FILE * f_input_data_coordinate = fopen("Output_maxime/Data_coordinates_order_last_SCHEDULER.txt", "r"); +// FILE * f_output = fopen("Output_maxime/visualisation_matrice_2D.tex", "w"); +// +// fprintf(f_output,"\\documentclass{article}\\usepackage{colortbl,tikz,float,caption}\\makeatletter\\tikzset{hatch distance/.store in=\\hatchdistance,hatch distance=5pt,hatch thickness/.store in=\\hatchthickness,hatch thickness=5pt}\\pgfdeclarepatternformonly[\\hatchdistance,\\hatchthickness]{north east hatch}{\\pgfqpoint{-1pt}{-1pt}}{\\pgfqpoint{\\hatchdistance}{\\hatchdistance}}{\\pgfpoint{\\hatchdistance-1pt}{\\hatchdistance-1pt}}{\\pgfsetcolor{\\tikz@pattern@color}\\pgfsetlinewidth{\\hatchthickness}\\pgfpathmoveto{\\pgfqpoint{0pt}{0pt}}\\pgfpathlineto{\\pgfqpoint{\\hatchdistance}{\\hatchdistance}}\\pgfusepath{stroke}}\\makeatother\\usetikzlibrary{calc,shadings,patterns,tikzmark}\\newcommand\\HatchedCell[5][0pt]{\\begin{tikzpicture}[overlay,remember picture]\\path ($(pic cs:#2)!0.5!(pic cs:#3)$)coordinate(aux1)(pic cs:#4)coordinate(aux2);\\fill[#5]($(aux1)+(-0.23*0.075\\textwidth,1.9ex)$)rectangle($(aux1 |- aux2)+(0.23*0.075\\textwidth,-#1*\\baselineskip-.8ex)$);\\end{tikzpicture}}\n\n\\begin{document}\n\n\\begin{figure}[H]\\centering\\begin{tabular}{|"); +// for (i = 0; i < _starpu_HFP_N - 1; i++) +// { +// fprintf(f_output,"c|"); +// } +// fprintf(f_output,"c|}\n\\hline"); +// i = 0; +// int tab_order[_starpu_HFP_N][_starpu_HFP_N]; +// int tab_gpu[_starpu_HFP_N][_starpu_HFP_N]; +// int tab_data_to_load[_starpu_HFP_N][_starpu_HFP_N]; 
+// while (!feof (f_input_data_coordinate)) +// { +// if (fscanf(f_input_data_coordinate, "%d %d %d", &x, &y, &gpu) != 3) +// { +// } +// tab_order[x][y] = processing_order[gpu]; +// processing_order[gpu]++; +// tab_gpu[x][y] = gpu; +// if (fscanf(f_input_data_to_load, "%d %d %d", &x, &y, &data_to_load) != 3) +// { +// } +// tab_data_to_load[x][y] = data_to_load; +// //~ printf("Dans visu, x = %d, y = %d, data to load = %d\n", x, y, data_to_load); +// i++; +// } +// +// tikz_index = 1; +// /* Because eof is one line too far */ +// tab_order[x][y] = tab_order[x][y] - 1; +// processing_order[gpu] = processing_order[gpu] - 1; +// //~ tab_data_to_load[x][y] = tab_data_to_load[x][y] - 1; +// //~ printf("%d pour x = %d, y = %d\n", tab_data_to_load[x][y], x, y); +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// for (j = 0; j < _starpu_HFP_N - 1; j++) +// { +// //~ if (tab_gpu[j][i] == 0) { red = 255; green = 255; blue = 255; } +// //~ else if (tab_gpu[j][i] == 6) { red = 70; green = 130; blue = 180; } +// //~ else +// //~ { +// rgb_gradient(tab_gpu[j][i], tab_order[j][i], processing_order[tab_gpu[j][i]], &red, &green, &blue); +// //~ } +// if (tab_data_to_load[j][i] == 1) +// { +// fprintf(f_output,"\\tikzmark{start%d}\\cellcolor[RGB]{%d,%d,%d}\\tikzmark{middle%d}\\tikzmark{end%d}\\HatchedCell{start%d}{middle%d}{end%d}{pattern color=black!100,pattern=north east hatch,hatch distance=4mm,hatch thickness=.3pt}&", tikz_index, red, green, blue, tikz_index, tikz_index, tikz_index, tikz_index, tikz_index); +// tikz_index++; +// } +// else if (tab_data_to_load[j][i] == 2) +// { +// fprintf(f_output,"\\tikzmark{start%d}\\cellcolor[RGB]{%d,%d,%d}\\tikzmark{middle%d}\\tikzmark{end%d}\\HatchedCell{start%d}{middle%d}{end%d}{pattern color=black!100,pattern=north east hatch,hatch distance=2mm,hatch thickness=.3pt}&", tikz_index, red, green, blue, tikz_index, tikz_index, tikz_index, tikz_index, tikz_index); +// tikz_index++; +// } +// else +// { +// fprintf(f_output,"\\cellcolor[RGB]{%d,%d,%d}&", red, green, blue); +// } +// } +// rgb_gradient(tab_gpu[j][i], tab_order[j][i], processing_order[tab_gpu[j][i]], &red, &green, &blue); +// if (tab_data_to_load[j][i] == 1) +// { +// fprintf(f_output,"\\tikzmark{start%d}\\cellcolor[RGB]{%d,%d,%d}\\tikzmark{middle%d}\\tikzmark{end%d}\\HatchedCell{start%d}{middle%d}{end%d}{pattern color=black!100,pattern=north east hatch,hatch distance=4mm,hatch thickness=.3pt}", tikz_index, red, green, blue, tikz_index, tikz_index, tikz_index, tikz_index, tikz_index); +// tikz_index++; +// } +// else if (tab_data_to_load[j][i] == 2) +// { +// fprintf(f_output,"\\tikzmark{start%d}\\cellcolor[RGB]{%d,%d,%d}\\tikzmark{middle%d}\\tikzmark{end%d}\\HatchedCell{start%d}{middle%d}{end%d}{pattern color=black!100,pattern=north east hatch,hatch distance=2mm,hatch thickness=.3pt}", tikz_index, red, green, blue, tikz_index, tikz_index, tikz_index, tikz_index, tikz_index); +// tikz_index++; +// } +// else +// { +// fprintf(f_output,"\\cellcolor[RGB]{%d,%d,%d}", red, green, blue); +// } +// fprintf(f_output," \\\\\\hline"); +// } +// fprintf(f_output, "\\end{tabular}\n\\caption{2D matrix visualization}\\end{figure}\n\n\\end{document}"); +// fclose(f_output); +// fclose(f_input_data_to_load); +// fclose(f_input_data_coordinate); +// +// printf("Fin de visualisation_tache_matrice_format_tex_with_data_2D()\n"); +//} + +/* For multi gpu with expected package time (MULTIGPU == 6). + * Which is different than expected time used in our experiments. 
*/ +static struct _starpu_HFP_data_on_node *init_data_list(starpu_data_handle_t d) +{ + struct _starpu_HFP_data_on_node *liste = malloc(sizeof(*liste)); + struct _starpu_HFP_handle *element = malloc(sizeof(*element)); + + if (liste == NULL || element == NULL) + { + exit(EXIT_FAILURE); + } + + liste->memory_used = starpu_data_get_size(d); + element->h = d; + element->last_use = 0; + element->next = NULL; + liste->first_data = element; + return liste; +} + +/* For gemm that has C tile put in won't use if they are never used again */ +static bool is_it_a_C_tile_data_never_used_again(starpu_data_handle_t h, int i, struct starpu_task_list *l, struct starpu_task *current_task) +{ + struct starpu_task *task = NULL; + if (i == 2) + { + /* Getting on the right data/right task */ + for (task = starpu_task_list_begin(l); task != starpu_task_list_end(l); task = starpu_task_list_next(task)) + { + if (current_task == task) + { + break; + } + } + for (task = starpu_task_list_next(task); task != starpu_task_list_end(l); task = starpu_task_list_next(task)) + { + if (h == STARPU_TASK_GET_HANDLE(task, 2)) + { + return false; + } + } + return true; + } + else + { + return false; + } +} + +/* Encore une fois seulement pour MULTIGPU == 6 */ +static void insertion_data_on_node(struct _starpu_HFP_data_on_node *liste, starpu_data_handle_t nvNombre, int use_order, int i, struct starpu_task_list *l, struct starpu_task *current_task) +{ + struct _starpu_HFP_handle *nouveau = malloc(sizeof(*nouveau)); + if (liste == NULL || nouveau == NULL) + { + perror("List in void insertion_data_on_node is NULL\n"); + exit(EXIT_FAILURE); + } + liste->memory_used += starpu_data_get_size(nvNombre); + nouveau->h = nvNombre; + nouveau->next = liste->first_data; + if (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") == 0) + { + if (is_it_a_C_tile_data_never_used_again(nouveau->h, i, l, current_task) == true) + { + nouveau->last_use = -1; + } + else + { + nouveau->last_use = use_order; + } + } + else + { + nouveau->last_use = use_order; + } + liste->first_data = nouveau; +} + +//static void afficher_data_on_node(struct _starpu_HFP_my_list *liste) +//{ +// if (liste == NULL) +// { +// exit(EXIT_FAILURE); +// } +// +// struct _starpu_HFP_handle *actuel = liste->pointer_node->first_data; +// +// printf("Memory used = %ld | Expected time = %f / ", liste->pointer_node->memory_used, liste->expected_package_computation_time); +// while (actuel != NULL) +// { +// printf("%p | %d -> ", actuel->h, actuel->last_use); +// actuel = actuel->next; +// } +// printf("NULL\n"); +//} + +/* Search a data on the linked list of data */ +static bool SearchTheData(struct _starpu_HFP_data_on_node *pNode, starpu_data_handle_t iElement, int use_order) +{ + pNode->pointer_data_list = pNode->first_data; + while (pNode->pointer_data_list != NULL) + { + if(pNode->pointer_data_list->h == iElement) + { + if (use_order != -2) { pNode->pointer_data_list->last_use = use_order; } + return true; + } + else + { + pNode->pointer_data_list = pNode->pointer_data_list->next; + } + } + return false; +} + +/* Replace the least recently used data on memory with the new one. + * But we need to look that it's not a data used by current task too! + * We remove first data from C if we are in a gemm application..0 + * Again it's only for MULTI GPU == 6 which we don't use. 
+ */ +static void replace_least_recently_used_data(struct _starpu_HFP_data_on_node *a, starpu_data_handle_t data_to_load, int use_order, struct starpu_task *current_task, struct starpu_task_list *l, int index_handle) +{ + bool data_currently_used = false; + int least_recent_use = INT_MAX; + for (a->pointer_data_list = a->first_data; a->pointer_data_list != NULL; a->pointer_data_list = a->pointer_data_list->next) + { + data_currently_used = false; + if (least_recent_use > a->pointer_data_list->last_use) + { + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(current_task); i++) + { + if (STARPU_TASK_GET_HANDLE(current_task, i) == a->pointer_data_list->h) + { + data_currently_used = true; + break; + } + } + if (data_currently_used == false) + { + least_recent_use = a->pointer_data_list->last_use; + } + } + } + for (a->pointer_data_list = a->first_data; a->pointer_data_list != NULL; a->pointer_data_list = a->pointer_data_list->next) + { + if (least_recent_use == a->pointer_data_list->last_use) + { + //~ printf("Données utilisé il y a le plus longtemps : %p | %d\n", a->pointer_data_list->h, a->pointer_data_list->last_use); + a->pointer_data_list->h = data_to_load; + if (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") == 0) + { + if (is_it_a_C_tile_data_never_used_again(a->pointer_data_list->h, index_handle, l, current_task) == true) + { + a->pointer_data_list->last_use = -1; + } + else + { + a->pointer_data_list->last_use = use_order; + } + } + else + { + a->pointer_data_list->last_use = use_order; + } + break; + } + } +} + +/* Push back in a package a task + * Used in load_balance + * Does not manage to migrate data of the task too + */ +static void merge_task_and_package(struct _starpu_HFP_my_list *package, struct starpu_task *task) +{ + int tab_runner = 0; int nb_duplicate_data = 0; + package->nb_task_in_sub_list++; + starpu_data_handle_t *temp_data_tab = malloc((package->package_nb_data + STARPU_TASK_GET_NBUFFERS(task))*sizeof(package->package_data[0])); + int i=0; + unsigned j=0; + while (i < package->package_nb_data && j < STARPU_TASK_GET_NBUFFERS(task)) + { + if (package->package_data[i] <= STARPU_TASK_GET_HANDLE(task,j)) + { + temp_data_tab[tab_runner] = package->package_data[i]; + i++; + } + else + { + temp_data_tab[tab_runner] = STARPU_TASK_GET_HANDLE(task,j); + j++; + } + tab_runner++; + } + while (i < package->package_nb_data) + { + temp_data_tab[tab_runner] = package->package_data[i]; + i++; + tab_runner++; + } + while (j < STARPU_TASK_GET_NBUFFERS(task)) + { + temp_data_tab[tab_runner] = STARPU_TASK_GET_HANDLE(task,j); + j++; + tab_runner++; + } + for (i = 0; i < (package->package_nb_data + (int)STARPU_TASK_GET_NBUFFERS(task)); i++) + { + if (temp_data_tab[i] == temp_data_tab[i + 1]) + { + temp_data_tab[i] = 0; + nb_duplicate_data++; + } + } + package->package_data = malloc((package->package_nb_data + STARPU_TASK_GET_NBUFFERS(task) - nb_duplicate_data) * sizeof(starpu_data_handle_t)); + j = 0; + for (i = 0; i < (package->package_nb_data + (int)STARPU_TASK_GET_NBUFFERS(task)); i++) + { + if (temp_data_tab[i] != 0) + { + package->package_data[j] = temp_data_tab[i]; j++; + } + } + package->package_nb_data = STARPU_TASK_GET_NBUFFERS(task) + package->package_nb_data - nb_duplicate_data; + package->expected_time += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + starpu_task_list_push_back(&package->sub_list, task); +} + +/* Return expected time of the list of task + fill a struct of data on the node, + * so we can more easily simulate 
adding, removing task in a list, + * without re-calculating everything. + */ +static void get_expected_package_computation_time(struct _starpu_HFP_my_list *l, starpu_ssize_t GPU_RAM) +{ + if (l->nb_task_in_sub_list < 1) + { + l->expected_package_computation_time = 0; + return; + } + int use_order = 1; + struct starpu_task *task; + struct starpu_task *next_task; + double time_to_add = 0; + + task = starpu_task_list_begin(&l->sub_list); + /* Init linked list of data in this package */ + l->pointer_node = init_data_list(STARPU_TASK_GET_HANDLE(task, 0)); + l->expected_package_computation_time = starpu_transfer_predict(0, 1, starpu_data_get_size(STARPU_TASK_GET_HANDLE(task, 0))); + /* Put the remaining data on simulated memory */ + unsigned i; + for (i = 1; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + insertion_data_on_node(l->pointer_node, STARPU_TASK_GET_HANDLE(task, i), use_order, i, &l->sub_list, task); + l->expected_package_computation_time += starpu_transfer_predict(0, 1, starpu_data_get_size(STARPU_TASK_GET_HANDLE(task, i))); + use_order++; + } + for (next_task = starpu_task_list_next(task); next_task != starpu_task_list_end(&l->sub_list); next_task = starpu_task_list_next(next_task)) + { + time_to_add = 0; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(next_task); i++) + { + if (SearchTheData(l->pointer_node, STARPU_TASK_GET_HANDLE(next_task, i), use_order) == false) + { + if (l->pointer_node->memory_used + starpu_transfer_predict(0, 1, starpu_data_get_size(STARPU_TASK_GET_HANDLE(next_task, i))) <= GPU_RAM) + { + insertion_data_on_node(l->pointer_node, STARPU_TASK_GET_HANDLE(next_task, i), use_order, i, &l->sub_list, task); + use_order++; + time_to_add += starpu_transfer_predict(0, 1, starpu_data_get_size(STARPU_TASK_GET_HANDLE(next_task, i))); + } + else + { + /* Need to evict a data and replace it */ + replace_least_recently_used_data(l->pointer_node, STARPU_TASK_GET_HANDLE(next_task, i), use_order, task, &l->sub_list, i); + use_order++; + time_to_add += starpu_transfer_predict(0, 1, starpu_data_get_size(STARPU_TASK_GET_HANDLE(next_task, i))); + } + } + else + { + /* A data already on memory will be used, need to increment use_order */ + use_order++; + } + } + /* Who cost more time ? Task T_{i-1} or data load from T_{i} */ + if (time_to_add > starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0)) + { + l->expected_package_computation_time += time_to_add; + } + else + { + l->expected_package_computation_time += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + } + task = starpu_task_list_next(task); + } + l->expected_package_computation_time += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); +} + +/* Equilibrates package in order to have packages with the same expected computation time, + * including transfers and computation/transfers overlap. + * Called in HFP_pull_task once all packages are done. + * It is called when MULTIGPU = 6 or 7. + */ +static void load_balance_expected_package_computation_time(struct _starpu_HFP_paquets *p, starpu_ssize_t GPU_RAM) +{ + //~ if (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") && strcmp(_starpu_HFP_appli, "random_set_of_task") != 0) + //~ { + /* What is different mainly is with the task of C that is in won't use for LRU with gemms once it used. + * We do something in replace_least_recently_used_data that maybe we can't do in cholesky or random graphs? 
*/ + //~ perror("load_balance_expected_package_computation_time not implemented yet for non-gemm applications\n"); exit(EXIT_FAILURE); + //~ } + struct starpu_task *task; + task = starpu_task_list_begin(&p->temp_pointer_1->sub_list); + p->temp_pointer_1 = p->first_link; + while (p->temp_pointer_1 != NULL) + { + get_expected_package_computation_time(p->temp_pointer_1, GPU_RAM); + p->temp_pointer_1 = p->temp_pointer_1->next; + } + + int package_with_min_expected_time, package_with_max_expected_time; + int last_package_with_min_expected_time = 0; + int last_package_with_max_expected_time = 0; + double min_expected_time, max_expected_time; + bool load_balance_needed = true; + //~ int percentage = 1; /* percentage of difference between packages */ + /* Selecting the smallest and biggest package */ + while (load_balance_needed == true) + { + p->temp_pointer_1 = p->first_link; + min_expected_time = p->temp_pointer_1->expected_package_computation_time; + max_expected_time = p->temp_pointer_1->expected_package_computation_time; + package_with_min_expected_time = 0; + package_with_max_expected_time = 0; + int i = 0; + p->temp_pointer_1 = p->temp_pointer_1->next; + while (p->temp_pointer_1 != NULL) + { + i++; + if (min_expected_time > p->temp_pointer_1->expected_package_computation_time) + { + min_expected_time = p->temp_pointer_1->expected_package_computation_time; + package_with_min_expected_time = i; + } + if (max_expected_time < p->temp_pointer_1->expected_package_computation_time) + { + max_expected_time = p->temp_pointer_1->expected_package_computation_time; + package_with_max_expected_time = i; + } + p->temp_pointer_1 = p->temp_pointer_1->next; + } + //~ if (_print_in_terminal == 1) { printf("min et max : %f et %f, paquets %d et %d\n",min_expected_time, max_expected_time, package_with_min_expected_time, package_with_max_expected_time); } + + /* To avoid looping indefinitely */ + if (last_package_with_min_expected_time == package_with_max_expected_time && last_package_with_max_expected_time == package_with_min_expected_time) + { + break; + } + + /* Stealing as much task from the last tasks of the biggest packages */ + /* Getting on the right packages */ + p->temp_pointer_1 = p->first_link; + for (i = 0; i < package_with_min_expected_time; i++) + { + p->temp_pointer_1 = p->temp_pointer_1->next; + } + p->temp_pointer_2 = p->first_link; + for (i = 0; i < package_with_max_expected_time; i++) + { + p->temp_pointer_2 = p->temp_pointer_2->next; + } + while (p->temp_pointer_1->expected_package_computation_time >= p->temp_pointer_2->expected_package_computation_time - ((p->temp_pointer_2->expected_package_computation_time*max_expected_time)/100)) + { + task = starpu_task_list_pop_back(&p->temp_pointer_2->sub_list); + p->temp_pointer_2->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + merge_task_and_package(p->temp_pointer_1, task); + p->temp_pointer_2->nb_task_in_sub_list--; + free(p->temp_pointer_1->pointer_node); + free(p->temp_pointer_2->pointer_node); + get_expected_package_computation_time(p->temp_pointer_1, GPU_RAM); + get_expected_package_computation_time(p->temp_pointer_2, GPU_RAM); + if ( p->temp_pointer_1->expected_package_computation_time >= p->temp_pointer_2->expected_package_computation_time) + { + break; + } + } + last_package_with_min_expected_time = package_with_min_expected_time; + last_package_with_max_expected_time = package_with_max_expected_time; + } +} + +/* Called in HFP_pull_task. 
Cut the package in half and interlace tasks from the end of the left part and the beginning of the right part.
 + * This way we start with tasks sharing data (the middle of the package) and end with tasks sharing little data (the extremities).
 + * This is only called if the environment variable INTERLACING is set to something other than 1.
 + * Example: 0 1 2 3 4 5 6 7 8 9 10 -> 5 6 4 7 3 8 2 9 1 10 0 */
+static void interlacing_task_list(struct _starpu_HFP_paquets *a)
+{
+	a->temp_pointer_1 = a->first_link;
+	int middle = 0;
+	int i = 0;
+	struct starpu_task_list sub_list_left;
+	starpu_task_list_init(&sub_list_left);
+	struct starpu_task_list sub_list_right;
+	starpu_task_list_init(&sub_list_right);
+
+	while (a->temp_pointer_1 != NULL)
+	{
+		middle = a->temp_pointer_1->nb_task_in_sub_list/2;
+		if (a->temp_pointer_1->nb_task_in_sub_list%2 == 1)
+		{
+			/* So the bigger half is the left one, the one we start with. */
+			middle++;
+		}
+		/* Fill the two sub-lists, left and right */
+		for (i = 0; i < middle; i++)
+		{
+			starpu_task_list_push_back(&sub_list_left, starpu_task_list_pop_front(&a->temp_pointer_1->sub_list));
+		}
+		for (i = middle; i < a->temp_pointer_1->nb_task_in_sub_list; i++)
+		{
+			starpu_task_list_push_back(&sub_list_right, starpu_task_list_pop_front(&a->temp_pointer_1->sub_list));
+		}
+		/* Refill the package, alternating between left and right */
+		for (i = 0; i < a->temp_pointer_1->nb_task_in_sub_list; i++)
+		{
+			if (i%2 == 0)
+			{
+				starpu_task_list_push_back(&a->temp_pointer_1->sub_list, starpu_task_list_pop_back(&sub_list_left));
+			}
+			else
+			{
+				starpu_task_list_push_back(&a->temp_pointer_1->sub_list, starpu_task_list_pop_front(&sub_list_right));
+			}
+		}
+		a->temp_pointer_1 = a->temp_pointer_1->next;
+	}
+}
+
+/* TODO: remove once the timing measurements are done */
+//~ struct timeval time_start_gettasktoreturn;
+//~ struct timeval time_end_gettasktoreturn;
+//~ long long time_total_gettasktoreturn = 0;
+
+/* Called in HFP_pull_task when we need to return a task. It is used when we have multiple GPUs.
+ * In the case of modular-heft-HFP, it needs to round-robin over the tasks it returns. So we use expected_time_pulled_out,
+ * an element of struct my_list, in order to track which package pulled out the least expected task time.
So heft can can + * better divide tasks between GPUs */ +struct starpu_task *_starpu_HFP_get_task_to_return(struct starpu_sched_component *component, struct starpu_sched_component *to, struct _starpu_HFP_paquets* a, int nb_gpu) +{ + //~ gettimeofday(&time_start_gettasktoreturn, NULL); + int max_task_time = 0; + int index_package_max_task_time = 0; + a->temp_pointer_1 = a->first_link; + int i = 0; struct starpu_task *task; double min_expected_time_pulled_out = 0; int package_min_expected_time_pulled_out = 0; + /* If there is only one big package */ + if (multigpu == 0 && _starpu_HFP_hmetis == 0 && _nb_gpus == 1) + { + //~ printf("Will pop front\n"); + task = starpu_task_list_pop_front(&a->temp_pointer_1->sub_list); + _sched_visu_print_data_to_load_prefetch(task, starpu_worker_get_id() + 1, 0); + + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + return task; + } + else + { + /* If we use modular heft I look at the expected time pulled out of each package to alternate between packages */ + if (modular_heft_hfp_mode != 0) + { + package_min_expected_time_pulled_out = 0; + min_expected_time_pulled_out = DBL_MAX; + for (i = 0; i < nb_gpu; i++) + { + /* We also need to check that the package is not empty */ + if (a->temp_pointer_1->expected_time_pulled_out < min_expected_time_pulled_out && !starpu_task_list_empty(&a->temp_pointer_1->sub_list)) + { + min_expected_time_pulled_out = a->temp_pointer_1->expected_time_pulled_out; + package_min_expected_time_pulled_out = i; + } + a->temp_pointer_1 = a->temp_pointer_1->next; + } + a->temp_pointer_1 = a->first_link; + for (i = 0; i < package_min_expected_time_pulled_out; i++) + { + a->temp_pointer_1 = a->temp_pointer_1->next; + } + task = starpu_task_list_pop_front(&a->temp_pointer_1->sub_list); + a->temp_pointer_1->expected_time_pulled_out += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + + + return task; + } + else + { + /* We are using HFP */ + for (i = 0; i < nb_gpu; i++) + { + if (to == component->children[i]) + { + break; + } + else + { + a->temp_pointer_1 = a->temp_pointer_1->next; + } + } + if (!starpu_task_list_empty(&a->temp_pointer_1->sub_list)) + { + //~ printf("Will pop front\n"); + task = starpu_task_list_pop_front(&a->temp_pointer_1->sub_list); + a->temp_pointer_1->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_1->nb_task_in_sub_list--; + _sched_visu_print_data_to_load_prefetch(task, starpu_worker_get_id() + 1, 0); + + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + //~ printf("Return %p\n", task); + return task; + } + else + { + /* Our current gpu's package is empty, we want to steal! */ + if (task_stealing == 1) + { + /* Stealing from package with the most tasks time duration. + * temp_pointer_2 = biggest package, temp_pointer_1 = empty package that will steal from temp_pointer_2. 
*/ + a->temp_pointer_2 = a->first_link; + i = 0; + max_task_time = a->temp_pointer_2->expected_time; + index_package_max_task_time = 0; + while (a->temp_pointer_2->next != NULL) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + i++; + if (max_task_time < a->temp_pointer_2->expected_time) + { + max_task_time = a->temp_pointer_2->expected_time; + index_package_max_task_time = i; + } + } + if (max_task_time != 0) + { + a->temp_pointer_2 = a->first_link; + for (i = 0; i < index_package_max_task_time; i++) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + } + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + a->temp_pointer_2->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_2->nb_task_in_sub_list--; + _sched_visu_print_data_to_load_prefetch(task, starpu_worker_get_id() + 1, 0); + + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + + return task; + } + else + { + return NULL; + } + } + else if (task_stealing == 2 || task_stealing == 3) + { + /* Stealing from package with the most expected package time */ + a->temp_pointer_2 = a->first_link; + while (a->temp_pointer_2 != NULL) + { + get_expected_package_computation_time(a->temp_pointer_2, _starpu_HFP_GPU_RAM_M); + a->temp_pointer_2 = a->temp_pointer_2->next; + } + i = 0; + a->temp_pointer_2 = a->first_link; + double max_package_time = a->temp_pointer_2->expected_package_computation_time; + index_package_max_task_time = 0; + while (a->temp_pointer_2->next != NULL) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + i++; + if (max_package_time < a->temp_pointer_2->expected_package_computation_time) + { + max_package_time = a->temp_pointer_2->expected_package_computation_time; + index_package_max_task_time = i; + } + } + if (max_package_time != 0) + { + a->temp_pointer_2 = a->first_link; + for (i = 0; i < index_package_max_task_time; i++) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + } + if (task_stealing == 3) + { + /* We steal half of the package in terms of task duration */ + while (a->temp_pointer_1->expected_time < a->temp_pointer_2->expected_time/2) + { + /* We steal from the end */ + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + a->temp_pointer_2->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_2->nb_task_in_sub_list--; + starpu_task_list_push_front(&a->temp_pointer_1->sub_list, task); + a->temp_pointer_1->expected_time += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_1->nb_task_in_sub_list++; + } + get_expected_package_computation_time(a->temp_pointer_2, _starpu_HFP_GPU_RAM_M); + task = starpu_task_list_pop_front(&a->temp_pointer_1->sub_list); + a->temp_pointer_1->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_1->nb_task_in_sub_list--; + get_expected_package_computation_time(a->temp_pointer_1, _starpu_HFP_GPU_RAM_M); + } + else + { + /* We only steal one task */ + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + a->temp_pointer_2->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_2->nb_task_in_sub_list--; 
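+							/* After stealing this single task, the victim package's expected
+							 * computation time is refreshed just below, so that later stealing
+							 * decisions are taken with up-to-date values. */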
+ get_expected_package_computation_time(a->temp_pointer_2, _starpu_HFP_GPU_RAM_M); + } + _sched_visu_print_data_to_load_prefetch(task, starpu_worker_get_id() + 1, 0); + + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + return task; + } + else + { + /* Nothing to steal */ + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + return NULL; + } + } + else + { + /* We don't use task stealing */ + //~ gettimeofday(&time_end_gettasktoreturn, NULL); + //~ time_total_gettasktoreturn += (time_end_gettasktoreturn.tv_sec - time_start_gettasktoreturn.tv_sec)*1000000LL + time_end_gettasktoreturn.tv_usec - time_start_gettasktoreturn.tv_usec; + return NULL; + } + } + } + } +} + +/* Giving prefetch for each task to modular-heft-HFP */ +static void prefetch_each_task(struct _starpu_HFP_paquets *a, struct starpu_sched_component *component) +{ + struct starpu_task *task; + int i = 0; + a->temp_pointer_1 = a->first_link; + + while (a->temp_pointer_1 != NULL) + { + for (task = starpu_task_list_begin(&a->temp_pointer_1->sub_list); task != starpu_task_list_end(&a->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) + { + /* Putting in workerid the information of the chosen gpu HFP. Then in helper_mct, we can use this information to influence the expected time */ + task->workerid = i; + if (modular_heft_hfp_mode == 1) + { + starpu_prefetch_task_input_on_node_prio(task, starpu_worker_get_memory_node(starpu_bitmap_first(&component->children[0]->children[i]->workers_in_ctx)), 0); + } + else if (modular_heft_hfp_mode == 2) + { + starpu_idle_prefetch_task_input_on_node_prio(task, starpu_worker_get_memory_node(starpu_bitmap_first(&component->children[0]->children[i]->workers_in_ctx)), 0); + } + else + { + printf("Wrong environement variable MODULAR_HEFT_HFP_MODE\n"); + exit(0); + } + } + a->temp_pointer_1 = a->temp_pointer_1->next; printf("next\n"); + i++; + } +} + +//static int get_max_value_common_data_matrix(struct _starpu_HFP_paquets *p, int GPU_limit_switch, int number_task, int min_nb_task_in_sub_list, long int matrice_donnees_commune[][number_task]) +//{ +// struct _starpu_HFP_my_list *l1 = p->first_link; +// struct _starpu_HFP_my_list *l2 = p->first_link; +// int i_bis = 0; +// int j_bis = 0; +// +// int max_value_common_data_matrix = 0; +// for (i_bis = 0; i_bis < number_task; i_bis++) +// { +// if (l1->nb_task_in_sub_list == min_nb_task_in_sub_list) +// { +// for (l2 = p->first_link; l2 != NULL; l2 = l2->next) +// { +// if (i_bis != j_bis) +// { +// //~ if(max_value_common_data_matrix < matrice_donnees_commune[i_bis][j_bis]) +// if (max_value_common_data_matrix < matrice_donnees_commune[i_bis][j_bis] && ((GPU_limit_switch == 0) || (GPU_limit_switch == 1 && (l1->data_weight + l2->data_weight - matrice_donnees_commune[i_bis][j_bis])))) +// { +// max_value_common_data_matrix = matrice_donnees_commune[i_bis][j_bis]; +// } +// } +// j_bis++; +// } +// } +// l1 = l1->next; +// j_bis = 0; +// } +// return max_value_common_data_matrix; +//} + +/* Pushing the tasks */ +static int HFP_push_task(struct starpu_sched_component *component, struct starpu_task *task) +{ + struct _starpu_HFP_sched_data *data = component->data; + 
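+	/* Incoming tasks are simply queued in sched_list under the policy mutex;
+	 * the actual scheduling work happens later, when tasks are pulled. */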
STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_front(&data->sched_list, task); + starpu_push_task_end(task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + /* Tell below that they can now pull */ + component->can_pull(component); + return 0; +} + +/* TODO : a supprimer une fois les mesures du temps terminées */ +//~ struct timeval time_start_scheduling; +//~ struct timeval time_end_scheduling; +//~ long long time_total_scheduling = 0; +//~ struct timeval time_start_find_min_size; +//~ struct timeval time_end_find_min_size; +//~ long long time_total_find_min_size = 0; +//~ struct timeval time_start_init_packages; +//~ struct timeval time_end_init_packages; +//~ long long time_total_init_packages = 0; +//~ struct timeval time_start_fill_matrix_common_data_plus_get_max; +//~ struct timeval time_end_fill_matrix_common_data_plus_get_max; +//~ long long time_total_fill_matrix_common_data_plus_get_max = 0; +//~ struct timeval time_start_order_u_total; +//~ struct timeval time_end_order_u_total; +//~ long long time_total_order_u_total = 0; +//~ struct timeval time_start_reset_init_start_while_loop; +//~ struct timeval time_end_reset_init_start_while_loop; +//~ long long time_total_reset_init_start_while_loop = 0; +//~ struct timeval time_start_merge; +//~ struct timeval time_end_merge; +//~ long long time_total_merge = 0; +//~ struct timeval time_start_iteration_i; +//~ struct timeval time_end_iteration_i; +//~ long long time_total_iteration_i = 0; + +/* Need an empty data paquets_data to build packages + * Output a task list ordered. So it's HFP if we have only one package at the end + * Used for now to reorder task inside a package after load balancing + * Can be used as main HFP like in pull task later + * Things commented are things to print matrix or things like that. + */ +static struct _starpu_HFP_paquets* hierarchical_fair_packing(struct starpu_task_list *task_list, int number_task, int number_of_package_to_build) +{ + struct _starpu_HFP_paquets *paquets_data = malloc(sizeof(*paquets_data)); + struct _starpu_HFP_my_list *my_data = malloc(sizeof(*my_data)); + starpu_task_list_init(&my_data->sub_list); + starpu_task_list_init(&my_data->refused_fifo_list); + my_data->next = NULL; + paquets_data->temp_pointer_1 = my_data; + paquets_data->first_link = paquets_data->temp_pointer_1; + struct starpu_task_list non_connexe; + starpu_task_list_init(&non_connexe); + int nb_duplicate_data = 0; /* Used to store the weight the merging of two packages would be. It is then used to see if it's inferior to the size of the RAM of the GPU */ + long int max_value_common_data_matrix = 0; /* Store the maximum weight of the commons data between two packages for all the tasks */ + long int common_data_last_package_i1_j1 = 0; /* Variables used to compare the affinity between sub package 1i and 1j, 1i and 2j etc... 
*/ + long int common_data_last_package_i1_j2 = 0; long int common_data_last_package_i2_j1 = 0; + long int common_data_last_package_i2_j2 = 0; long int max_common_data_last_package = 0; + long int weight_package_i = 0; /* Used for ORDER_U too */ + long int weight_package_j = 0; + int GPU_limit_switch = 1; int tab_runner = 0; + int index_head_1 = 0; + int index_head_2 = 0; + int common_data_last_package_i2_j = 0; + int common_data_last_package_i1_j = 0; + int common_data_last_package_i_j1 = 0; + int common_data_last_package_i_j2 = 0; + int min_nb_task_in_sub_list = 0; + struct starpu_task *task; int nb_of_loop = 0; + int packaging_impossible = 0; + int n_duplicate_cho = 0; + + /* One task == one link in the linked list */ + int do_not_add_more = number_task - 1; + + while (!starpu_task_list_empty(task_list)) + { + task = starpu_task_list_pop_front(task_list); + _sched_visu_print_data_for_task(task, "Task %p using data(s)"); + + paquets_data->temp_pointer_1->expected_time = starpu_task_expected_length(task, starpu_worker_get_perf_archtype(0, 0), 0); + paquets_data->temp_pointer_1->data_weight = 0; + paquets_data->temp_pointer_1->data_to_evict_next = NULL; /* Mise à NULL de data to evict next pour eviter les pb en réel sur grid5k */ + + /* Si on est sur Cholesky je vire les doublons de données au sein d'une tâche */ + if (strcmp(_starpu_HFP_appli, "chol_model_11") == 0) + { + n_duplicate_cho = 0; + + /* Getting the number of duplicate and filling a temp_tab */ + starpu_data_handle_t *temp_data_tab_cho = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task) - 1; i++) + { + if (STARPU_TASK_GET_HANDLE(task, i) == STARPU_TASK_GET_HANDLE(task, i + 1)) + { + n_duplicate_cho++; + temp_data_tab_cho[i] = NULL; + } + else + { + temp_data_tab_cho[i] = STARPU_TASK_GET_HANDLE(task, i); + } + } + temp_data_tab_cho[i] = STARPU_TASK_GET_HANDLE(task, i); + paquets_data->temp_pointer_1->package_data = malloc((STARPU_TASK_GET_NBUFFERS(task) - n_duplicate_cho)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + int j = 0; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (temp_data_tab_cho[i] != NULL) + { + paquets_data->temp_pointer_1->package_data[j] = temp_data_tab_cho[i]; + j++; + } + } + paquets_data->temp_pointer_1->package_nb_data = STARPU_TASK_GET_NBUFFERS(task) - n_duplicate_cho; + } + else + { + paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + + /* Mise à NULL de data to evict next pour eviter les pb en réel sur grid5k */ + paquets_data->temp_pointer_1->data_to_evict_next = NULL; + + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + paquets_data->temp_pointer_1->package_data[i] = STARPU_TASK_GET_HANDLE(task,i); + paquets_data->temp_pointer_1->data_weight += starpu_data_get_size(STARPU_TASK_GET_HANDLE(task,i)); + } + paquets_data->temp_pointer_1->package_nb_data = STARPU_TASK_GET_NBUFFERS(task); + } + + /* We sort our data in the packages */ + qsort(paquets_data->temp_pointer_1->package_data, paquets_data->temp_pointer_1->package_nb_data, sizeof(paquets_data->temp_pointer_1->package_data[0]), HFP_pointeurComparator); + + /* Pushing the task and the number of the package in the package */ + starpu_task_list_push_back(&paquets_data->temp_pointer_1->sub_list, task); + /* Initialization of the lists last_packages */ + paquets_data->temp_pointer_1->split_last_ij = 0; + 
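+		/* At this point the package holds exactly one task together with its
+		 * sorted data handles; the counter below records that single task. */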
paquets_data->temp_pointer_1->nb_task_in_sub_list = 1; + + if(do_not_add_more != 0) + { + _starpu_HFP_insertion(paquets_data); + + /*TODO utile ??? */ + //~ paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + } + do_not_add_more--; + } + paquets_data->first_link = paquets_data->temp_pointer_1; + paquets_data->temp_pointer_2 = paquets_data->first_link; + index_head_2++; + paquets_data->NP = _starpu_HFP_NT; + + /* THE while loop. Stop when no more packaging are possible */ + while (packaging_impossible == 0) + { + beginning_while_packaging_impossible: + nb_of_loop++; + packaging_impossible = 1; + + /* Then we create the common data matrix */ + long int matrice_donnees_commune[number_task][number_task]; + int i; + for (i = 0; i < number_task; i++) + { + int j; + for (j = 0; j < number_task; j++) + { + matrice_donnees_commune[i][j] = 0; + } + } + + /* Faster first iteration by grouping together tasks that share at least one data. Doesn't look + * further after one task have been found */ + //~ if (nb_of_loop == 1 && strcmp(_starpu_HFP_appli, "chol_model_11") != 0 && faster_first_iteration == 1) + if (nb_of_loop == 1 && faster_first_iteration == 1) + { + packaging_impossible = 0; + index_head_1 = 0; + index_head_2 = 0; + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == 1) + { + for (paquets_data->temp_pointer_2 = paquets_data->first_link; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + { + if (index_head_1 != index_head_2 && paquets_data->temp_pointer_2->nb_task_in_sub_list == 1) + { + for (i = 0; i < paquets_data->temp_pointer_1->package_nb_data; i++) + { + int j; + for (j = 0; j < paquets_data->temp_pointer_2->package_nb_data; j++) + { + if (paquets_data->temp_pointer_1->package_data[i] == paquets_data->temp_pointer_2->package_data[j]) + { + /* Merge */ + //~ printf("On va merge le paquet %d et le paquet %d dans nb of loop == 1.\n", index_head_1, index_head_2); + paquets_data->NP--; + + paquets_data->temp_pointer_1->split_last_ij = paquets_data->temp_pointer_1->nb_task_in_sub_list; + + /* Fusion des listes de tâches */ + while (!starpu_task_list_empty(&paquets_data->temp_pointer_2->sub_list)) + { + starpu_task_list_push_back(&paquets_data->temp_pointer_1->sub_list, starpu_task_list_pop_front(&paquets_data->temp_pointer_2->sub_list)); + } + paquets_data->temp_pointer_1->nb_task_in_sub_list += paquets_data->temp_pointer_2->nb_task_in_sub_list; + + int i_bis = 0; + int j_bis = 0; + tab_runner = 0; + nb_duplicate_data = 0; + /* Fusion des tableaux de données */ + //~ _print_in_terminal ("malloc de %d.\n", paquets_data->temp_pointer_2->package_nb_data + paquets_data->temp_pointer_1->package_nb_data); + + starpu_data_handle_t *temp_data_tab = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data) * sizeof(paquets_data->temp_pointer_1->package_data[0])); + while (i_bis < paquets_data->temp_pointer_1->package_nb_data && j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + if (paquets_data->temp_pointer_1->package_data[i_bis] == paquets_data->temp_pointer_2->package_data[j_bis]) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + temp_data_tab[tab_runner + 1] = 
paquets_data->temp_pointer_2->package_data[j_bis]; + i_bis++; + j_bis++; + tab_runner++; + nb_duplicate_data++; + } + else if (paquets_data->temp_pointer_1->package_data[i_bis] < paquets_data->temp_pointer_2->package_data[j_bis]) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + i_bis++; + } + else + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; + j_bis++; + } + tab_runner++; + } + /* Remplissage en vidant les données restantes du paquet I ou J */ + while (i_bis < paquets_data->temp_pointer_1->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + i_bis++; + tab_runner++; + } + while (j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; + j_bis++; + tab_runner++; + } + /* Remplissage du tableau de données en ignorant les doublons */ + paquets_data->temp_pointer_1->data_weight = 0; + //~ print_in_terminal ("malloc de %d.\n", paquets_data->temp_pointer_2->package_nb_data + paquets_data->temp_pointer_1->package_nb_data - nb_duplicate_data); + paquets_data->temp_pointer_1->package_data = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data - nb_duplicate_data) * sizeof(starpu_data_handle_t)); + j_bis = 0; + for (i_bis = 0; i_bis < (paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data); i_bis++) + { + paquets_data->temp_pointer_1->package_data[j_bis] = temp_data_tab[i_bis]; + + paquets_data->temp_pointer_1->data_weight += starpu_data_get_size(temp_data_tab[i_bis]); + + if (temp_data_tab[i_bis] == temp_data_tab[i_bis + 1]) + { + i_bis++; + } + j_bis++; + } + + /* Fusion du nombre de données et du temps prévu */ + paquets_data->temp_pointer_1->package_nb_data = paquets_data->temp_pointer_2->package_nb_data + paquets_data->temp_pointer_1->package_nb_data - nb_duplicate_data; + paquets_data->temp_pointer_1->expected_time += paquets_data->temp_pointer_2->expected_time; + + /* Il faut le mettre à 0 pour le suppr ensuite dans HFP_delete_link */ + paquets_data->temp_pointer_2->package_nb_data = 0; + paquets_data->temp_pointer_2->nb_task_in_sub_list = 0; + + //~ for (i_bis = 0; i_bis < paquets_data->temp_pointer_1->package_nb_data; i_bis++) + //~ { + //~ printf("%p ", paquets_data->temp_pointer_1->package_data[i_bis]); + //~ } + //~ printf("\n"); + + goto start_loop_1; + + } + } + } + } + index_head_2++; + } + } + start_loop_1: + index_head_1++; + index_head_2 = 0; + } + goto break_merging_1; + } + + //~ gettimeofday(&time_start_reset_init_start_while_loop, NULL); + + /* Variables we need to reinitialize for a new iteration */ + paquets_data->temp_pointer_1 = paquets_data->first_link; + paquets_data->temp_pointer_2 = paquets_data->first_link; + index_head_1 = 0; + index_head_2 = 0; + tab_runner = 0; + //~ nb_min_task_packages = 0; + min_nb_task_in_sub_list = 0; + max_value_common_data_matrix = 0; + min_nb_task_in_sub_list = paquets_data->temp_pointer_1->nb_task_in_sub_list; + + //~ gettimeofday(&time_end_reset_init_start_while_loop, NULL); + //~ time_total_reset_init_start_while_loop += (time_end_reset_init_start_while_loop.tv_sec - time_start_reset_init_start_while_loop.tv_sec)*1000000LL + time_end_reset_init_start_while_loop.tv_usec - time_start_reset_init_start_while_loop.tv_usec; + /* First we get the number of packages that have the minimal number of tasks */ + + //~ 
gettimeofday(&time_start_find_min_size, NULL); + + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (min_nb_task_in_sub_list > paquets_data->temp_pointer_1->nb_task_in_sub_list) + { + min_nb_task_in_sub_list = paquets_data->temp_pointer_1->nb_task_in_sub_list; + } + } + //~ gettimeofday(&time_end_find_min_size, NULL); + //~ time_total_find_min_size += (time_end_find_min_size.tv_sec - time_start_find_min_size.tv_sec)*1000000LL + time_end_find_min_size.tv_usec - time_start_find_min_size.tv_usec; + + //~ for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + //~ { + //~ if (min_nb_task_in_sub_list == paquets_data->temp_pointer_1->nb_task_in_sub_list) + //~ { + //~ nb_min_task_packages++; + //~ } + //~ } + //~ if (_print_in_terminal == 1) { printf("Il y a %d paquets de taille minimale %d tâche(s)\n", nb_min_task_packages, min_nb_task_in_sub_list); } + + /* Remplissage de la matrice + obtention du max du poids */ + /* Ancienne version quadratique */ + //~ for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + //~ { + //~ if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + //~ { + //~ for (paquets_data->temp_pointer_2 = paquets_data->first_link; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + //~ { + //~ if (index_head_1 != index_head_2) + //~ { + //~ for (i = 0; i < paquets_data->temp_pointer_1->package_nb_data; i++) + //~ { + //~ for (j = 0; j < paquets_data->temp_pointer_2->package_nb_data; j++) + //~ { + //~ printf("On compare %p et %p.\n", paquets_data->temp_pointer_1->package_data[i], paquets_data->temp_pointer_2->package_data[j]); + //~ if ((paquets_data->temp_pointer_1->package_data[i] == paquets_data->temp_pointer_2->package_data[j])) + //~ { + //~ matrice_donnees_commune[index_head_1][index_head_2] += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[j]); + //~ } + //~ } + //~ } + //~ if (max_value_common_data_matrix < matrice_donnees_commune[index_head_1][index_head_2] && (GPU_limit_switch == 0 || (GPU_limit_switch == 1 && (paquets_data->temp_pointer_1->data_weight + paquets_data->temp_pointer_2->data_weight - matrice_donnees_commune[index_head_1][index_head_2]) <= GPU_RAM_M))) + //~ { + //~ /* Sinon on met la valeur */ + //~ max_value_common_data_matrix = matrice_donnees_commune[index_head_1][index_head_2]; + //~ } + //~ } + //~ index_head_2++; + //~ } + //~ } + //~ index_head_1++; + //~ index_head_2 = 0; + //~ } + + /* Nouvelle version linéaire */ + + //~ gettimeofday(&time_start_fill_matrix_common_data_plus_get_max, NULL); + + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + for (paquets_data->temp_pointer_2 = paquets_data->first_link; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + { + if (index_head_1 != index_head_2) + { + i = 0; + int j = 0; + while (i < paquets_data->temp_pointer_1->package_nb_data && j < paquets_data->temp_pointer_2->package_nb_data) + { + if 
(paquets_data->temp_pointer_1->package_data[i] == paquets_data->temp_pointer_2->package_data[j]) + { + matrice_donnees_commune[index_head_1][index_head_2] += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[j]); + i++; + j++; + } + else if (paquets_data->temp_pointer_1->package_data[i] > paquets_data->temp_pointer_2->package_data[j]) + { + j++; + } + else if (paquets_data->temp_pointer_1->package_data[i] < paquets_data->temp_pointer_2->package_data[j]) + { + i++; + } + } + if (max_value_common_data_matrix < matrice_donnees_commune[index_head_1][index_head_2] && (GPU_limit_switch == 0 || (GPU_limit_switch == 1 && (paquets_data->temp_pointer_1->data_weight + paquets_data->temp_pointer_2->data_weight - matrice_donnees_commune[index_head_1][index_head_2]) <= _starpu_HFP_GPU_RAM_M))) + { + max_value_common_data_matrix = matrice_donnees_commune[index_head_1][index_head_2]; + } + } + index_head_2++; + } + } + index_head_1++; + index_head_2 = 0; + } + + //~ gettimeofday(&time_end_fill_matrix_common_data_plus_get_max, NULL); + //~ time_total_fill_matrix_common_data_plus_get_max += (time_end_fill_matrix_common_data_plus_get_max.tv_sec - time_start_fill_matrix_common_data_plus_get_max.tv_sec)*1000000LL + time_end_fill_matrix_common_data_plus_get_max.tv_usec - time_start_fill_matrix_common_data_plus_get_max.tv_usec; + + /* Code to print the common data matrix */ + //~ if (_print_in_terminal == 1) { printf("Common data matrix : \n"); for (i = 0; i < number_task; i++) { for (j = 0; j < number_task; j++) { printf(" %3li ",matrice_donnees_commune[i][j]); } printf("\n"); printf("---------\n"); } } + + /* Ne fonctionne que en mono GPU pour les matrices sparses :/. */ + if (max_value_common_data_matrix == 0 && GPU_limit_switch == 0) + { + /* It means that P_i share no data with others, so we put it in the end of the list + * For this we use a separate list that we merge at the end + * We will put this list at the end of the rest of the packages */ + //~ if (_print_in_terminal == 1) { printf("graphe non connexe\n"); printf("%d paquets.\n", paquets_data->NP); } + paquets_data->temp_pointer_1 = paquets_data->first_link; + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + while (!starpu_task_list_empty(&paquets_data->temp_pointer_1->sub_list)) + { + starpu_task_list_push_back(&non_connexe, starpu_task_list_pop_front(&paquets_data->temp_pointer_1->sub_list)); + } + paquets_data->temp_pointer_1->package_nb_data = 0; + paquets_data->NP--; + //~ printf("--\n"); + } + } + + /* Il ne faut pas supprimer le dernier paquet qu'il nous reste evidemment. 
*/ + if (paquets_data->NP < _nb_gpus) + { + goto end_while_packaging_impossible; + } + HFP_delete_link(paquets_data); + number_task = paquets_data->NP; + goto beginning_while_packaging_impossible; + } + else if (max_value_common_data_matrix == 0) + { + GPU_limit_switch = 0; + goto beginning_while_packaging_impossible; + } + else /* Searching the package that get max and merge them */ + { + paquets_data->temp_pointer_1 = paquets_data->first_link; + paquets_data->temp_pointer_2 = paquets_data->first_link; + for (i = 0; i < number_task; i++) + { + //~ printf("i = %d, number task = %d.\n", i, number_task); + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + int j; + for (j = 0; j < number_task; j++) + { + if (matrice_donnees_commune[i][j] == max_value_common_data_matrix && i != j) + { + /* Merge */ + packaging_impossible = 0; + //~ printf("On va merge le paquet %d et le paquet %d. Ils ont %ld en commun. Ils ont %d et %d tâches.\n", i, j, max_value_common_data_matrix, paquets_data->temp_pointer_1->nb_task_in_sub_list, paquets_data->temp_pointer_2->nb_task_in_sub_list); + + paquets_data->NP--; + + //~ gettimeofday(&time_start_order_u_total, NULL); + + if (order_u == 1) + { + //~ printf("Début U\n"); + weight_package_i = paquets_data->temp_pointer_1->data_weight; + weight_package_j = paquets_data->temp_pointer_2->data_weight; + if (paquets_data->temp_pointer_1->nb_task_in_sub_list != 1 && paquets_data->temp_pointer_2->nb_task_in_sub_list != 1) + { + if (weight_package_i > _starpu_HFP_GPU_RAM_M && weight_package_j <= _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i1_j = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 0, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 0, false,_starpu_HFP_GPU_RAM_M); + if (common_data_last_package_i1_j > common_data_last_package_i2_j) + { + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + } + } + else if (weight_package_i <= _starpu_HFP_GPU_RAM_M && weight_package_j > _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 0, 1, false, _starpu_HFP_GPU_RAM_M); + common_data_last_package_i_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 0, 2, false, _starpu_HFP_GPU_RAM_M); + if (common_data_last_package_i_j2 > common_data_last_package_i_j1) + { + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + } + else + { + if (weight_package_i > _starpu_HFP_GPU_RAM_M && weight_package_j > _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i1_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 1, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i1_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 2, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 1, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 2, false,_starpu_HFP_GPU_RAM_M); + } + else if (weight_package_i <= _starpu_HFP_GPU_RAM_M && weight_package_j <= _starpu_HFP_GPU_RAM_M) + { + 
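+										/* Both packages fit in GPU memory: compare the data shared by
+										 * the first/second halves (1/2) of packages i and j, then reverse
+										 * the sub-lists so that the two halves sharing the most data end
+										 * up adjacent at the junction (hence the "U" ordering). E.g. if
+										 * i2/j1 share the most, no reversal is needed since i2 already
+										 * ends package i and j1 already starts package j. */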
common_data_last_package_i1_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 1, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i1_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 2, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 1, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 2, true,_starpu_HFP_GPU_RAM_M); + } + else + { + printf("Erreur dans ordre U, aucun cas choisi\n"); fflush(stdout); + exit(0); + } + max_common_data_last_package = common_data_last_package_i2_j1; + if (max_common_data_last_package < common_data_last_package_i1_j1) { max_common_data_last_package = common_data_last_package_i1_j1; } + if (max_common_data_last_package < common_data_last_package_i1_j2) { max_common_data_last_package = common_data_last_package_i1_j2; } + if (max_common_data_last_package < common_data_last_package_i2_j2) { max_common_data_last_package = common_data_last_package_i2_j2; } + if (max_common_data_last_package == common_data_last_package_i1_j2) + { + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + else if (max_common_data_last_package == common_data_last_package_i2_j2) + { + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + else if (max_common_data_last_package == common_data_last_package_i1_j1) + { + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + } + } + } + } + //~ printf("Fin U\n"); + //~ gettimeofday(&time_end_order_u_total, NULL); + //~ time_total_order_u_total += (time_end_order_u_total.tv_sec - time_start_order_u_total.tv_sec)*1000000LL + time_end_order_u_total.tv_usec - time_start_order_u_total.tv_usec; + + //~ gettimeofday(&time_start_merge, NULL); + + paquets_data->temp_pointer_1->data_weight = paquets_data->temp_pointer_1->data_weight + paquets_data->temp_pointer_2->data_weight - matrice_donnees_commune[i][j]; + + /* Mise à 0 pour ne pas re-merge ces tableaux */ + int j_bis; + for (j_bis = 0; j_bis < number_task; j_bis++) + { + matrice_donnees_commune[i][j_bis] = 0; matrice_donnees_commune[j_bis][i] = 0; + } + for (j_bis = 0; j_bis < number_task; j_bis++) + { + matrice_donnees_commune[j][j_bis] = 0; matrice_donnees_commune[j_bis][j] = 0; + } + + paquets_data->temp_pointer_1->split_last_ij = paquets_data->temp_pointer_1->nb_task_in_sub_list; + + /* Fusion des listes de tâches */ + paquets_data->temp_pointer_1->nb_task_in_sub_list += paquets_data->temp_pointer_2->nb_task_in_sub_list; + while (!starpu_task_list_empty(&paquets_data->temp_pointer_2->sub_list)) + { + starpu_task_list_push_back(&paquets_data->temp_pointer_1->sub_list, starpu_task_list_pop_front(&paquets_data->temp_pointer_2->sub_list)); + } + + int i_bis = 0; + j_bis = 0; + tab_runner = 0; + nb_duplicate_data = 0; + + /* Fusion des tableaux de données */ + starpu_data_handle_t *temp_data_tab = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data) * sizeof(paquets_data->temp_pointer_1->package_data[0])); + while (i_bis < paquets_data->temp_pointer_1->package_nb_data && j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + if 
(paquets_data->temp_pointer_1->package_data[i_bis] == paquets_data->temp_pointer_2->package_data[j_bis]) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + temp_data_tab[tab_runner + 1] = paquets_data->temp_pointer_2->package_data[j_bis]; + i_bis++; + j_bis++; + tab_runner++; + nb_duplicate_data++; + } + else if (paquets_data->temp_pointer_1->package_data[i_bis] < paquets_data->temp_pointer_2->package_data[j_bis]) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + i_bis++; + } + else + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; + j_bis++; + } + tab_runner++; + } + /* Remplissage en vidant les données restantes du paquet I ou J */ + while (i_bis < paquets_data->temp_pointer_1->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + i_bis++; + tab_runner++; + } + while (j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; + j_bis++; + tab_runner++; + } + //~ printf("Nb duplicate data = %d.\n", nb_duplicate_data); + + //~ for (i_bis = 0; i_bis < paquets_data->temp_pointer_1->package_nb_data; i_bis++) + //~ { + //~ printf("%p ", paquets_data->temp_pointer_1->package_data[i_bis]); + //~ } + //~ printf("\n"); + //~ for (i_bis = 0; i_bis < paquets_data->temp_pointer_2->package_nb_data; i_bis++) + //~ { + //~ printf("%p ", paquets_data->temp_pointer_2->package_data[i_bis]); + //~ } + //~ printf("\n"); + + /* Remplissage du tableau de données en ignorant les doublons */ + //~ printf("malloc dans le vrai de %d.\n", paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data - nb_duplicate_data); + paquets_data->temp_pointer_1->package_data = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data - nb_duplicate_data) * sizeof(starpu_data_handle_t)); + //~ paquets_data->temp_pointer_1->package_data = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data - nb_duplicate_data) * sizeof(paquets_data->temp_pointer_2->package_data[0])); + //~ printf("Apres le malloc.\n"); fflush(stdout); + j_bis = 0; + for (i_bis = 0; i_bis < (paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data); i_bis++) + { + //~ printf("getting %p.\n", temp_data_tab[i_bis]); + paquets_data->temp_pointer_1->package_data[j_bis] = temp_data_tab[i_bis]; + if (temp_data_tab[i_bis] == temp_data_tab[i_bis + 1]) + { + i_bis++; + } + j_bis++; + } + //~ printf("Avant fusion des chiffres.\n"); + /* Fusion du nombre de données et du temps prévu */ + paquets_data->temp_pointer_1->package_nb_data = paquets_data->temp_pointer_2->package_nb_data + paquets_data->temp_pointer_1->package_nb_data - nb_duplicate_data; + paquets_data->temp_pointer_1->expected_time += paquets_data->temp_pointer_2->expected_time; + + /* Il faut le mettre à 0 pour le suppr ensuite dans HFP_delete_link */ + paquets_data->temp_pointer_2->package_nb_data = 0; + + //~ nb_duplicate_data = 0; + + //~ gettimeofday(&time_end_merge, NULL); + //~ time_total_merge += (time_end_merge.tv_sec - time_start_merge.tv_sec)*1000000LL + time_end_merge.tv_usec - time_start_merge.tv_usec; + if(paquets_data->NP == number_of_package_to_build) { goto break_merging_1; } + //~ printf("Fin du merge.\n"); + break; + } + paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next; + } + } + 
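+			/* No merge happened for this candidate i (or the merge is done):
+			 * move to the next package i and rescan the packages j from the first link. */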
paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next; + paquets_data->temp_pointer_2 = paquets_data->first_link; + } + } + + break_merging_1: + //~ printf("break merging.\n"); + paquets_data->temp_pointer_1 = HFP_delete_link(paquets_data); + //~ printf("After delete %d.\n", paquets_data->NP); + /* Checking if we have the right number of packages. if MULTIGPU is equal to 0 we want only one package. if it is equal to 1 we want |GPU| packages */ + if (paquets_data->NP == number_of_package_to_build) + { + goto end_while_packaging_impossible; + } + else if (paquets_data->NP == 1) /* If we have only one package we don't have to do more packages */ + { + goto end_while_packaging_impossible; + } + else /* Reset number of packages for the matrix initialisation */ + { + number_task = paquets_data->NP; + //~ printf("la1.\n"); + } + + //~ if ((iteration == 3 && starpu_get_env_number_default("PRINT_TIME", 0) == 1) || starpu_get_env_number_default("PRINT_TIME", 0) == 2) + //~ { + //~ printf("la2.\n"); + //~ gettimeofday(&time_end_iteration_i, NULL); + //~ time_total_iteration_i = (time_end_iteration_i.tv_sec - time_start_iteration_i.tv_sec)*1000000LL + time_end_iteration_i.tv_usec - time_start_iteration_i.tv_usec; + //~ FILE *f = fopen("Output_maxime/HFP_iteration_time.txt", "a"); + //~ fprintf(f, "%d %lld\n", nb_of_loop, time_total_iteration_i); + //~ fclose(f); + //~ } + } /* End of while (packaging_impossible == 0) { */ + + end_while_packaging_impossible: + //~ if ((iteration == 3 && starpu_get_env_number_default("PRINT_TIME", 0) == 1) || starpu_get_env_number_default("PRINT_TIME", 0) == 2) + //~ { + //~ gettimeofday(&time_end_iteration_i, NULL); + //~ time_total_iteration_i = (time_end_iteration_i.tv_sec - time_start_iteration_i.tv_sec)*1000000LL + time_end_iteration_i.tv_usec - time_start_iteration_i.tv_usec; + //~ FILE *f = fopen("Output_maxime/HFP_iteration_time.txt", "a"); + //~ fprintf(f, "%d %lld\n", nb_of_loop, time_total_iteration_i); + //~ fclose(f); + //~ } + + /* Add tasks or packages that were not connexe */ + while(!starpu_task_list_empty(&non_connexe)) + { + starpu_task_list_push_back(&paquets_data->first_link->sub_list, starpu_task_list_pop_front(&non_connexe)); + paquets_data->first_link->nb_task_in_sub_list++; + } + //~ gettimeofday(&time_end_scheduling, NULL); + //~ time_total_scheduling += (time_end_scheduling.tv_sec - time_start_scheduling.tv_sec)*1000000LL + time_end_scheduling.tv_usec - time_start_scheduling.tv_usec; + + return paquets_data; +} + +/* TODO : attention ne fonctinne pas car non corrigé par rapport aux corrections ci dessus (la complexité, le fait + * de ne pas répéter le get_max_value_common_data_matrix, la première itration simplifié et le calcul des intersections + * pour la matrice en temps linéaire + */ +static struct starpu_task_list hierarchical_fair_packing_one_task_list (struct starpu_task_list task_list, int number_task) +{ + struct _starpu_HFP_paquets *paquets_data = malloc(sizeof(*paquets_data)); + struct _starpu_HFP_my_list *my_data = malloc(sizeof(*my_data)); + starpu_task_list_init(&my_data->sub_list); + my_data->next = NULL; + paquets_data->temp_pointer_1 = my_data; + paquets_data->first_link = paquets_data->temp_pointer_1; + + int number_of_package_to_build = 1; + struct starpu_task_list non_connexe; + starpu_task_list_init(&non_connexe); + int nb_duplicate_data = 0; long int weight_two_packages; /* Used to store the weight the merging of two packages would be. 
It is then used to see if it's inferior to the size of the RAM of the GPU */ + long int max_value_common_data_matrix = 0; /* Store the maximum weight of the commons data between two packages for all the tasks */ + long int common_data_last_package_i1_j1 = 0; /* Variables used to compare the affinity between sub package 1i and 1j, 1i and 2j etc... */ + long int common_data_last_package_i1_j2 = 0; long int common_data_last_package_i2_j1 = 0; + long int common_data_last_package_i2_j2 = 0; long int max_common_data_last_package = 0; + long int weight_package_i = 0; /* Used for ORDER_U too */ + long int weight_package_j = 0; int i = 0; + int bool_data_common = 0; int GPU_limit_switch = 1; int temp_nb_min_task_packages = 0; int i_bis = 0; int j_bis = 0; int tab_runner = 0; int index_head_1 = 0; int index_head_2 = 0; int common_data_last_package_i2_j = 0; int common_data_last_package_i1_j = 0; int common_data_last_package_i_j1 = 0; int common_data_last_package_i_j2 = 0; + int min_nb_task_in_sub_list = 0; int nb_min_task_packages = 0; + struct starpu_task *task; int nb_of_loop = 0; int packaging_impossible = 0; int link_index = 0; int NB_TOTAL_DONNEES = 0; + task = starpu_task_list_begin(&task_list); + paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + struct starpu_task *temp_task; + + //~ task = starpu_task_list_begin(&task_list); + //~ paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + /* One task == one link in the linked list */ + int do_not_add_more = number_task - 1; + for (task = starpu_task_list_begin(&task_list); task != starpu_task_list_end(&task_list); task = temp_task) + { + temp_task = starpu_task_list_next(task); + task = starpu_task_list_pop_front(&task_list); + + paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(task); j++) + { + paquets_data->temp_pointer_1->package_data[j] = STARPU_TASK_GET_HANDLE(task, j); + } + paquets_data->temp_pointer_1->package_nb_data = STARPU_TASK_GET_NBUFFERS(task); + NB_TOTAL_DONNEES+=STARPU_TASK_GET_NBUFFERS(task); + /* We sort our data in the packages */ + qsort(paquets_data->temp_pointer_1->package_data,paquets_data->temp_pointer_1->package_nb_data,sizeof(paquets_data->temp_pointer_1->package_data[0]),HFP_pointeurComparator); + /* Pushing the task and the number of the package in the package*/ + starpu_task_list_push_back(&paquets_data->temp_pointer_1->sub_list,task); + /* Initialization of the lists last_packages */ + paquets_data->temp_pointer_1->split_last_ij = 0; + link_index++; + paquets_data->temp_pointer_1->nb_task_in_sub_list=1; + + if(do_not_add_more != 0) + { + _starpu_HFP_insertion(paquets_data); + paquets_data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(task)*sizeof(paquets_data->temp_pointer_1->package_data[0])); + } + do_not_add_more--; + } + paquets_data->first_link = paquets_data->temp_pointer_1; + paquets_data->temp_pointer_2 = paquets_data->first_link; + index_head_2++; + + /* Matrix used to store all the common data weights between packages + int coordinate_visualization_matrix_size = N; + int coordinate_visualization_matrix[coordinate_visualization_matrix_size][coordinate_visualization_matrix_size]; + int 
coordinate_order_visualization_matrix[coordinate_visualization_matrix_size][coordinate_visualization_matrix_size]; + for (i_bis = 0; i_bis < N; i_bis++) { + for (j_bis = 0; j_bis < N; j_bis++) { + coordinate_visualization_matrix[j_bis][i_bis] = 0; + coordinate_order_visualization_matrix[j_bis][i_bis] = 0; + } + } */ + + /* if (_print_in_terminal == 1) { init_visualisation_tache_matrice_format_tex(); } */ + /* THE while loop. Stop when no more packaging are possible */ + while (packaging_impossible == 0) + { + beginning_while_packaging_impossible: + nb_of_loop++; + packaging_impossible = 1; + //~ if (_print_in_terminal == 1) { printf("############# Itération numéro : %d #############\n",nb_of_loop); } + + /* Variables we need to reinitialize for a new iteration */ + paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_2 = paquets_data->first_link; index_head_1 = 0; index_head_2 = 1; link_index = 0; tab_runner = 0; nb_min_task_packages = 0; + min_nb_task_in_sub_list = 0; weight_two_packages = 0; max_value_common_data_matrix = 0; long int matrice_donnees_commune[number_task][number_task]; + min_nb_task_in_sub_list = paquets_data->temp_pointer_1->nb_task_in_sub_list; + for (i = 0; i < number_task; i++) + { + int j; + for (j = 0; j < number_task; j++) + { + matrice_donnees_commune[i][j] = 0; + } + } + + /* First we get the number of packages that have the minimal number of tasks */ + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (min_nb_task_in_sub_list > paquets_data->temp_pointer_1->nb_task_in_sub_list) + { + min_nb_task_in_sub_list = paquets_data->temp_pointer_1->nb_task_in_sub_list; + } + } + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + if (min_nb_task_in_sub_list == paquets_data->temp_pointer_1->nb_task_in_sub_list) + { + nb_min_task_packages++; + } + } + //~ if (_print_in_terminal == 1) { printf("Il y a %d paquets de taille minimale %d tâches\n",nb_min_task_packages,min_nb_task_in_sub_list); } + /* Then we create the common data matrix */ + for (paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_1 != NULL; paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next) + { + for (paquets_data->temp_pointer_2 = paquets_data->temp_pointer_1->next; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + { + for (i = 0; i < paquets_data->temp_pointer_1->package_nb_data; i++) + { + int j; + for (j = 0; j < paquets_data->temp_pointer_2->package_nb_data; j++) + { + if ((paquets_data->temp_pointer_1->package_data[i] == paquets_data->temp_pointer_2->package_data[j])) + { + matrice_donnees_commune[index_head_1][index_head_2] += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[j]) + starpu_data_get_size(paquets_data->temp_pointer_1->package_data[i]); + matrice_donnees_commune[index_head_2][index_head_1] += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[j]) + starpu_data_get_size(paquets_data->temp_pointer_1->package_data[i]); + } + } + } + index_head_2++; + } + index_head_1++; + index_head_2 = index_head_1 + 1; + } + /* Code to print the common data matrix */ + //~ if (_print_in_terminal == 1) { printf("Common data matrix : \n"); for (i = 0; i < number_task; i++) { for (j = 0; j < number_task; j++) 
{ print_in_terminal (" %3li ",matrice_donnees_commune[i][j]); } printf("\n"); printf("---------\n"); }} + + /* Getting back to the beginning of the linked list */ + paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_2 = paquets_data->first_link; + + i_bis = 0; j_bis = 0; + temp_nb_min_task_packages = nb_min_task_packages; + debut_while: + paquets_data->temp_pointer_1 = paquets_data->first_link; + paquets_data->temp_pointer_2 = paquets_data->first_link; + max_value_common_data_matrix = 0; + if (GPU_limit_switch == 1) + { + for (i_bis = 0; i_bis < number_task; i_bis++) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + //Si on est sur un paquet de taille minimale + //~ printf("Sur le paquet minimal %d de %d data\n", i_bis, paquets_data->temp_pointer_1->package_nb_data); + for (paquets_data->temp_pointer_2 = paquets_data->first_link; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + { + //~ if (i_bis != j_bis && matrice_donnees_commune[i_bis][j_bis] != 0) { + if (i_bis != j_bis) + { + //~ printf("Sur le paquet %d de %d data\n", j_bis, paquets_data->temp_pointer_2->package_nb_data); + weight_two_packages = 0; + for (i = 0; i < paquets_data->temp_pointer_1->package_nb_data; i++) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_1->package_data[i]); + } + for (i = 0; i < paquets_data->temp_pointer_2->package_nb_data; i++) + { + bool_data_common = 0; + int j; + for (j = 0; j < paquets_data->temp_pointer_1->package_nb_data; j++) + { + if (paquets_data->temp_pointer_2->package_data[i] == paquets_data->temp_pointer_1->package_data[j]) + { + bool_data_common = 1; + } + } + if (bool_data_common != 1) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[i]); + } + } + if((max_value_common_data_matrix < matrice_donnees_commune[i_bis][j_bis]) && (weight_two_packages <= _starpu_HFP_GPU_RAM_M)) + { + max_value_common_data_matrix = matrice_donnees_commune[i_bis][j_bis]; + } + } + j_bis++; + } + tab_runner++; + } + paquets_data->temp_pointer_1=paquets_data->temp_pointer_1->next; + j_bis = 0; + } + paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_2 = paquets_data->first_link; + } + /* Else, we are using algo 5, so we don't check the max weight */ + else + { + for (i_bis = 0; i_bis < number_task; i_bis++) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { //Si on est sur un paquet de taille minimale + for (paquets_data->temp_pointer_2 = paquets_data->first_link; paquets_data->temp_pointer_2 != NULL; paquets_data->temp_pointer_2 = paquets_data->temp_pointer_2->next) + { + if (i_bis != j_bis) + { + weight_two_packages = 0; + for (i = 0; i < paquets_data->temp_pointer_1->package_nb_data; i++) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_1->package_data[i]); + } + for (i = 0; i < paquets_data->temp_pointer_2->package_nb_data; i++) + { + bool_data_common = 0; + int j; + for (j = 0; j < paquets_data->temp_pointer_1->package_nb_data; j++) + { + if (paquets_data->temp_pointer_2->package_data[i] == paquets_data->temp_pointer_1->package_data[j]) + { + bool_data_common = 1; + } + } + if (bool_data_common != 1) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[i]); + } + } + if(max_value_common_data_matrix < matrice_donnees_commune[i_bis][j_bis]) + { + 
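/* Remember the largest amount of data shared between a minimal package and any other package; in this branch (GPU_limit_switch == 0) the combined weight of the two packages is intentionally not compared against _starpu_HFP_GPU_RAM_M. */ +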
max_value_common_data_matrix = matrice_donnees_commune[i_bis][j_bis]; + } + } + j_bis++; + } + tab_runner++; + } + paquets_data->temp_pointer_1=paquets_data->temp_pointer_1->next; + j_bis = 0; + } + paquets_data->temp_pointer_1 = paquets_data->first_link; paquets_data->temp_pointer_2 = paquets_data->first_link; + } + //~ printf("la, max value = %ld, limit switch = %d\n", max_value_common_data_matrix, GPU_limit_switch); + if (max_value_common_data_matrix == 0 && GPU_limit_switch == 0) + { + /* It means that P_i share no data with others, so we put it in the end of the list + * For this we use a separate list that we merge at the end + * We will put this list at the end of the rest of the packages */ + //~ if (_print_in_terminal == 1) { printf("graphe non connexe\n"); } + while (paquets_data->temp_pointer_1->nb_task_in_sub_list != min_nb_task_in_sub_list) + { + paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next; + } + while (!starpu_task_list_empty(&paquets_data->temp_pointer_1->sub_list)) + { + starpu_task_list_push_back(&non_connexe,starpu_task_list_pop_front(&paquets_data->temp_pointer_1->sub_list)); + } + paquets_data->temp_pointer_1->package_nb_data = 0; + paquets_data->NP--; + } + + //~ if (max_value_common_data_matrix == 0 && GPU_limit_switch == 0) { + //~ /* It means that P_i share no data with others, so we put it in the end of the list + //~ * For this we use a separate list that we merge at the end + //~ * We will put this list at the end of the rest of the packages */ + //~ if (_print_in_terminal == 1) { printf("Graphe non connexe\n"); } + //~ while (data->p->temp_pointer_1->nb_task_in_sub_list != min_nb_task_in_sub_list) + //~ { + //~ data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + //~ } + //~ while (!starpu_task_list_empty(&data->p->temp_pointer_1->sub_list)) { + //~ starpu_task_list_push_back(&non_connexe, starpu_task_list_pop_front(&data->p->temp_pointer_1->sub_list)); + //~ } + //~ data->p->temp_pointer_1->package_nb_data = 0; + //~ data->p->NP--; + //~ } + else + { + i_bis = 0; j_bis = 0; + for (i = 0; i < number_task; i++) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + int j; + for (j = 0; j < number_task; j++) + { + weight_two_packages = 0; weight_package_i = 0; weight_package_j = 0; + for (i_bis = 0; i_bis < paquets_data->temp_pointer_1->package_nb_data; i_bis++) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_1->package_data[i_bis]); + } + weight_package_i = weight_two_packages; + for (i_bis = 0; i_bis < paquets_data->temp_pointer_2->package_nb_data; i_bis++) + { + bool_data_common = 0; + for (j_bis = 0; j_bis < paquets_data->temp_pointer_1->package_nb_data; j_bis++) + { + if (paquets_data->temp_pointer_2->package_data[i_bis] == paquets_data->temp_pointer_1->package_data[j_bis]) + { + bool_data_common = 1; + } + } + if (bool_data_common != 1) + { + weight_two_packages += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[i_bis]); + } + weight_package_j += starpu_data_get_size(paquets_data->temp_pointer_2->package_data[i_bis]); + } + if (matrice_donnees_commune[i][j] == max_value_common_data_matrix && i != j && max_value_common_data_matrix != 0) + { + if ((weight_two_packages <= _starpu_HFP_GPU_RAM_M) || (GPU_limit_switch == 0)) + { + /* Merge */ + packaging_impossible = 0; + //~ if (_print_in_terminal == 1) { printf("On va merge le paquet %d et le paquet %d. Ils ont %ld en commun. 
Ils ont %d et %d tâches.\n", i, j, max_value_common_data_matrix, paquets_data->temp_pointer_1->nb_task_in_sub_list, paquets_data->temp_pointer_2->nb_task_in_sub_list); } + + paquets_data->NP--; + + if (paquets_data->temp_pointer_2->nb_task_in_sub_list == min_nb_task_in_sub_list) + { + temp_nb_min_task_packages--; + } + + for (j_bis = 0; j_bis < number_task; j_bis++) + { + matrice_donnees_commune[i][j_bis] = 0; matrice_donnees_commune[j_bis][i] = 0; + } + for (j_bis = 0; j_bis < number_task; j_bis++) + { + matrice_donnees_commune[j][j_bis] = 0; + matrice_donnees_commune[j_bis][j] = 0; + } + + if (order_u == 1) + { + if (paquets_data->temp_pointer_1->nb_task_in_sub_list == 1 && paquets_data->temp_pointer_2->nb_task_in_sub_list == 1) + { + //~ if (_print_in_terminal == 1) { printf("I = 1 et J = 1\n"); } + } + else if (weight_package_i > _starpu_HFP_GPU_RAM_M && weight_package_j <= _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i1_j = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 0, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 0, false,_starpu_HFP_GPU_RAM_M); + if (common_data_last_package_i1_j > common_data_last_package_i2_j) + { + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + } + } + else if (weight_package_i <= _starpu_HFP_GPU_RAM_M && weight_package_j > _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 0, 1, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 0, 2, false,_starpu_HFP_GPU_RAM_M); + if (common_data_last_package_i_j2 > common_data_last_package_i_j1) + { + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + } + else + { + if (weight_package_i > _starpu_HFP_GPU_RAM_M && weight_package_j > _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i1_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 1, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i1_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 2, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 1, false,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 2, false,_starpu_HFP_GPU_RAM_M); + } + else if (weight_package_i <= _starpu_HFP_GPU_RAM_M && weight_package_j <= _starpu_HFP_GPU_RAM_M) + { + common_data_last_package_i1_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 1, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i1_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 1, 2, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j1 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 1, true,_starpu_HFP_GPU_RAM_M); + common_data_last_package_i2_j2 = get_common_data_last_package(paquets_data->temp_pointer_1, paquets_data->temp_pointer_2, 2, 2, true,_starpu_HFP_GPU_RAM_M); + } + else + { + printf("Erreur dans ordre U, aucun cas choisi\n"); + 
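/* Defensive guard: the four combinations of weight_package_i and weight_package_j being above or below _starpu_HFP_GPU_RAM_M are all handled above, so this point should be unreachable. */ +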
exit(0); + } + max_common_data_last_package = common_data_last_package_i2_j1; + if (max_common_data_last_package < common_data_last_package_i1_j1) + { + max_common_data_last_package = common_data_last_package_i1_j1; + } + if (max_common_data_last_package < common_data_last_package_i1_j2) + { + max_common_data_last_package = common_data_last_package_i1_j2; + } + if (max_common_data_last_package < common_data_last_package_i2_j2) + { + max_common_data_last_package = common_data_last_package_i2_j2; + } + if (max_common_data_last_package == common_data_last_package_i2_j1) + { + //~ if (_print_in_terminal == 1) { printf("Pas de switch\n"); } + } + else if (max_common_data_last_package == common_data_last_package_i1_j2) + { + //~ if (_print_in_terminal == 1) { printf("SWITCH PAQUET I ET J\n"); } + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + else if (max_common_data_last_package == common_data_last_package_i2_j2) + { + //~ if (_print_in_terminal == 1) { printf("SWITCH PAQUET J\n"); } + paquets_data->temp_pointer_2 = HFP_reverse_sub_list(paquets_data->temp_pointer_2); + } + else + { /* max_common_data_last_package == common_data_last_package_i1_j1 */ + //~ if (_print_in_terminal == 1) { printf("SWITCH PAQUET I\n"); } + paquets_data->temp_pointer_1 = HFP_reverse_sub_list(paquets_data->temp_pointer_1); + } + } + //~ if (_print_in_terminal == 1) { printf("Fin de l'ordre U sans doublons\n"); } + } + + paquets_data->temp_pointer_1->split_last_ij = paquets_data->temp_pointer_1->nb_task_in_sub_list; + while (!starpu_task_list_empty(&paquets_data->temp_pointer_2->sub_list)) + { + starpu_task_list_push_back(&paquets_data->temp_pointer_1->sub_list,starpu_task_list_pop_front(&paquets_data->temp_pointer_2->sub_list)); + paquets_data->temp_pointer_1->nb_task_in_sub_list ++; + } + i_bis = 0; j_bis = 0; tab_runner = 0; + starpu_data_handle_t *temp_data_tab = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data) * sizeof(paquets_data->temp_pointer_1->package_data[0])); + while (i_bis < paquets_data->temp_pointer_1->package_nb_data && j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + if (paquets_data->temp_pointer_1->package_data[i_bis] <= paquets_data->temp_pointer_2->package_data[j_bis]) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; + i_bis++; + } + else + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; + j_bis++; + } + tab_runner++; + } + while (i_bis < paquets_data->temp_pointer_1->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_1->package_data[i_bis]; i_bis++; tab_runner++; + } + while (j_bis < paquets_data->temp_pointer_2->package_nb_data) + { + temp_data_tab[tab_runner] = paquets_data->temp_pointer_2->package_data[j_bis]; j_bis++; tab_runner++; + } + for (i_bis = 0; i_bis < (paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data); i_bis++) + { + if (temp_data_tab[i_bis] == temp_data_tab[i_bis + 1]) + { + temp_data_tab[i_bis] = 0; + nb_duplicate_data++; + } + } + paquets_data->temp_pointer_1->package_data = malloc((paquets_data->temp_pointer_1->package_nb_data + paquets_data->temp_pointer_2->package_nb_data - nb_duplicate_data) * sizeof(starpu_data_handle_t)); + j_bis = 0; + for (i_bis = 0; i_bis < (paquets_data->temp_pointer_1->package_nb_data + 
paquets_data->temp_pointer_2->package_nb_data); i_bis++) + { + if (temp_data_tab[i_bis] != 0) + { + paquets_data->temp_pointer_1->package_data[j_bis] = temp_data_tab[i_bis]; j_bis++; + } + } + paquets_data->temp_pointer_1->package_nb_data = paquets_data->temp_pointer_2->package_nb_data + paquets_data->temp_pointer_1->package_nb_data - nb_duplicate_data; + + paquets_data->temp_pointer_1->expected_time += paquets_data->temp_pointer_2->expected_time; + + paquets_data->temp_pointer_2->package_nb_data = 0; + nb_duplicate_data = 0; + paquets_data->temp_pointer_2->nb_task_in_sub_list = 0; + temp_nb_min_task_packages--; + if(paquets_data->NP == number_of_package_to_build) + { + goto break_merging_1; + } + if (temp_nb_min_task_packages > 1) + { + goto debut_while; + } + else + { + j = number_task; i = number_task; + } + } + } + paquets_data->temp_pointer_2=paquets_data->temp_pointer_2->next; + } + } + paquets_data->temp_pointer_1=paquets_data->temp_pointer_1->next; paquets_data->temp_pointer_2=paquets_data->first_link; + } + } + + break_merging_1: + + paquets_data->temp_pointer_1 = paquets_data->first_link; + paquets_data->temp_pointer_1 = HFP_delete_link(paquets_data); + tab_runner = 0; + + /* Code to get the coordinates of each data in the order in which tasks get out of pull_task */ + while (paquets_data->temp_pointer_1 != NULL) + { + /* if ((strcmp(_starpu_HFP_appli,"starpu_sgemm_gemm") == 0) && (_print_in_terminal == 1)) { + for (task = starpu_task_list_begin(&paquets_data->temp_pointer_1->sub_list); task != starpu_task_list_end(&paquets_data->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task,2),2,temp_tab_coordinates); + coordinate_visualization_matrix[temp_tab_coordinates[0]][temp_tab_coordinates[1]] = NT - paquets_data->temp_pointer_1->index_package - 1; + coordinate_order_visualization_matrix[temp_tab_coordinates[0]][temp_tab_coordinates[1]] = tab_runner; + tab_runner++; + temp_tab_coordinates[0] = 0; temp_tab_coordinates[1] = 0; + } + } */ + link_index++; + paquets_data->temp_pointer_1 = paquets_data->temp_pointer_1->next; + } + /* if (_print_in_terminal == 1) { visualisation_tache_matrice_format_tex(coordinate_visualization_matrix,coordinate_order_visualization_matrix,nb_of_loop,link_index); } */ + + /* Checking if we have the right number of packages. if MULTIGPU is equal to 0 we want only one package. 
if it is equal to 1 we want |GPU| packages */ + if (link_index == number_of_package_to_build) + { + goto end_while_packaging_impossible; + } + + for (i = 0; i < number_task; i++) + { + int j; + for (j = 0; j < number_task; j++) + { + matrice_donnees_commune[i][j] = 0; + } + } + /* Reset number_task for the matrix initialisation */ + number_task = link_index; + /* If we have only one package we don't have to do more packages */ + if (number_task == 1) + { + packaging_impossible = 1; + } + } /* End of while (packaging_impossible == 0) { */ + /* We are in algorithm 3, we remove the size limit of a package */ + GPU_limit_switch = 0; goto beginning_while_packaging_impossible; + + end_while_packaging_impossible: + + /* Add tasks or packages that were not connexe */ + while(!starpu_task_list_empty(&non_connexe)) + { + starpu_task_list_push_back(&paquets_data->first_link->sub_list, starpu_task_list_pop_front(&non_connexe)); + paquets_data->first_link->nb_task_in_sub_list++; + } + + return paquets_data->first_link->sub_list; +} + +/* Check if our struct is empty */ +bool _starpu_HFP_is_empty(struct _starpu_HFP_my_list* a) +{ + if (a == NULL) + { + return true; + } + if (!starpu_task_list_empty(&a->sub_list)) + { + return false; + } + while (a->next != NULL) + { + a = a->next; + if (!starpu_task_list_empty(&a->sub_list)) + { + return false; + } + } + return true; +} + +/* TODO : a supprimer une fois les mesures du temps terminées */ +//~ struct timeval time_start_loadbalanceexpectedtime; +//~ struct timeval time_end_loadbalanceexpectedtime; +//~ long long time_total_loadbalanceexpectedtime = 0; + +/* Equilibrates package in order to have packages with the exact same expected task time + * Called in HFP_pull_task once all packages are done + * Used for MULTIGPU == 4 + */ +static void load_balance_expected_time(struct _starpu_HFP_paquets *a, int number_gpu) +{ + //~ gettimeofday(&time_start_loadbalanceexpectedtime, NULL); + + struct starpu_task *task; + double ite = 0; int i = 0; int index = 0; + int package_with_min_expected_time, package_with_max_expected_time; + double min_expected_time, max_expected_time, expected_time_to_steal = 0; + bool load_balance_needed = true; + //~ if (_print_in_terminal == 1) { printf("A package should have %f time +/- 5 percent\n", EXPECTED_TIME/number_gpu); } + /* Selecting the smallest and biggest package */ + while (load_balance_needed == true) + { + a->temp_pointer_1 = a->first_link; + min_expected_time = a->temp_pointer_1->expected_time; + max_expected_time = a->temp_pointer_1->expected_time; + package_with_min_expected_time = 0; + package_with_max_expected_time = 0; + i = 0; + a->temp_pointer_1 = a->temp_pointer_1->next; + while (a->temp_pointer_1 != NULL) + { + i++; + if (min_expected_time > a->temp_pointer_1->expected_time) + { + min_expected_time = a->temp_pointer_1->expected_time; + package_with_min_expected_time = i; + } + if (max_expected_time < a->temp_pointer_1->expected_time) + { + max_expected_time = a->temp_pointer_1->expected_time; + package_with_max_expected_time = i; + } + a->temp_pointer_1 = a->temp_pointer_1->next; + } + //~ if (_print_in_terminal == 1) { printf("min et max : %f et %f\n",min_expected_time, max_expected_time); } + //~ exit(0); + /* Stealing as much task from the last tasks of the biggest packages */ + //~ if (package_with_min_expected_time == package_with_max_expected_time || min_expected_time >= max_expected_time - ((5*max_expected_time)/100)) { + if (package_with_min_expected_time == package_with_max_expected_time) + { + //~ if 
(_print_in_terminal == 1) { printf("All packages have the same expected time\n"); } + load_balance_needed = false; + } + else + { + /* Getting on the right packages */ + a->temp_pointer_1 = a->first_link; + for (i = 0; i < package_with_min_expected_time; i++) + { + a->temp_pointer_1 = a->temp_pointer_1->next; + } + a->temp_pointer_2 = a->first_link; index = 0; + for (i = 0; i < package_with_max_expected_time; i++) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + index++; + } + if (a->temp_pointer_2->expected_time - ((EXPECTED_TIME/number_gpu) - a->temp_pointer_1->expected_time) >= EXPECTED_TIME/number_gpu) + { + //~ printf("if\n"); + expected_time_to_steal = (EXPECTED_TIME/number_gpu) - a->temp_pointer_1->expected_time; + } + else + { + //~ printf("else\n"); + expected_time_to_steal = a->temp_pointer_2->expected_time - EXPECTED_TIME/number_gpu; + } + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + if (starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0) > expected_time_to_steal) + { + //~ printf("task et expected : %f, %f\n",starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0), expected_time_to_steal); + starpu_task_list_push_back(&a->temp_pointer_2->sub_list, task); + break; + } + starpu_task_list_push_back(&a->temp_pointer_2->sub_list, task); + ite = 0; + + //Pour visu python. Pas implémenté dans load_balance et load_balance_expected_package_time + FILE *f = fopen("Output_maxime/Data_stolen_load_balance.txt", "a"); + + while (ite < expected_time_to_steal) + { + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + ite += starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_2->expected_time = a->temp_pointer_2->expected_time - starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + + //Pour visu python + if (_print_in_terminal == 1) + { + int temp_tab_coordinates[2]; + if (_print3d != 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, temp_tab_coordinates); + fprintf(f, "%d %d", temp_tab_coordinates[0], temp_tab_coordinates[1]); + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 0), 2, temp_tab_coordinates); + fprintf(f, " %d %d\n", temp_tab_coordinates[0], index); + } + else + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, temp_tab_coordinates); + fprintf(f, "%d %d %d\n", temp_tab_coordinates[0], temp_tab_coordinates[1], index); + } + } + + /* Merging */ + merge_task_and_package(a->temp_pointer_1, task); + a->temp_pointer_2->nb_task_in_sub_list--; + } + + fclose(f); + } + } + + //~ gettimeofday(&time_end_loadbalanceexpectedtime, NULL); + //~ time_total_loadbalanceexpectedtime += (time_end_loadbalanceexpectedtime.tv_sec - time_start_loadbalanceexpectedtime.tv_sec)*1000000LL + time_end_loadbalanceexpectedtime.tv_usec - time_start_loadbalanceexpectedtime.tv_usec; +} + +/* Equilibrates package in order to have packages with the exact same number of tasks +/-1 task + * Called in HFP_pull_task once all packages are done + */ +static void load_balance(struct _starpu_HFP_paquets *a, int number_gpu) +{ + int min_number_task_in_package, package_with_min_number_task, i, max_number_task_in_package, package_with_max_number_task, number_task_to_steal = 0; + bool load_balance_needed = true; + struct starpu_task *task = NULL; + //~ if (_print_in_terminal == 1){ printf("A package should have %d or %d tasks\n", NT/number_gpu, 
NT/number_gpu+1); } + /* Selecting the smallest and biggest package */ + while (load_balance_needed == true) + { + a->temp_pointer_1 = a->first_link; + min_number_task_in_package = a->temp_pointer_1->nb_task_in_sub_list; + max_number_task_in_package = a->temp_pointer_1->nb_task_in_sub_list; + package_with_min_number_task = 0; + package_with_max_number_task = 0; + i = 0; + a->temp_pointer_1 = a->temp_pointer_1->next; + while (a->temp_pointer_1 != NULL) + { + i++; + if (min_number_task_in_package > a->temp_pointer_1->nb_task_in_sub_list) + { + min_number_task_in_package = a->temp_pointer_1->nb_task_in_sub_list; + package_with_min_number_task = i; + } + if (max_number_task_in_package < a->temp_pointer_1->nb_task_in_sub_list) + { + max_number_task_in_package = a->temp_pointer_1->nb_task_in_sub_list; + package_with_max_number_task = i; + } + a->temp_pointer_1 = a->temp_pointer_1->next; + } + /* Stealing as much task from the last tasks of the biggest packages */ + if (package_with_min_number_task == package_with_max_number_task || min_number_task_in_package == max_number_task_in_package-1) + { + load_balance_needed = false; + } + else + { + a->temp_pointer_1 = a->first_link; + for (i = 0; i < package_with_min_number_task; i++) + { + a->temp_pointer_1 = a->temp_pointer_1->next; + } + a->temp_pointer_2 = a->first_link; + for (i = 0; i < package_with_max_number_task; i++) + { + a->temp_pointer_2 = a->temp_pointer_2->next; + } + if ((_starpu_HFP_NT/number_gpu) - a->temp_pointer_1->nb_task_in_sub_list == 0) + { + number_task_to_steal = 1; + } + else if (a->temp_pointer_2->nb_task_in_sub_list - ((_starpu_HFP_NT/number_gpu) - a->temp_pointer_1->nb_task_in_sub_list) >= _starpu_HFP_NT/number_gpu) + { + number_task_to_steal = (_starpu_HFP_NT/number_gpu) - a->temp_pointer_1->nb_task_in_sub_list; + } + else + { + number_task_to_steal = a->temp_pointer_2->nb_task_in_sub_list - _starpu_HFP_NT/number_gpu; + } + for (i = 0; i < number_task_to_steal; i++) + { + task = starpu_task_list_pop_back(&a->temp_pointer_2->sub_list); + merge_task_and_package(a->temp_pointer_1, task); + a->temp_pointer_2->expected_time -= starpu_task_expected_length(task, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + a->temp_pointer_2->nb_task_in_sub_list--; + } + } + } +} + +/* Printing in a .tex file the number of GPU that a data is used in. + * With red = 255 if it's on GPU 1, blue if it's on GPU 2 and green on GPU 3. + * Thus it only work for 3 GPUs. + * Also print the number of use in each GPU. 
+ * TODO : Faire marcher cette fonction avec n GPUs + */ +//static void visualisation_data_gpu_in_file_hfp_format_tex(struct _starpu_HFP_paquets *p) +//{ +// struct starpu_task *task; +// int i = 0; +// int j = 0; +// int k = 0; +// int red, green, blue; +// int temp_tab_coordinates[2]; +// FILE *f = fopen("Output_maxime/Data_in_gpu_HFP.tex", "w"); +// fprintf(f, "\\documentclass{article}\\usepackage{diagbox}\\usepackage{color}\\usepackage{fullpage}\\usepackage{colortbl}\\usepackage{caption}\\usepackage{subcaption}\\usepackage{float}\\usepackage{graphics}\\begin{document}\n\n\n\\begin{figure}[H]\n"); +// int data_use_in_gpus[_starpu_HFP_N*2][_nb_gpus + 1]; +// for (j = 0; j < 2; j++) +// { +// printf("premier for\n"); +// for (i = 0; i < _starpu_HFP_N*2; i++) +// { +// for (k = 0; k < _nb_gpus + 1; k++) +// { +// data_use_in_gpus[i][k] = 0; +// } +// } +// fprintf(f, "\\begin{subfigure}{.5\\textwidth}\\centering\\begin{tabular}{|"); +// for (i = 0; i < _starpu_HFP_N; i++) +// { +// fprintf(f,"c|"); +// } +// fprintf(f,"c|}\\hline\\diagbox{GPUs}{Data}&"); +// p->temp_pointer_1 = p->first_link; +// i = 0; +// while (p->temp_pointer_1 != NULL) +// { +// for (task = starpu_task_list_begin(&p->temp_pointer_1->sub_list); task != starpu_task_list_end(&p->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) +// { +// starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task,2),2,temp_tab_coordinates); +// data_use_in_gpus[temp_tab_coordinates[j]][i]++; +// } +// p->temp_pointer_1 = p->temp_pointer_1->next; +// i++; +// } +// for (i = 0; i < _starpu_HFP_N - 1; i++) +// { +// red = 0; +// green = 0; +// blue = 0; +// for (k = 0; k < _nb_gpus; k++) +// { +// if (data_use_in_gpus[i][k] != 0) +// { +// if (k%3 == 0) +// { +// red = 1; +// } +// if (k%3 == 1) +// { +// green = 1; +// } +// if (k%3 == 2) +// { +// blue = 1; +// } +// } +// } +// fprintf(f,"\\cellcolor[RGB]{%d,%d,%d}%d&", red*255, green*255, blue*255, i); +// } +// red = 0; +// green = 0; +// blue = 0; +// for (k = 0; k < _nb_gpus; k++) +// { +// if (data_use_in_gpus[_starpu_HFP_N - 1][k] != 0) +// { +// if (k%3 == 0) +// { +// red = 1; +// } +// if (k%3 == 1) +// { +// green = 1; +// } +// if (k%3 == 2) +// { +// blue = 1; +// } +// } +// } +// fprintf(f,"\\cellcolor[RGB]{%d,%d,%d}%d\\\\\\hline", red*255, green*255, blue*255, _starpu_HFP_N - 1); +// for (i = 0; i < _nb_gpus; i++) +// { +// red = 0; +// green = 0; +// blue = 0; +// if (i%3 == 0) +// { +// red = 1; +// } +// if (i%3 == 1) +// { +// green = 1; +// } +// if (i%3 == 2) +// { +// blue = 1; +// } +// fprintf(f, " \\cellcolor[RGB]{%d,%d,%d}GPU %d&", red*255, green*255, blue*255, i); +// for (k = 0; k < _starpu_HFP_N - 1; k++) +// { +// fprintf(f, "%d&", data_use_in_gpus[k][i]); +// } +// fprintf(f, "%d\\\\\\hline", data_use_in_gpus[_starpu_HFP_N - 1][i]); +// } +// fprintf(f, "\\end{tabular}\\caption{Data from matrix "); +// if (j == 0) +// { +// fprintf(f, "A"); +// } +// else +// { +// fprintf(f, "B"); +// } +// fprintf(f, "}\\end{subfigure}\n\n"); +// } +// fprintf(f, "\\caption{Number of use of a data in each GPU}\\end{figure}\n\n\n\\end{document}"); +// fclose(f); +//} + +/* Print the order in one file for each GPU and also print in a tex file the coordinate for 2D matrix, before the ready, so it's only planned */ +static void print_order_in_file_hfp(struct _starpu_HFP_paquets *p) +{ + char str[2]; + unsigned i = 0; + int size = 0; + char *path = NULL; + + p->temp_pointer_1 = p->first_link; + struct starpu_task *task; + while (p->temp_pointer_1 != NULL) + { + 
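/* One output file per package: str only has room for a single digit plus the terminating NUL, so this naming scheme assumes fewer than 10 packages (i.e. fewer than 10 GPUs). */ +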
sprintf(str, "%d", i); + size = strlen("Output_maxime/Task_order_HFP_") + strlen(str); + path = malloc(sizeof(char)*size); + strcpy(path, "Output_maxime/Task_order_HFP_"); + strcat(path, str); + FILE *f = fopen(path, "w"); + for (task = starpu_task_list_begin(&p->temp_pointer_1->sub_list); task != starpu_task_list_end(&p->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) + { + fprintf(f, "%p\n",task); + } + p->temp_pointer_1 = p->temp_pointer_1->next; + i++; + fclose(f); + } + if (_print_in_terminal == 1 && (strcmp(_starpu_HFP_appli,"starpu_sgemm_gemm") == 0)) + { + i = 0; + p->temp_pointer_1 = p->first_link; + FILE *f = fopen("Output_maxime/Data_coordinates_order_last_HFP.txt", "w"); + int temp_tab_coordinates[2]; + while (p->temp_pointer_1 != NULL) + { + for (task = starpu_task_list_begin(&p->temp_pointer_1->sub_list); task != starpu_task_list_end(&p->temp_pointer_1->sub_list); task = starpu_task_list_next(task)) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task,2),2,temp_tab_coordinates); + fprintf(f, "%d %d %d\n", temp_tab_coordinates[0], temp_tab_coordinates[1], i); + } + p->temp_pointer_1 = p->temp_pointer_1->next; + i++; + } + fclose(f); + //visualisation_tache_matrice_format_tex("HFP"); + } +} + +/* Attention, la dedans je vide la liste l. Et donc si tu lui donne sched_list et que + * derrière t'essaye de la lire comme je fesais dans MST, et bah ca va crasher. + * Aussi si tu lance hMETIS dans un do_schedule, attention de bien mettre do_schedule_done à true + * et de sortir de la fonction avec un return;. + */ +void _starpu_hmetis_scheduling(struct _starpu_HFP_paquets *p, struct starpu_task_list *l, int nb_gpu) +{ + FILE *f1 = fopen("Output_maxime/temp_input_hMETIS.txt", "w+"); + _starpu_HFP_NT = 0; + struct starpu_task *task_1; + int NT = 0; + bool first_write_on_line = true, already_counted = false; + int index_task_1 = 1; int index_task_2 = 0; int number_hyperedge = 0; + + for (task_1 = starpu_task_list_begin(l); task_1 != starpu_task_list_end(l); task_1 = starpu_task_list_next(task_1)) + { + //~ printf("Tâche : %p\n", task_1); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task_1); i++) + { + struct starpu_task *task_3; + task_3 = starpu_task_list_begin(l); + already_counted = false; + int k; + for (k = 1; k < index_task_1; k++) + { + unsigned m; + for (m = 0; m < STARPU_TASK_GET_NBUFFERS(task_3); m++) + { + if (STARPU_TASK_GET_HANDLE(task_1, i) == STARPU_TASK_GET_HANDLE(task_3, m)) + { + already_counted = true; + break; + } + } + if (already_counted == true) + { + break; + } + task_3 = starpu_task_list_next(task_3); + } + if (already_counted == false) + { + first_write_on_line = true; + index_task_2 = index_task_1 + 1; + struct starpu_task *task_2; + for (task_2 = starpu_task_list_next(task_1); task_2 != starpu_task_list_end(l); task_2 = starpu_task_list_next(task_2)) + { + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(task_2); j++) + { + if (STARPU_TASK_GET_HANDLE(task_1, i) == STARPU_TASK_GET_HANDLE(task_2, j)) + { + if (first_write_on_line == true) + { + first_write_on_line = false; + fprintf(f1, "%d %d", index_task_1, index_task_2); + number_hyperedge++; + } + else + { + fprintf(f1, " %d", index_task_2); + } + } + } + index_task_2++; + } + if (first_write_on_line == false) + { + fprintf(f1, "\n"); + } + } + } + index_task_1++; + NT++; + } + _starpu_HFP_N = sqrt(NT); + if(_print3d == 1) + { + _starpu_HFP_N = _starpu_HFP_N/2; /* So i can print just like a 2D matrix */ + } + /* Printing expected time of each task */ + for 
(task_1 = starpu_task_list_begin(l); task_1 != starpu_task_list_end(l); task_1 = starpu_task_list_next(task_1)) + { + fprintf(f1, "%f\n", starpu_task_expected_length(task_1, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0)); + } + /* Printing information for hMETIS on the first line */ + FILE *f_3 = fopen("Output_maxime/input_hMETIS.txt", "w+"); + fprintf(f_3, "%d %d 10\n", number_hyperedge, NT); /* Number of hyperedges, number of task, 10 for weighted vertices but non weighted */ + char ch; + rewind(f1); + while ((ch = fgetc(f1)) != EOF) + fputc(ch, f_3); + fclose(f1); + fclose(f_3); + //TODO : remplacer le 3 par nb_gpu ici + //TODO tester différents paramètres de hmetis et donc modifier ici + FILE *f2 = fopen("Output_maxime/hMETIS_parameters.txt", "r"); + + //~ Nparts : nombre de paquets. + //~ UBfactor : 1 - 49, a tester. Déséquilibre autorisé. + //~ Nruns : 1 - inf, a tester. 1 par défaut. Plus on test plus ce sera bon mais ce sera plus long. + //~ CType : 1 - 5, a tester. 1 par défaut. + //~ RType : 1 - 3, a tester. 1 par défaut. + //~ Vcycle : 1. Sélectionne la meilleure des Nruns. + //~ Reconst : 0 - 1, a tester. 0 par défaut. Normalement ca ne devrait rien changer car ca joue juste sur le fait de reconstruire les hyperedges ou non. + //~ dbglvl : 0. Sert à montrer des infos de debug; Si besoin mettre (1, 2 ou 4). + + int size = strlen("../these_gonthier_maxime/hMETIS/hmetis-1.5-linux/hmetis Output_maxime/input_hMETIS.txt_"); + char buffer[100]; + while (fscanf(f2, "%s", buffer) == 1) + { + size += sizeof(buffer); + } + rewind(f2); + char *system_call = (char *)malloc(size); + strcpy(system_call, "../these_gonthier_maxime/hMETIS/hmetis-1.5-linux/hmetis Output_maxime/input_hMETIS.txt"); + while (fscanf(f2, "%s", buffer)== 1) + { + strcat(system_call, " "); + strcat(system_call, buffer); + } + //~ printf("System call will be: %s\n", system_call); + int cr = system(system_call); + if (cr != 0) + { + printf("Error when calling system(../these_gonthier_maxime/hMETIS/hmetis-1.5-linux/hmetis\n"); + exit(0); + } + starpu_task_list_init(&p->temp_pointer_1->refused_fifo_list); + int i; + for (i = 1; i < nb_gpu; i++) + { + _starpu_HFP_insertion(p); + starpu_task_list_init(&p->temp_pointer_1->refused_fifo_list); + } + p->first_link = p->temp_pointer_1; + char str[2]; + sprintf(str, "%d", nb_gpu); + size = strlen("Output_maxime/input_hMETIS.txt.part.") + strlen(str); + char *path2 = (char *)malloc(size); + strcpy(path2, "Output_maxime/input_hMETIS.txt.part."); + strcat(path2, str); + FILE *f_2 = fopen(path2, "r"); + int number; int error; + for (i = 0; i < NT; i++) + { + error = fscanf(f_2, "%d", &number); + if (error == 0) + { + printf("error fscanf in hMETIS\n"); exit(0); + } + p->temp_pointer_1 = p->first_link; + int j; + for (j = 0; j < number; j++) + { + p->temp_pointer_1 = p->temp_pointer_1->next; + } + task_1 = starpu_task_list_pop_front(l); + p->temp_pointer_1->expected_time += starpu_task_expected_length(task_1, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + starpu_task_list_push_back(&p->temp_pointer_1->sub_list, task_1); + p->temp_pointer_1->nb_task_in_sub_list++; + } + fclose(f_2); + + /* Apply HFP on each package if we have the right option */ + if (_starpu_HFP_hmetis == 2) + { + if (_print_in_terminal == 1) + { + i = 0; + p->temp_pointer_1 = p->first_link; + FILE *f = fopen("Output_maxime/Data_coordinates_order_last_hMETIS.txt", "w"); + int temp_tab_coordinates[2]; + while (p->temp_pointer_1 != NULL) + { + for (task_1 = 
starpu_task_list_begin(&p->temp_pointer_1->sub_list); task_1 != starpu_task_list_end(&p->temp_pointer_1->sub_list); task_1 = starpu_task_list_next(task_1)) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task_1,2),2,temp_tab_coordinates); + fprintf(f, "%d %d %d\n", temp_tab_coordinates[0], temp_tab_coordinates[1], i); + } + p->temp_pointer_1 = p->temp_pointer_1->next; + i++; + } + fclose(f); + //visualisation_tache_matrice_format_tex("hMETIS"); /* So I can get the matrix visualisation before tempering it with HFP */ + } + p->temp_pointer_1 = p->first_link; + for (i = 0; i < nb_gpu; i++) + { + p->temp_pointer_1->sub_list = hierarchical_fair_packing_one_task_list(p->temp_pointer_1->sub_list, p->temp_pointer_1->nb_task_in_sub_list); + p->temp_pointer_1 = p->temp_pointer_1->next; + } + } +} + +/* Attention, la dedans je vide la liste l. Et donc si tu lui donne sched_list et que + * derrière t'essaye de la lire comme je fesais dans MST, et bah ca va crasher. + * Aussi si tu lance hMETIS dans un do_schedule, attention de bien mettre do_schedule_done à true + * et de sortir de la fonction avec un return;. + * + * CECI EST LA FONCTION POUR QUAND J'AI DEJA LE FICHIER input DE PRET CAR JE L'AI FAIS A l'AVANCE POUR GRID5K PAR EXEMPLE! + */ +static void hmetis_input_already_generated(struct _starpu_HFP_paquets *p, struct starpu_task_list *l, int nb_gpu) +{ + int random_task_order = starpu_get_env_number_default("RANDOM_TASK_ORDER", 0); + sparse_matrix = starpu_get_env_number_default("SPARSE_MATRIX", 0); + + _starpu_HFP_NT = starpu_task_list_size(l); + int i = 0; struct starpu_task *task_1; + int j = 0; + + _starpu_HFP_N = hmetis_n; + if (_starpu_HFP_N == 0) + { + printf("Error, please precis env var HMETIS_N.\n"); + exit(0); + } + int size = 0; + starpu_task_list_init(&p->temp_pointer_1->refused_fifo_list); + for (i = 1; i < nb_gpu; i++) + { + _starpu_HFP_insertion(p); + starpu_task_list_init(&p->temp_pointer_1->refused_fifo_list); + } + p->first_link = p->temp_pointer_1; + + char str[2]; + char Nchar[4]; + sprintf(str, "%d", nb_gpu); + sprintf(Nchar, "%d", _starpu_HFP_N); + if (random_task_order == 1) + { + size = strlen("Output_maxime/Data/input_hMETIS/") + strlen(str) + strlen("GPU_Random_task_order/input_hMETIS_N") + strlen(Nchar) + strlen(".txt"); + } + else if (_starpu_HFP_hmetis == 5) /* Cas matrice 3D */ + { + size = strlen("Output_maxime/Data/input_hMETIS/") + strlen(str) + strlen("GPU_Matrice3D/input_hMETIS_N") + strlen(Nchar) + strlen(".txt"); + } + else if (_starpu_HFP_hmetis == 6) /* Cas cholesky */ + { + size = strlen("Output_maxime/Data/input_hMETIS/") + strlen(str) + strlen("GPU_Cholesky/input_hMETIS_N") + strlen(Nchar) + strlen(".txt"); + } + else + { + size = strlen("Output_maxime/Data/input_hMETIS/") + strlen(str) + strlen("GPU/input_hMETIS_N") + strlen(Nchar) + strlen(".txt"); + } + + /* Cas sparse */ + if (sparse_matrix != 0) + { + size += strlen("_sparse"); + } + + char *path2 = (char *)malloc(size); + strcpy(path2, "Output_maxime/Data/input_hMETIS/"); + strcat(path2, str); + + if (random_task_order == 1) + { + strcat(path2, "GPU_Random_task_order"); + } + else if (_starpu_HFP_hmetis == 5) /* Cas matrice 3D */ + { + strcat(path2, "GPU_Matrice3D"); + } + else if (_starpu_HFP_hmetis == 6) /* Cas Cholesky */ + { + strcat(path2, "GPU_Cholesky"); + } + else + { + strcat(path2, "GPU"); + } + + /* Cas sparse */ + if (sparse_matrix != 0) + { + strcat(path2, "_sparse/input_hMETIS_N"); + } + else + { + strcat(path2, "/input_hMETIS_N"); + } + + strcat(path2, Nchar); + 
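/* The resulting path has the form Output_maxime/Data/input_hMETIS/<nb_gpu>GPU[_Random_task_order|_Matrice3D|_Cholesky][_sparse]/input_hMETIS_N<N>.txt; the partition file is assumed to have been generated beforehand (e.g. for runs on Grid'5000). */ +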
strcat(path2, ".txt"); + FILE *f_2 = fopen(path2, "r"); + int number; int error; + for (i = 0; i < _starpu_HFP_NT; i++) + { + error = fscanf(f_2, "%d", &number); + if (error == 0) + { + printf("error fscanf in hMETIS input already generated\n"); + exit(0); + } + p->temp_pointer_1 = p->first_link; + for (j = 0; j < number; j++) + { + p->temp_pointer_1 = p->temp_pointer_1->next; + } + task_1 = starpu_task_list_pop_front(l); + p->temp_pointer_1->expected_time += starpu_task_expected_length(task_1, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + p->temp_pointer_1->expected_package_computation_time += starpu_task_expected_length(task_1, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); + p->temp_pointer_1->nb_task_in_sub_list++; + starpu_task_list_push_back(&p->temp_pointer_1->sub_list, task_1); + } + fclose(f_2); +} + +static void init_visualisation(struct _starpu_HFP_paquets *a) +{ + print_order_in_file_hfp(a); + //~ if (multigpu != 0 && (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") == 0)) + //~ { + //~ visualisation_data_gpu_in_file_hfp_format_tex(a); + //~ } + //TODO corriger la manière dont je vide si il y a plus de 3 GPUs + FILE *f = fopen("Output_maxime/Task_order_effective_0", "w"); /* Just to empty it before */ + fclose(f); + f = fopen("Output_maxime/Task_order_effective_1", "w"); /* Just to empty it before */ + fclose(f); + f = fopen("Output_maxime/Task_order_effective_2", "w"); /* Just to empty it before */ + fclose(f); + f = fopen("Output_maxime/Data_coordinates_order_last_scheduler.txt", "w"); + fclose(f); +} + +/* The function that sort the tasks in packages */ +static struct starpu_task *HFP_pull_task(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + //~ printf("Début de HFP_pull_task\n"); fflush(stdout); + struct _starpu_HFP_sched_data *data = component->data; + int i = 0; + struct starpu_task *task1 = NULL; + + if (_starpu_HFP_do_schedule_done == true) + { + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + /* If one or more task have been refused */ + data->p->temp_pointer_1 = data->p->first_link; + if (data->p->temp_pointer_1->next != NULL) + { + for (i = 0; i < _nb_gpus; i++) + { + if (to == component->children[i]) + { + break; + } + else + { + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + } + } + //~ if (i == _nb_gpus) /* To return NULL to the cpus */ + //~ { + //~ STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + //~ return NULL; + //~ } + if (!starpu_task_list_empty(&data->p->temp_pointer_1->refused_fifo_list)) + { + //~ printf("refused not empty.\n"); + task1 = starpu_task_list_pop_back(&data->p->temp_pointer_1->refused_fifo_list); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + //~ printf("Return in pull_task because data->p->temp_pointer_1->refused_fifo_list not empty %p.\n", task1); + return task1; + } + + /* If the linked list is empty */ + if (_starpu_HFP_is_empty(data->p->first_link) == true) + { + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + //~ printf("Return NULL linked list empty.\n"); + return NULL; + } + //~ printf("go to get task to return.\n"); + task1 = _starpu_HFP_get_task_to_return(component, to, data->p, _nb_gpus); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + //~ printf("Return in pull_task %p.\n", task1); + return task1; + } + //~ printf("Return NULL.\n"); + return NULL; +} + +static int HFP_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to) +{ + struct _starpu_HFP_sched_data *data = component->data; + int didwork = 0; 
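+	/* Try to push our tasks to the component below; a task that gets refused is stored in the refused_fifo_list of the GPU it was intended for, so that HFP_pull_task can hand it back first. */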
+ int i = 0; + + struct starpu_task *task; + task = starpu_sched_component_pump_to(component, to, &didwork); + + if (task) + { + //~ printf("%p failed\n"); + /* Oops, we couldn't push everything, put back this task */ + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + data->p->temp_pointer_1 = data->p->first_link; + int nb_gpu = _get_number_GPU(); + if (data->p->temp_pointer_1->next == NULL) + { + starpu_task_list_push_back(&data->p->temp_pointer_1->refused_fifo_list, task); + } + else + { + //A corriger. En fait il faut push back dans une fifo a part puis pop back dans cette fifo dans pull task + //Ici le pb c'est si plusieurs taches se font refusé + for (i = 0; i < nb_gpu; i++) + { + if (to == component->children[i]) + { + break; + } + else + { + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + } + starpu_task_list_push_back(&data->p->temp_pointer_1->refused_fifo_list, task); + } + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + } + else + { + /* Can I uncomment this part ? */ + //~ { + //~ if (didwork) + //~ if (_print_in_terminal == 1) { fprintf(stderr, "pushed some tasks to %p\n", to); } + //~ else + //~ if (_print_in_terminal == 1) { fprintf(stderr, "I didn't have anything for %p\n", to); } + //~ } + } + + /* There is room now */ + return didwork || starpu_sched_component_can_push(component, to); +} + +static int HFP_can_pull(struct starpu_sched_component * component) +{ + return starpu_sched_component_can_pull(component); +} + +/* Fonction qui va appeller le scheduling en fonction multigpu et de hmetis. On peut ignorer son temps dans xgemm directement */ +static void HFP_do_schedule(struct starpu_sched_component *component) +{ + struct _starpu_HFP_sched_data *data = component->data; + struct starpu_task *task1 = NULL; + int nb_of_loop = 0; /* Number of iteration of the while loop */ + int number_of_package_to_build = 0; + number_of_package_to_build = _get_number_GPU(); /* Getting the number of GPUs */ + _starpu_HFP_GPU_RAM_M = (starpu_memory_get_total(starpu_worker_get_memory_node(starpu_bitmap_first(&component->workers_in_ctx)))); /* Here we calculate the size of the RAM of the GPU. We allow our packages to have half of this size */ + + /* If the linked list is empty, we can pull more tasks */ + if (_starpu_HFP_is_empty(data->p->first_link) == true) + { + if (!starpu_task_list_empty(&data->sched_list)) + { + /* Si la liste initiale (sched_list) n'est pas vide, ce sont des tâches non traitées */ + EXPECTED_TIME = 0; + _starpu_HFP_appli = starpu_task_get_name(starpu_task_list_begin(&data->sched_list)); + + if (_starpu_HFP_hmetis != 0) + { + if (_starpu_HFP_hmetis == 3 || _starpu_HFP_hmetis == 4 || _starpu_HFP_hmetis == 5 || _starpu_HFP_hmetis == 6) + { + hmetis_input_already_generated(data->p, &data->sched_list, number_of_package_to_build); + } + else + { + _starpu_hmetis_scheduling(data->p, &data->sched_list, number_of_package_to_build); + } + if (_print_in_terminal == 1) + { + init_visualisation(data->p); + } + _starpu_HFP_do_schedule_done = true; + return; + } + + /* Pulling all tasks and counting them + * TODO : pas besoin de faire ca on peut faire size. 
Du coup faut suppr popped task list et la remplacer par sched list + */ + + struct starpu_task_list *temp_task_list = starpu_task_list_new(); + starpu_task_list_init(temp_task_list); + + _starpu_HFP_NT = starpu_task_list_size(&data->sched_list); + while (!starpu_task_list_empty(&data->sched_list)) + { + task1 = starpu_task_list_pop_front(&data->sched_list); + if (_print_in_terminal != 0) + { + printf("Tâche %p, %d donnée(s) : ",task1, STARPU_TASK_GET_NBUFFERS(task1)); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task1); i++) + { + printf("%p ",STARPU_TASK_GET_HANDLE(task1, i)); + } + printf("\n"); + } + if (multigpu != 0) { EXPECTED_TIME += starpu_task_expected_length(task1, starpu_worker_get_perf_archtype(STARPU_CUDA_WORKER, 0), 0); } + + //~ starpu_task_list_push_back(&data->popped_task_list, task1); + starpu_task_list_push_back(temp_task_list, task1); + } + _starpu_HFP_N = sqrt(_starpu_HFP_NT); + + if(_print3d == 1) + { + _starpu_HFP_N = _starpu_HFP_N/2; /* So i can print just like a 2D matrix */ + } + data->p->NP = _starpu_HFP_NT; + + //~ task1 = starpu_task_list_begin(&data->popped_task_list); + //~ printf("%p\n", task1); + //~ data->p = hierarchical_fair_packing(data->popped_task_list, NT, number_of_package_to_build); + data->p = hierarchical_fair_packing(temp_task_list, _starpu_HFP_NT, number_of_package_to_build); + + /* Printing in terminal and also visu python */ + if (_print_in_terminal == 1) + { + _sched_visu_print_packages_in_terminal(data->p, nb_of_loop, "After first execution of HFP we have ---\n"); + int i = 0; + int j = 0; + int temp_tab_coordinates[2]; + FILE *f_last_package = fopen("Output_maxime/last_package_split.txt", "w"); + data->p->temp_pointer_1 = data->p->first_link; + int sub_package = 0; + + while (data->p->temp_pointer_1 != NULL) + { + //~ printf("Début while.\n"); + j = 1; + for (task1 = starpu_task_list_begin(&data->p->temp_pointer_1->sub_list); task1 != starpu_task_list_end(&data->p->temp_pointer_1->sub_list); task1 = starpu_task_list_next(task1)) + { + //~ _print_in_terminal ("On %p in printing.\n", task1); + /* + 1 cause it's the next one that is in the other sub package */ + if (j == data->p->temp_pointer_1->split_last_ij + 1) + { + sub_package++; + } + if (_print3d != 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task1, 2), 2, temp_tab_coordinates); + fprintf(f_last_package, "%d %d", temp_tab_coordinates[0], temp_tab_coordinates[1]); + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task1, 0), 2, temp_tab_coordinates); + fprintf(f_last_package, " %d %d %d\n", temp_tab_coordinates[0], i, sub_package); + } + else + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task1, 2), 2, temp_tab_coordinates); + /* Printing X Y GPU SUBPACKAGE(1 - NSUBPACKAGES) */ + fprintf(f_last_package, "%d %d %d %d\n", temp_tab_coordinates[0], temp_tab_coordinates[1], i, sub_package); + } + j++; + } + sub_package++; + i++; + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + //~ printf("Next.\n"); + } + fclose(f_last_package); + //~ printf("End of printing1.\n"); fflush(stdout); + } + + /* Task stealing based on the number of tasks. 
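Each package is topped up to roughly NT/number_gpu tasks by stealing the last tasks of the biggest package (see load_balance() above).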
Only in cases of multigpu */ + if (multigpu == 2 || multigpu == 3) + { + load_balance(data->p, number_of_package_to_build); + _sched_visu_print_packages_in_terminal(data->p, nb_of_loop, "After load balance we have ---\n"); + } + else if (multigpu == 4 || multigpu == 5) /* Task stealing with expected time of each task */ + { + load_balance_expected_time(data->p, number_of_package_to_build); + _sched_visu_print_packages_in_terminal(data->p, nb_of_loop, "After load balance we have with expected time ---\n"); + } + else if (multigpu == 6 || multigpu == 7) + { + /* Task stealing with expected time of each package, with transfers and overlap */ + load_balance_expected_package_computation_time(data->p, _starpu_HFP_GPU_RAM_M); + _sched_visu_print_packages_in_terminal(data->p, nb_of_loop, "After load balance we have with expected package computation time ---\n"); + } + /* Re-apply HFP on each package. + * Once task stealing is done we need to re-apply HFP. For this I use an other instance of HFP_sched_data. + * It is in another function, if it work we can also put the packing above in it. + * Only with MULTIGPU = 2 because if we don't do load balance there is no point in re-applying HFP. + */ + if (multigpu == 3 || multigpu == 5 || multigpu == 7) + { + data->p->temp_pointer_1 = data->p->first_link; + while (data->p->temp_pointer_1 != NULL) + { + data->p->temp_pointer_1->sub_list = hierarchical_fair_packing_one_task_list(data->p->temp_pointer_1->sub_list, data->p->temp_pointer_1->nb_task_in_sub_list); + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + _sched_visu_print_packages_in_terminal(data->p, nb_of_loop, "After execution of HFP on each package we have ---\n"); + } + + /* Interlacing package task list order */ + if (interlacing != 0) + { + _sched_visu_print_packages_in_terminal(data->p, 0, "Before interlacing we have:\n"); + interlacing_task_list(data->p); + _sched_visu_print_packages_in_terminal(data->p, 0, "After interlacing we have:\n"); + } + + /* Belady */ + if (belady == 1) + { + get_ordre_utilisation_donnee(data->p, number_of_package_to_build); + } + + /* If you want to get the sum of weight of all different data. Only works if you have only one package */ + //~ //if (_print_in_terminal == 1) { get_weight_all_different_data(data->p->first_link, _starpu_HFP_GPU_RAM_M); } + + /* We prefetch data for each task for modular-heft-HFP */ + if (modular_heft_hfp_mode != 0) + { + prefetch_each_task(data->p, component); + } + + /* Printing in a file the order produced by HFP. If we use modular-heft-HFP, we can compare this order with the one done by modular-heft. We also print here the number of gpu in which a data is used for HFP's order. */ + if (_print_in_terminal == 1) + { + /* Todo a remetrre quand j'aurais corrigé le print_order_in_file_hfp */ + init_visualisation(data->p); + } + + _starpu_HFP_do_schedule_done = true; + } + } +} + +/* TODO a suppr */ +//~ struct timeval time_start_eviction; +//~ struct timeval time_end_eviction; +//~ long long time_total_eviction = 0; + +/* TODO a suppr */ +//~ struct timeval time_start_createtolasttaskfinished; +//~ struct timeval time_end_createtolasttaskfinished; +//~ long long time_total_createtolasttaskfinished = 0; + +static starpu_data_handle_t belady_victim_selector(starpu_data_handle_t toload, unsigned node, enum starpu_is_prefetch is_prefetch, void *component) +{ + (void)toload; + STARPU_PTHREAD_MUTEX_LOCK(&HFP_mutex); + //~ gettimeofday(&time_start_eviction, NULL); + /* Checking if all task are truly valid. 
Else I return a non valid data + * pas indispensable en 2D mais sera utile plus tard. */ + /* for (i = 0; i < nb_data_on_node; i++) + { + if (valid[i] == 0 && starpu_data_can_evict(data_on_node[i], node, is_prefetch)) + { + free(valid); + returned_handle = data_on_node[i]; + free(data_on_node); + return returned_handle; + } + } */ + + struct starpu_sched_component *temp_component = component; + struct _starpu_HFP_sched_data *data = temp_component->data; + starpu_data_handle_t returned_handle = NULL; + int current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + + /* Je check si une eviction n'a pas été refusé. */ + data->p->temp_pointer_1 = data->p->first_link; + int i; + for (i = 0; i < current_gpu; i++) + { + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + if (data->p->temp_pointer_1->data_to_evict_next != NULL) + { + returned_handle = data->p->temp_pointer_1->data_to_evict_next; + data->p->temp_pointer_1->data_to_evict_next = NULL; + STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex); + + //~ gettimeofday(&time_end_eviction, NULL); + //~ time_total_eviction += (time_end_eviction.tv_sec - time_start_eviction.tv_sec)*1000000LL + time_end_eviction.tv_usec - time_start_eviction.tv_usec; + + //~ printf("Return 1 %p.\n", returned_handle); fflush(stdout); + return returned_handle; + } + /* Sinon je cherche dans la mémoire celle utilisé dans le plus longtemps et que j'ai le droit d'évincer */ + starpu_data_handle_t *data_on_node; + unsigned nb_data_on_node = 0; + int *valid; + starpu_data_get_node_data(node, &data_on_node, &valid, &nb_data_on_node); + int latest_use = 0; + int index_latest_use = 0; + struct _starpu_HFP_next_use *b = NULL; + + unsigned j; + for (j = 0; j < nb_data_on_node; j++) + { + if (starpu_data_can_evict(data_on_node[j], node, is_prefetch)) + { + struct _starpu_HFP_next_use_by_gpu *c = _starpu_HFP_next_use_by_gpu_new(); + b = data_on_node[j]->sched_data; + + if (_starpu_HFP_next_use_by_gpu_list_empty(b->next_use_tab[current_gpu])) /* Si c'est vide alors je peux direct renvoyer cette donnée, elle ne sera jamais ré-utilisé */ + { + STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex); + //~ printf("Return %p that is not used again.\n", data_on_node[i]); + + //~ gettimeofday(&time_end_eviction, NULL); + //~ time_total_eviction += (time_end_eviction.tv_sec - time_start_eviction.tv_sec)*1000000LL + time_end_eviction.tv_usec - time_start_eviction.tv_usec; + + //~ printf("Return 2 %p.\n", data_on_node[i]); fflush(stdout); + return data_on_node[j]; + } + + c = _starpu_HFP_next_use_by_gpu_list_begin(b->next_use_tab[current_gpu]); + if (latest_use < c->value_next_use) + { + latest_use = c->value_next_use; + index_latest_use = j; + } + } + } + if (latest_use == 0) /* Si je n'ai eu aucune donnée valide, je renvoie NO_VICTIM */ + { + STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex); + + //~ gettimeofday(&time_end_eviction, NULL); + //~ time_total_eviction += (time_end_eviction.tv_sec - time_start_eviction.tv_sec)*1000000LL + time_end_eviction.tv_usec - time_start_eviction.tv_usec; + + //~ printf("Return NO_VICTIM\n"); fflush (stdout); + return STARPU_DATA_NO_VICTIM; + } + STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex); + + //~ gettimeofday(&time_end_eviction, NULL); + //~ time_total_eviction += (time_end_eviction.tv_sec - time_start_eviction.tv_sec)*1000000LL + time_end_eviction.tv_usec - time_start_eviction.tv_usec; + + return data_on_node[index_latest_use]; +} + +static void belady_victim_eviction_failed(starpu_data_handle_t victim, void *component) +{ + 
STARPU_PTHREAD_MUTEX_LOCK(&HFP_mutex);
+
+ struct starpu_sched_component *temp_component = component;
+ struct _starpu_HFP_sched_data *data = temp_component->data;
+
+ /* If a data was not actually evicted, put it back in the list. */
+ data->p->temp_pointer_1 = data->p->first_link;
+ unsigned i;
+ for (i = 1; i < starpu_worker_get_memory_node(starpu_worker_get_id()); i++)
+ {
+ data->p->temp_pointer_1 = data->p->temp_pointer_1->next;
+ }
+ data->p->temp_pointer_1->data_to_evict_next = victim;
+
+ STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex);
+}
+
+struct starpu_sched_component *starpu_sched_component_HFP_create(struct starpu_sched_tree *tree, void *params STARPU_ATTRIBUTE_UNUSED)
+{
+ //~ gettimeofday(&time_start_createtolasttaskfinished, NULL);
+
+ /* Global variables */
+ order_u = starpu_get_env_number_default("ORDER_U", 1);
+ belady = starpu_get_env_number_default("BELADY", 1);
+ multigpu = starpu_get_env_number_default("MULTIGPU", 0);
+ modular_heft_hfp_mode = starpu_get_env_number_default("MODULAR_HEFT_HFP_MODE", 0);
+ _starpu_HFP_hmetis = starpu_get_env_number_default("HMETIS", 0);
+ hmetis_n = starpu_get_env_number_default("HMETIS_N", 0);
+ task_stealing = starpu_get_env_number_default("TASK_STEALING", 0);
+ interlacing = starpu_get_env_number_default("INTERLACING", 0);
+ faster_first_iteration = starpu_get_env_number_default("FASTER_FIRST_ITERATION", 0);
+
+ _starpu_visu_init();
+
+ starpu_srand48(starpu_get_env_number_default("SEED", 0));
+ struct starpu_sched_component *component = starpu_sched_component_create(tree, "HFP");
+
+ if (_print_in_terminal == 1)
+ {
+ FILE *f = fopen("Output_maxime/Data_stolen_load_balance.txt", "w");
+ fclose(f);
+ }
+
+ struct _starpu_HFP_sched_data *data;
+ struct _starpu_HFP_my_list *my_data = malloc(sizeof(*my_data));
+ struct _starpu_HFP_paquets *paquets_data = malloc(sizeof(*paquets_data));
+ _STARPU_MALLOC(data, sizeof(*data));
+
+ STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL);
+ starpu_task_list_init(&data->sched_list);
+ //~ starpu_task_list_init(&data->popped_task_list);
+ starpu_task_list_init(&my_data->sub_list);
+ starpu_task_list_init(&my_data->refused_fifo_list);
+
+ my_data->next = NULL;
+ paquets_data->temp_pointer_1 = my_data;
+ paquets_data->first_link = paquets_data->temp_pointer_1;
+ data->p = paquets_data;
+ data->p->temp_pointer_1->nb_task_in_sub_list = 0;
+ data->p->temp_pointer_1->expected_time_pulled_out = 0;
+ data->p->temp_pointer_1->data_weight = 0;
+
+ data->p->temp_pointer_1->expected_time = 0;
+
+ component->data = data;
+ component->do_schedule = HFP_do_schedule;
+ component->push_task = HFP_push_task;
+ component->pull_task = HFP_pull_task;
+ component->can_push = HFP_can_push;
+ component->can_pull = HFP_can_pull;
+
+ STARPU_PTHREAD_MUTEX_INIT(&HFP_mutex, NULL);
+
+ number_task_out = 0;
+ iteration = 0;
+
+ /* TODO: remove this timing initialization once we no longer measure time. */
+ //~ time_total_getorderbelady = 0;
+ //~ time_total_getcommondataorderu = 0;
+ //~ time_total_gettasktoreturn = 0;
+ //~ time_total_scheduling = 0;
+ //~ time_total_loadbalanceexpectedtime = 0;
+ //~ time_total_createtolasttaskfinished = 0;
+ //~ time_total_eviction = 0;
+
+ if (belady == 1)
+ {
+ starpu_data_register_victim_selector(belady_victim_selector, belady_victim_eviction_failed, component);
+ }
+
+ return component;
+}
+
+static void initialize_HFP_center_policy(unsigned sched_ctx_id)
+{
+ starpu_sched_component_initialize_simple_scheduler((starpu_sched_component_create_t) starpu_sched_component_HFP_create, NULL,
+ STARPU_SCHED_SIMPLE_DECIDE_MEMNODES |
+ STARPU_SCHED_SIMPLE_DECIDE_ALWAYS |
+ STARPU_SCHED_SIMPLE_FIFOS_BELOW |
+ STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY | /* ready of dmdar plugged into HFP */
+ STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP |
+ STARPU_SCHED_SIMPLE_IMPL, sched_ctx_id);
+}
+
+static void deinitialize_HFP_center_policy(unsigned sched_ctx_id)
+{
+ struct starpu_sched_tree *tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+ starpu_sched_tree_destroy(tree);
+}
+
+/* Get the task that was last executed. Used to update the task list of pulled task */
+static void get_task_done_HFP(struct starpu_task *task, unsigned sci)
+{
+ STARPU_PTHREAD_MUTEX_LOCK(&HFP_mutex);
+ number_task_out++;
+ /* Get the index of the GPU we are working on. */
+ int current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1;
+
+ /* If Belady is used, pop the next-use values stored in the data of the task that just finished */
+ if (belady == 1)
+ {
+ unsigned i;
+ for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++)
+ {
+ struct _starpu_HFP_next_use *b = STARPU_TASK_GET_HANDLE(task, i)->sched_data;
+ if (!_starpu_HFP_next_use_by_gpu_list_empty(b->next_use_tab[current_gpu])) /* The emptiness test is needed because with task stealing the next-use list may no longer be meaningful */
+ {
+ _starpu_HFP_next_use_by_gpu_list_pop_front(b->next_use_tab[current_gpu]);
+ STARPU_TASK_GET_HANDLE(task, i)->sched_data = b;
+ }
+ }
+ }
+
+ /* Reset for the next iteration, done here when the number of completed tasks == NT, if needed */
+ if (_starpu_HFP_NT == number_task_out)
+ {
+ iteration++;
+ _starpu_HFP_do_schedule_done = false;
+ number_task_out = 0;
+
+ /* TODO: to be removed. PRINT_TIME set to 2 forces writing even in simulation, where there is a single iteration. 
*/ + //~ if ((iteration == 3 && starpu_get_env_number_default("PRINT_TIME", 0) == 1) || starpu_get_env_number_default("PRINT_TIME", 0) == 2) + //~ { + //~ FILE *f = fopen("Output_maxime/HFP_time.txt", "a"); + //~ fprintf(f, "%0.0f ", sqrt(NT)); + //~ fprintf(f, "%lld ", time_total_scheduling); + //~ fprintf(f, "%lld ", time_total_eviction); + //~ fprintf(f, "%lld ", time_total_getorderbelady); + //~ fprintf(f, "%lld ", time_total_getcommondataorderu); + //~ fprintf(f, "%lld ", time_total_gettasktoreturn); + //~ fprintf(f, "%lld ", time_total_loadbalanceexpectedtime); + //~ gettimeofday(&time_end_createtolasttaskfinished, NULL); + //~ time_total_createtolasttaskfinished += (time_end_createtolasttaskfinished.tv_sec - time_start_createtolasttaskfinished.tv_sec)*1000000LL + time_end_createtolasttaskfinished.tv_usec - time_start_createtolasttaskfinished.tv_usec; + //~ fprintf(f, "%lld ", time_total_createtolasttaskfinished); + //~ fprintf(f, "%lld ", time_total_find_min_size); + //~ fprintf(f, "%lld ", time_total_init_packages); + //~ fprintf(f, "%lld ", time_total_fill_matrix_common_data_plus_get_max); + //~ fprintf(f, "%lld ", time_total_reset_init_start_while_loop); + //~ fprintf(f, "%lld ", time_total_order_u_total); + //~ fprintf(f, "%lld\n", time_total_merge); + //~ fclose(f); + //~ } + } + + STARPU_PTHREAD_MUTEX_UNLOCK(&HFP_mutex); + starpu_sched_component_worker_pre_exec_hook(task, sci); +} + +struct starpu_sched_policy _starpu_sched_HFP_policy = +{ + .init_sched = initialize_HFP_center_policy, + .deinit_sched = deinitialize_HFP_center_policy, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .do_schedule = starpu_sched_tree_do_schedule, + .push_task = starpu_sched_tree_push_task, + .pop_task = _sched_visu_get_data_to_load, /* To get the number of data needed for the current task, still return the task that we got with starpu_sched_tree_pop_task */ + .pre_exec_hook = _sched_visu_get_current_tasks, /* Getting current task for printing diff later on. Still call starpu_sched_component_worker_pre_exec_hook(task,sci); at the end */ + .post_exec_hook = get_task_done_HFP, /* Sert pour Belady et aussi pour afficher les temps d'exec. 
A ne pas retirer pour Belady */ + .policy_name = "HFP", + .policy_description = "Affinity aware task ordering", + .worker_type = STARPU_WORKER_LIST, +}; + +static void initialize_heft_hfp_policy(unsigned sched_ctx_id) +{ + starpu_sched_component_initialize_simple_schedulers(sched_ctx_id, 1, (starpu_sched_component_create_t) starpu_sched_component_mct_create, NULL, + STARPU_SCHED_SIMPLE_PRE_DECISION, + (starpu_sched_component_create_t) starpu_sched_component_HFP_create, NULL, + STARPU_SCHED_SIMPLE_DECIDE_MEMNODES | + STARPU_SCHED_SIMPLE_DECIDE_ALWAYS | + STARPU_SCHED_SIMPLE_PERFMODEL | + STARPU_SCHED_SIMPLE_FIFOS_BELOW | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP | + STARPU_SCHED_SIMPLE_IMPL); +} + +struct starpu_sched_policy _starpu_sched_modular_heft_HFP_policy = +{ + .init_sched = initialize_heft_hfp_policy, + .deinit_sched = starpu_sched_tree_deinitialize, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .do_schedule = starpu_sched_tree_do_schedule, + .push_task = starpu_sched_tree_push_task, + .pop_task = starpu_sched_tree_pop_task, + .pre_exec_hook = _sched_visu_get_current_tasks_for_visualization, + //~ .pre_exec_hook = starpu_sched_component_worker_pre_exec_hook, + .post_exec_hook = starpu_sched_component_worker_post_exec_hook, + .policy_name = "modular-heft-HFP", + .policy_description = "heft modular policy", + .worker_type = STARPU_WORKER_LIST, + .prefetches = 1, +}; diff --git a/src/sched_policies/HFP.h b/src/sched_policies/HFP.h new file mode 100644 index 0000000000..d748696a53 --- /dev/null +++ b/src/sched_policies/HFP.h @@ -0,0 +1,149 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#ifndef __SCHED_HFP_H__ +#define __SCHED_HFP_H__ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC visibility push(hidden) + +#define ORDER_U /* O or 1 */ +#define BELADY /* O or 1 */ +#define MULTIGPU /* 0 : on ne fais rien, 1 : on construit |GPU| paquets et on attribue chaque paquet à un GPU au hasard, 2 : pareil que 1 + load balance, 3 : pareil que 2 + HFP sur chaque paquet, 4 : pareil que 2 mais avec expected time a la place du nb de données, 5 pareil que 4 + HFP sur chaque paquet, 6 : load balance avec expected time d'un paquet en comptant transferts et overlap, 7 : pareil que 6 + HFP sur chaque paquet */ +#define MODULAR_HEFT_HFP_MODE /* 0 we don't use heft, 1 we use starpu_prefetch_task_input_on_node_prio, 2 we use starpu_idle_prefetch_task_input_on_node_prio. Put it at 1 or 2 if you use modular-heft-HFP, else it will crash. The 0 is just here so we don't do prefetch when we use regular HFP. If we do not use modular-heft-HFP, always put this environemment variable on 0. 
*/
+#define HMETIS /* 0: we don't use hMETIS. 1: we use it to form |GPU| packages. 2: same as 1 but we then apply HFP on each package. For mst, if it is equal to 1 we form |GPU| packages then apply mst on each package. 3: we use hMETIS with already produced input files. 4: same but we apply HFP on each package (not actually implemented: the way HFP is called has changed, so hierarchical_fair_packing_one_task_list or something similar would have to be used). 5: we use hMETIS in 3D with already generated files. 6: Cholesky with already generated files.
+ N must be given through the HMETIS_N environment variable. "Already generated" is used on Grid5k or PlaFRIM when we don't have access to the hMETIS executable; it also saves precious time. */
+#define HMETIS_N /* Specify N */
+#define PRINT3D /* 1 we print coordinates and visualize data. 2 same but it is 3D with Z = N. Needed to differentiate 2D from 3D. */
+#define TASK_STEALING /* 0: we don't use it. 1: when a GPU (so a package) has finished all its tasks, it steals a task, starting from the end of the package that has the most tasks left. It can be done with load balancing on, but it was first meant to be used without load balancing, just with |GPU| packages (MULTIGPU=1). 2: same as 1 but we steal from the package that has the biggest expected package time. 3: same as 2 but we always steal half (rounded down) of the package at once (in terms of task duration). All of this is implemented in get_task_to_return */
+#define INTERLACING /* 0 we don't use it, 1 we start giving tasks at the middle of the package then go right, left and so on. */
+#define FASTER_FIRST_ITERATION /* 0: do nothing. 1: enable it. Performs a first iteration that merges together tasks sharing a data without looking at the max, and thus without computing the matrix. Only works for 2D and 3D matrices. */
+
+extern int _starpu_HFP_hmetis;
+
+extern const char* _starpu_HFP_appli;
+extern int _starpu_HFP_NT;
+//extern int N;
+extern starpu_ssize_t _starpu_HFP_GPU_RAM_M;
+extern bool _starpu_HFP_do_schedule_done;
+
+/* Structure used to access the struct my_list. It also holds the task lists */
+struct _starpu_HFP_sched_data
+{
+ //~ struct starpu_task_list popped_task_list; /* List used to store all the tasks at the beginning of the pull_task function */
+ struct _starpu_HFP_paquets *p;
+ struct starpu_task_list sched_list;
+ starpu_pthread_mutex_t policy_mutex;
+};
+
+/* Structure used to store all the variables we need and the tasks of each package. Each link is a package */
+struct _starpu_HFP_my_list
+{
+ int package_nb_data;
+ int nb_task_in_sub_list;
+ int index_package; /* Used in MST for the scheduling */
+ starpu_data_handle_t * package_data; /* List of all the data in the packages. Duplicates are not stored twice */
+ struct starpu_task_list sub_list; /* The list containing the tasks */
+ struct starpu_task_list refused_fifo_list; /* if a task is refused, it goes in this fifo list so it can be the next task processed by the right gpu */
+ struct _starpu_HFP_my_list *next;
+ int split_last_ij; /* The separator of the last state of the current package */
+ //~ starpu_data_handle_t * data_use_order; /* Order in which data will be loaded. 
used for Belady */ + double expected_time; /* Only task's time */ + double expected_time_pulled_out; /* for load balance but only MULTIGPU = 4, 5 */ + double expected_package_computation_time; /* Computation time with transfer and overlap */ + struct _starpu_HFP_data_on_node *pointer_node; /* linked list of handle use to simulate the memory in load balance with package with expected time */ + long int data_weight; + + starpu_data_handle_t data_to_evict_next; +}; + +struct _starpu_HFP_paquets +{ + /* All the pointer use to navigate through the linked list */ + struct _starpu_HFP_my_list *temp_pointer_1; + struct _starpu_HFP_my_list *temp_pointer_2; + struct _starpu_HFP_my_list *temp_pointer_3; + struct _starpu_HFP_my_list *first_link; /* Pointer that we will use to point on the first link of the linked list */ + int NP; /* Number of packages */ +}; + +/* TODO : ou est-ce que j'utilise ca ? A suppr si inutile */ +struct _starpu_HFP_data_on_node /* Simulate memory, list of handles */ +{ + struct _starpu_HFP_handle *pointer_data_list; + struct _starpu_HFP_handle *first_data; + long int memory_used; +}; + +struct _starpu_HFP_handle /* The handles from above */ +{ + starpu_data_handle_t h; + int last_use; + struct _starpu_HFP_handle *next; +}; + +/** Dans sched_data des données pour avoir la liste des prochaines utilisations que l'on peut pop à chaque utilisation dans get_task_done **/ +LIST_TYPE(_starpu_HFP_next_use_by_gpu, + /* int to the next use, one by GPU */ + int value_next_use; + ); + +struct _starpu_HFP_next_use +{ + struct _starpu_HFP_next_use_by_gpu_list **next_use_tab; +}; + +void _starpu_HFP_initialize_global_variable(struct starpu_task *task); + +/* Put a link at the beginning of the linked list */ +void _starpu_HFP_insertion(struct _starpu_HFP_paquets *a); + +/* + * Called in HFP_pull_task when we need to return a task. It is used + * when we have multiple GPUs. + * In case of modular-heft-HFP, it needs to do a round robin on the + * task it returned. So we use expected_time_pulled_out, an element of + * struct my_list in order to track which package pulled out the least + * expected task time. So heft can better divide tasks between GPUs + */ +struct starpu_task *_starpu_HFP_get_task_to_return(struct starpu_sched_component *component, struct starpu_sched_component *to, struct _starpu_HFP_paquets* a, int nb_gpu); + +/* Check if our struct is empty */ +bool _starpu_HFP_is_empty(struct _starpu_HFP_my_list* a); + +void _starpu_hmetis_scheduling(struct _starpu_HFP_paquets *p, struct starpu_task_list *l, int nb_gpu); + +void _starpu_visu_init(); + +#pragma GCC visibility pop + +#endif // __SCHED_HFP_H__ diff --git a/src/sched_policies/cuthillmckee_policy.c b/src/sched_policies/cuthillmckee_policy.c new file mode 100644 index 0000000000..7e77815e55 --- /dev/null +++ b/src/sched_policies/cuthillmckee_policy.c @@ -0,0 +1,441 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2013 Simon Archipoff + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +/* CM + */ + +#include +#include +#include +#include +#include +#include +#include "core/task.h" +#include "prio_deque.h" +#include +#include "helper_mct.h" +#include +#include +#include +#include "starpu_stdlib.h" +#include "common/list.h" +#include + +#define REVERSE /* O or 1 */ +static int reverse; +static bool do_schedule_done_cm = false; + +/* Structure used to acces the struct my_list_cm. There are also task's list */ +struct cuthillmckee_sched_data +{ + struct starpu_task_list popped_task_list; /* List used to store all the tasks at the beginning of the pull_task function */ + struct starpu_task_list list_if_fifo_full; /* List used if the fifo list is not empty. It means that task from the last iteration haven't been pushed, thus we need to pop task from this list */ + struct starpu_task_list SIGMA; /* order in which task will go out */ + /* All the pointer use to navigate through the linked list */ + struct my_list_cm *temp_pointer_1; + struct my_list_cm *first_link; /* Pointer that we will use to point on the first link of the linked list */ + //~ int id; + struct starpu_task_list sched_list; + starpu_pthread_mutex_t policy_mutex; +}; + +struct my_list_cm +{ + int index; + struct starpu_task_list sub_list; /* The list containing the tasks */ + struct my_list_cm *next; +}; + +/* Put a link at the beginning of the linked list */ +static void insertion_cuthillmckee(struct cuthillmckee_sched_data *a) +{ + struct my_list_cm *new = malloc(sizeof(*new)); /* Creation of a new link */ + starpu_task_list_init(&new->sub_list); + new->next = a->temp_pointer_1; + a->temp_pointer_1 = new; +} + +/* Pushing the tasks */ +static int cuthillmckee_push_task(struct starpu_sched_component *component, struct starpu_task *task) +{ + struct cuthillmckee_sched_data *data = component->data; + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_front(&data->sched_list, task); + starpu_push_task_end(task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + /* Tell below that they can now pull */ + component->can_pull(component); + return 0; +} + +/* The function that sort the tasks in packages */ +static struct starpu_task *cuthillmckee_pull_task(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + _STARPU_SCHED_PRINT("Début de cuthillmckee_pull_task\n"); + + (void)to; + struct cuthillmckee_sched_data *data = component->data; + struct starpu_task *task1 = NULL; + if (do_schedule_done_cm == true) + { + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + + /* If one or more task have been refused */ + if (!starpu_task_list_empty(&data->list_if_fifo_full)) + { + task1 = starpu_task_list_pop_back(&data->list_if_fifo_full); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + return task1; + } + /* If the linked list is empty, we can pull more tasks */ + if (starpu_task_list_empty(&data->SIGMA)) + { + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + return NULL; + } + else + { + task1 = starpu_task_list_pop_front(&data->SIGMA); + + _sched_visu_print_data_to_load_prefetch(task1, starpu_worker_get_id() + 1, 1); + + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + _STARPU_SCHED_PRINT("Task %p is getting out of pull_task\n", task1); + return task1; + } + } + return NULL; +} + +static int cuthillmckee_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to) +{ + struct cuthillmckee_sched_data *data = component->data; + int didwork = 0; + + 
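/* Hand the already ordered tasks over to the component below; if a task comes
+ * back it means it was refused, so it is stored in list_if_fifo_full and will be
+ * returned first by the next call to cuthillmckee_pull_task. */
+ 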
struct starpu_task *task; + task = starpu_sched_component_pump_to(component, to, &didwork); + + if (task) + { + /* Oops, we couldn't push everything, put back this task */ + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_back(&data->list_if_fifo_full, task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + } + else + { + /* Can I uncomment this part ? */ + //~ { + //~ if (didwork) + //~ fprintf(stderr, "pushed some tasks to %p\n", to); + //~ else + //~ fprintf(stderr, "I didn't have anything for %p\n", to); + //~ } + } + + /* There is room now */ + return didwork || starpu_sched_component_can_push(component, to); +} + +static int cuthillmckee_can_pull(struct starpu_sched_component * component) +{ + return starpu_sched_component_can_pull(component); +} + +static void cuthillmckee_do_schedule(struct starpu_sched_component *component) +{ + int i, j, tab_runner, tab_runner_bis, nb_voisins = 0; + int poids_aretes_min = INT_MAX; int indice_poids_aretes_min = INT_MAX; + struct cuthillmckee_sched_data *data = component->data; + struct starpu_task *task1 = NULL; + struct starpu_task *temp_task_1 = NULL; + struct starpu_task *temp_task_2 = NULL; + + int NT = 0; + + /* If the linked list is empty, we can pull more tasks */ + if (starpu_task_list_empty(&data->SIGMA)) + { + if (!starpu_task_list_empty(&data->sched_list)) + { + /* Pulling all tasks and counting them */ + while (!starpu_task_list_empty(&data->sched_list)) + { + task1 = starpu_task_list_pop_front(&data->sched_list); + NT++; + starpu_task_list_push_back(&data->popped_task_list,task1); + } + + //~ int matrice_adjacence[NT][NT]; for (i = 0; i < NT; i++) { for (j = 0; j < NT; j++) { matrice_adjacence[i][j] = 0; } } + long int matrice_adjacence[NT][NT]; + for (i = 0; i < NT; i++) + { + for (j = 0; j < NT; j++) + { + matrice_adjacence[i][j] = 0; + } + } + temp_task_1 = starpu_task_list_begin(&data->popped_task_list); + temp_task_2 = starpu_task_list_begin(&data->popped_task_list); + for (i = 0; i < NT; i++) + { + for (j = 0; j < NT; j++) + { + if (i != j) + { + unsigned i_bis; + for (i_bis = 0; i_bis < STARPU_TASK_GET_NBUFFERS(temp_task_1); i_bis++) + { + unsigned j_bis; + for (j_bis = 0; j_bis < STARPU_TASK_GET_NBUFFERS(temp_task_2); j_bis++) + { + if (STARPU_TASK_GET_HANDLE(temp_task_1, i_bis) == STARPU_TASK_GET_HANDLE(temp_task_2, j_bis)) + { + matrice_adjacence[i][j]++; + } + /* Pour adapter a des tailles heterogènes ici il faut juste faire : et declrer poids des arretes et matrice adja en long int */ + if (STARPU_TASK_GET_HANDLE(temp_task_1,i_bis) == STARPU_TASK_GET_HANDLE(temp_task_2,j_bis)) + { + matrice_adjacence[i][j] += starpu_data_get_size(STARPU_TASK_GET_HANDLE(temp_task_1, i_bis)); + } + } + } + } + temp_task_2 = starpu_task_list_next(temp_task_2); + } + temp_task_1 = starpu_task_list_next(temp_task_1); + temp_task_2 = starpu_task_list_begin(&data->popped_task_list); + } + /* Affichage de la matrice d'adjacence */ + //~ if (starpu_get_env_number_default("PRINTF",0) == 1) { printf("Matrice d'adjacence :\n"); for (i = 0; i < NT; i++) { for (j = 0; j < NT; j++) { printf("%d ",matrice_adjacence[i][j]); } printf("\n"); } } + + //NEW + int do_not_add_more = 0; + while (!starpu_task_list_empty(&data->popped_task_list)) + { + starpu_task_list_push_back(&data->temp_pointer_1->sub_list,starpu_task_list_pop_front(&data->popped_task_list)); + data->temp_pointer_1->index = do_not_add_more; + if (do_not_add_more != NT-1) + { + insertion_cuthillmckee(data); + } + do_not_add_more++; + } + data->first_link = 
data->temp_pointer_1; + + //~ int poids_aretes[NT]; + long int poids_aretes[NT]; + int tab_SIGMA[NT]; for (i = 0; i < NT; i++) { tab_SIGMA[i] = -1; } + + /* Calcul du poids des arêtes de chaque sommet */ + for (i = 0; i < NT; i++) + { + poids_aretes[i] = 0; + for (j = 0; j < NT; j++) + { + if (matrice_adjacence[i][j] != 0) + { + poids_aretes[i] += matrice_adjacence[i][j]; + } + } + } + tab_runner = 0; + tab_runner_bis = 0; + while (tab_runner < NT) + { + //~ for (i = 0; i < NT; i++) { if (poids_aretes[i] != -1) { poids_aretes[i] = 0; } } + /* Si tab_SIGMA est vide ou qu'on a déjà exploré tous ses sommets on prend le sommet dont le poids des arêtes est le plus faible, sinon on passe au sommet de tab_SIGMA suivant (cas de graphe non connexe en fait) */ + //OLD + if (tab_SIGMA[tab_runner] == -1) + { + //NEW + //~ if (tab_SIGMA[tab_runner] == -1 && tab_runner != NT) { + /* Recherche du sommet dont le poids des arêtes est minimal */ + poids_aretes_min = INT_MAX; indice_poids_aretes_min = INT_MAX; + for (i = 0; i < NT; i++) + { + if (poids_aretes_min > poids_aretes[i] && poids_aretes[i] != - 1) + { + poids_aretes_min = poids_aretes[i]; indice_poids_aretes_min = i; + } + } + + //~ tab_SIGMA[tab_runner_bis] = indice_poids_aretes_min; + //~ temp_task_1 = starpu_task_list_begin(&data->popped_task_list); for (i = 0; i < indice_poids_aretes_min; i++) { temp_task_1 = starpu_task_list_next(temp_task_1); } + //~ if (starpu_get_env_number_default("PRINTF",0) == 1) { printf("Add %p to sigma\n",temp_task_1); } + tab_SIGMA[tab_runner_bis] = indice_poids_aretes_min; + //~ printf("ok1\n"); + + //~ strcpy(char_SIGMA[tab_runner_bis],starpu_task_get_name(temp_task_1)); + //~ strcpy(char_SIGMA[tab_runner_bis],"oui"); + //~ printf("ok2\n"); + + tab_runner_bis++; + poids_aretes[indice_poids_aretes_min] = -1; + } + else + { + /* On étudie les sommets de tab_SIGMA non explorés */ + while (tab_runner < NT && tab_SIGMA[tab_runner] != -1) + { + //~ tab_runner++; + //~ for (j = 0; j < NT; j++) { printf("%d ",matrice_adjacence[tab_SIGMA[tab_runner]][j]); } printf("\n"); + /* Pour chaque voisins, on les mets dans tab_SIGMA si ils n'y sont pas déjà, par poids des arêtes croissant */ + /* Recherche du nb de voisins pour la boucle suivante */ + for (j = 0; j < NT; j++) + { + if (matrice_adjacence[tab_SIGMA[tab_runner]][j] != 0 && matrice_adjacence[tab_SIGMA[tab_runner]][j] != -1 && poids_aretes[j] != -1) + { + nb_voisins++; + } + } + + if (nb_voisins == 0) + { + tab_runner++; + } + else + { + int i_bis; + for (i_bis = 0; i_bis < nb_voisins; i_bis++) + { + //~ poids_aretes_min_bis = INT_MAX; indice_poids_aretes_min_bis = INT_MAX; + //~ if (poids_aretes[i_bis] != -1) { + /* Recherche du min du poids des arêtes */ + poids_aretes_min = INT_MAX; indice_poids_aretes_min = INT_MAX; + for (j = 0; j < NT; j++) + { + if (poids_aretes_min > matrice_adjacence[tab_SIGMA[tab_runner]][j] && poids_aretes[j] != -1 && matrice_adjacence[tab_SIGMA[tab_runner]][j] != 0) + { + poids_aretes_min = matrice_adjacence[tab_SIGMA[tab_runner]][j]; indice_poids_aretes_min = j; + } + } + + /* Ajout à tab_SIGMA */ + tab_SIGMA[tab_runner_bis] = indice_poids_aretes_min; + tab_runner_bis++; + /* On supprime ce sommet de la liste des possibilité et on recommence à chercher le max parmi les voisins */ + poids_aretes[indice_poids_aretes_min] = -1; + } + nb_voisins = 0; + tab_runner++; + } + } + } + } + + /* I put the task in order in SIGMA */ + _sched_visu_print_vector(tab_SIGMA, NT, "tab_SIGMA[i] : "); + + if (reverse == 1) + { + int tab_SIGMA_2[NT]; + for (i = NT 
- 1, j = 0; i >= 0; i--, j++) + tab_SIGMA_2[j] = tab_SIGMA[i]; + for (i = 0; i < NT; i++) + tab_SIGMA[i] = tab_SIGMA_2[i]; + } + + i = 0; + data->temp_pointer_1 = data->first_link; + + while (i != NT) + { + //~ temp_task_1 = starpu_task_list_pop_front(&data->temp_pointer_1->sub_list); + if (tab_SIGMA[i] == data->temp_pointer_1->index) + { + //~ if (strcmp(char_SIGMA[i],starpu_task_get_name(temp_task_1) == 0)) { + starpu_task_list_push_back(&data->SIGMA,starpu_task_list_pop_front(&data->temp_pointer_1->sub_list)); + i++; + data->temp_pointer_1 = data->first_link; + } + else + { + data->temp_pointer_1 = data->temp_pointer_1->next; + } + } + do_schedule_done_cm = true; + } + } +} + +struct starpu_sched_component *starpu_sched_component_cuthillmckee_create(struct starpu_sched_tree *tree, void *params STARPU_ATTRIBUTE_UNUSED) +{ + reverse = starpu_get_env_number_default("REVERSE", 0); + _starpu_visu_init(); + + //~ srandom(time(0)); /* For the random selection in ALGO 4 */ + struct starpu_sched_component *component = starpu_sched_component_create(tree, "cuthillmckee"); + + struct cuthillmckee_sched_data *data; + struct my_list_cm *my_data = malloc(sizeof(*my_data)); + _STARPU_MALLOC(data, sizeof(*data)); + + do_schedule_done_cm = false; + + STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL); + starpu_task_list_init(&data->sched_list); + starpu_task_list_init(&data->list_if_fifo_full); + starpu_task_list_init(&data->popped_task_list); + starpu_task_list_init(&data->SIGMA); + + starpu_task_list_init(&my_data->sub_list); + my_data->next = NULL; + data->temp_pointer_1 = my_data; + + component->data = data; + component->do_schedule = cuthillmckee_do_schedule; + component->push_task = cuthillmckee_push_task; + component->pull_task = cuthillmckee_pull_task; + component->can_push = cuthillmckee_can_push; + component->can_pull = cuthillmckee_can_pull; + + return component; +} + +static void initialize_cuthillmckee_center_policy(unsigned sched_ctx_id) +{ + starpu_sched_component_initialize_simple_scheduler((starpu_sched_component_create_t) starpu_sched_component_cuthillmckee_create, NULL, + STARPU_SCHED_SIMPLE_DECIDE_MEMNODES | + STARPU_SCHED_SIMPLE_DECIDE_ALWAYS | + STARPU_SCHED_SIMPLE_FIFOS_BELOW | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP | + STARPU_SCHED_SIMPLE_IMPL, sched_ctx_id); +} + +static void deinitialize_cuthillmckee_center_policy(unsigned sched_ctx_id) +{ + struct starpu_sched_tree *tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id); + starpu_sched_tree_destroy(tree); +} + +struct starpu_sched_policy _starpu_sched_cuthillmckee_policy = +{ + .init_sched = initialize_cuthillmckee_center_policy, + .deinit_sched = deinitialize_cuthillmckee_center_policy, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .do_schedule = starpu_sched_tree_do_schedule, + .push_task = starpu_sched_tree_push_task, + .pop_task = _sched_visu_get_data_to_load, + .pre_exec_hook = _sched_visu_get_current_tasks, + .post_exec_hook = starpu_sched_component_worker_post_exec_hook, + .policy_name = "cuthillmckee", + .policy_description = "cuthillmckee algorithm", + .worker_type = STARPU_WORKER_LIST, +}; diff --git a/src/sched_policies/darts.c b/src/sched_policies/darts.c new file mode 100644 index 0000000000..9935dd51e4 --- /dev/null +++ b/src/sched_policies/darts.c @@ -0,0 +1,3414 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. 
+ * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +/* Dynamic Data Aware reactive Task (DARTS) scheduling. + * Look for the "best" data, i.e. the data that have the minimal transfer time to computation made available ratio. + * Computes all task using this data and the data already loaded on memory. + * If no task is available compute a task with highest priority. + * DARTS works especially well with GPUs. With both CPUs and GPUs, it does not take into account the speed difference, which leads to poor results. + * Using STARPU_NCPU=0 with STARPU_NOPENCL=0 is thus highly recommended to achieve peak performance when using GPUs. + * Otherwise, DARTS can work with CPUs only and with both GPUs and CPUs. + */ + +#include +#include +#include +#include +#include /* To compute the descendants and consequently add priorities */ +#include + +struct _starpu_darts_sched_data +{ + struct starpu_task_list main_task_list; /* List used to randomly pick a task. We use a second list because it's easier to randomize sched_list. */ + struct starpu_task_list sched_list; + starpu_pthread_mutex_t policy_mutex; +}; + +struct _starpu_darts_pointer_in_task +{ + /* Pointer to the data used by the current task */ + starpu_data_handle_t *pointer_to_D; + struct _starpu_darts_task_using_data **tud; + struct starpu_task *pointer_to_cell; /* Pointer to the cell in the main task list */ +}; + +/** Planned task. One planned task = one processing unit. **/ +struct _starpu_darts_gpu_planned_task +{ + struct starpu_task_list planned_task; + + struct starpu_task_list refused_fifo_list; /* If a task is refused, it goes in this fifo list so it can be the next task processed by the right gpu */ + + void *gpu_data; /* Data not loaded yet. */ + void *new_gpu_data; /* Data not loaded yet that are from the new tasks. This is used only with STARPU_DARTS_DATA_ORDER=1 that randomize the new data and put them at the end of the list. */ + + starpu_data_handle_t data_to_evict_next; /* If an eviction fails, it allows to evict it next. */ + + bool first_task; /* If it's the first task of a GPU, we can directly return it and not look for the "best" data. */ + + int number_data_selection; + + struct starpu_task *first_task_to_pop; /* First task to return if the task order is not randomized, i.e. STARPU_DARTS_TASK_ORDER == 2, which is the default case. */ +}; + +/* Struct dans user_data des handles pour reset MAIS aussi pour savoir le nombre de tâches dans pulled task qui utilise cette donnée */ +struct _starpu_darts_handle_user_data +{ + int last_iteration_DARTS; + int* nb_task_in_pulled_task; + int* nb_task_in_planned_task; + int* last_check_to_choose_from; /* To clarify the last time I looked at this data, so as not to look at it twice in choose best data from 1 in the same iteration of searching for the best data. */ + int* is_present_in_data_not_used_yet; /* Array of the number of GPUs used in push_task to find out whether data is present in a GPU's datanotusedyet. 
Updated when data is used and removed from the list, and when data is pushed. Provides a quick indication of whether data should be added or not. */ + double sum_remaining_task_expected_length; /* Sum of expected job durations using this data. Used to tie break. Initialized in push task, decreased when adding a task in planned task and increased when removing a task from planned task after an eviction. */ +}; + +/** Task out of pulled task. Updated by post_exec. I'm forced to use a list of single task and not task list because else starpu doesn't allow me to push a tasks in two different task_list **/ +LIST_TYPE(_starpu_darts_pulled_task, + struct starpu_task *pointer_to_pulled_task; +); + +struct _starpu_darts_gpu_pulled_task +{ + int test; + struct _starpu_darts_pulled_task_list *ptl; +}; + +/** In the "packages" of dynamic data aware, each representing a gpu **/ +LIST_TYPE(_starpu_darts_gpu_data_not_used, + starpu_data_handle_t D; /* The data not used yet by the GPU. */ +); + +/** In the handles **/ +LIST_TYPE(_starpu_darts_task_using_data, + /* Pointer to the main task list T */ + struct starpu_task *pointer_to_T; +); + +static starpu_data_handle_t *Dopt; +static bool *data_conflict; + +/** Mutex **/ +#ifdef STARPU_DARTS_LINEAR_MUTEX +static starpu_pthread_mutex_t linear_mutex; /* Mutex that make almost everything linear. Used in the IPDPS version of this algorithm and also to ease debugs. Not used in the default case. */ +#define _LINEAR_MUTEX_LOCK() STARPU_PTHREAD_MUTEX_LOCK(&linear_mutex) +#define _LINEAR_MUTEX_UNLOCK() STARPU_PTHREAD_MUTEX_UNLOCK(&linear_mutex) +#define _LINEAR_MUTEX_INIT() STARPU_PTHREAD_MUTEX_INIT(&linear_mutex, NULL) + +#define _REFINED_MUTEX_LOCK() +#define _REFINED_MUTEX_UNLOCK() +#define _REFINED_MUTEX_INIT() + +#else + +static starpu_pthread_mutex_t refined_mutex; /* Protect the main task list and the data. 
This is the mutex used by default */ +#define _REFINED_MUTEX_LOCK() STARPU_PTHREAD_MUTEX_LOCK(&refined_mutex) +#define _REFINED_MUTEX_UNLOCK() STARPU_PTHREAD_MUTEX_UNLOCK(&refined_mutex) +#define _REFINED_MUTEX_INIT() STARPU_PTHREAD_MUTEX_INIT(&refined_mutex, NULL) + +#define _LINEAR_MUTEX_LOCK() +#define _LINEAR_MUTEX_UNLOCK() +#define _LINEAR_MUTEX_INIT() +#endif + + +static int can_a_data_be_in_mem_and_in_not_used_yet; +static int eviction_strategy_darts; +static int threshold; +static int app; +static int choose_best_data_from; +static int simulate_memory; +static int task_order; +static int data_order; +static int prio; +static int free_pushed_task_position; +static int dependances; +static int graph_descendants; +static int dopt_selection_order; +static int highest_priority_task_returned_in_default_case; +static int push_free_task_on_gpu_with_least_task_in_planned_task; +static int round_robin_free_task; +static int cpu_only; +static int _nb_gpus; + +static bool new_tasks_initialized; +static struct _starpu_darts_gpu_planned_task *tab_gpu_planned_task; +static struct _starpu_darts_gpu_pulled_task *tab_gpu_pulled_task; +static int NT_DARTS; +static int iteration_DARTS; +static struct starpu_perfmodel_arch *perf_arch; +static int *memory_nodes; +static char *_output_directory; + +#ifdef STARPU_DARTS_STATS +static int nb_return_null_after_scheduling; +static int nb_return_task_after_scheduling; +static int nb_return_null_because_main_task_list_empty; +static int nb_new_task_initialized; +static int nb_refused_task; +static int victim_selector_refused_not_on_node; +static int victim_selector_refused_cant_evict; +static int victim_selector_return_refused; +static int victim_selector_return_unvalid; +static int victim_selector_return_data_not_in_planned_and_pulled; +static int number_data_conflict; +static int number_critical_data_conflict; +static int victim_evicted_compteur; +static int victim_selector_compteur; +static int victim_selector_return_no_victim; +static int victim_selector_belady; +static int nb_1_from_free_task_not_found; +static int number_random_selection; +static int nb_free_choice; +static int nb_1_from_free_choice; +static int nb_data_selection_per_index; +static int nb_task_added_in_planned_task; +static bool data_choice_per_index; +static int nb_data_selection; + +static long long time_total_selector; +static long long time_total_evicted; +static long long time_total_belady; +static long long time_total_schedule; +static long long time_total_choose_best_data; +static long long time_total_fill_planned_task_list; +static long long time_total_initialisation; +static long long time_total_randomize; +static long long time_total_pick_random_task; +static long long time_total_least_used_data_planned_task; +static long long time_total_createtolasttaskfinished; +struct timeval time_start_createtolasttaskfinished; +#endif + +/** If a data is a redux or a scratch it's only used to optimize a computation and + * does not contain any valuable information. Thus we ignore it. 
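+ * The two macros below implement this filtering, one from a task's buffer index
+ * and one from a handle's current access mode. 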
**/ +#define STARPU_IGNORE_UTILITIES_HANDLES(task, index) if ((STARPU_TASK_GET_MODE(task, index) & STARPU_SCRATCH) || (STARPU_TASK_GET_MODE(task, index) & STARPU_REDUX)) { continue; } +#define STARPU_IGNORE_UTILITIES_HANDLES_FROM_DATA(handle) if ((handle->current_mode == STARPU_SCRATCH) || (handle->current_mode == STARPU_REDUX)) { continue; } + +static int _get_number_GPU() +{ + int return_value = starpu_memory_nodes_get_count_by_kind(STARPU_CUDA_RAM); + + if (return_value == 0) /* We are not using GPUs so we are in an out-of-core case using CPUs. Need to return 1. If I want to deal with GPUs AND CPUs we need to adpt this function to return NGPU + 1 */ + { + return 1; + } + + return return_value; +} + +/* Return the number of handle used by a task without considering the scratch data used by a cusolver. */ +static int get_nbuffer_without_scratch(struct starpu_task *t) +{ + int count = 0; + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(t); i++) + { + if ((STARPU_TASK_GET_MODE(t, i) & STARPU_SCRATCH) || (STARPU_TASK_GET_MODE(t, i) & STARPU_REDUX)) + { + continue; + } + else + { + count += 1; + } + } + return count; +} + +/* Set priority to the tasks depending on the progression on the task graph. Used when STARPU_DARTS_GRAPH_DESCENDANTS is set to 1 or 2. by default STARPU_DARTS_GRAPH_DESCENDANTS is set to 0. */ +static void set_priority(void *_data, struct _starpu_graph_node *node) +{ + (void)_data; + starpu_worker_relax_on(); + STARPU_PTHREAD_MUTEX_LOCK(&node->mutex); + starpu_worker_relax_off(); + struct _starpu_job *job = node->job; + + if (job) + { + job->task->priority = node->descendants; + + _STARPU_SCHED_PRINT("Descendants of job %p (%s): %d\n", job->task, starpu_task_get_name(job->task), job->task->priority); + } + STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex); +} + +void _starpu_darts_tab_gpu_planned_task_init() +{ + int i; + for (i = 0; i < _nb_gpus; i++) + { + starpu_task_list_init(&tab_gpu_planned_task[i].planned_task); + starpu_task_list_init(&tab_gpu_planned_task[i].refused_fifo_list); + tab_gpu_planned_task[i].data_to_evict_next = NULL; + tab_gpu_planned_task[i].first_task = true; + tab_gpu_planned_task[i].number_data_selection = 0; + + tab_gpu_planned_task[i].gpu_data = _starpu_darts_gpu_data_not_used_list_new(); + _starpu_darts_gpu_data_not_used_list_init(tab_gpu_planned_task[i].gpu_data); + + if (data_order == 1) + { + tab_gpu_planned_task[i].new_gpu_data = _starpu_darts_gpu_data_not_used_list_new(); + _starpu_darts_gpu_data_not_used_list_init(tab_gpu_planned_task[i].new_gpu_data); + } + + tab_gpu_planned_task[i].first_task_to_pop = NULL; + } +} + +void _starpu_darts_tab_gpu_pulled_task_init() +{ + int i; + for (i = 0; i < _nb_gpus; i++) + { + struct _starpu_darts_pulled_task_list *p = _starpu_darts_pulled_task_list_new(); + tab_gpu_pulled_task[i].ptl = p; + tab_gpu_pulled_task[i].test = 0; + } +} + +/* Function called directly in the applications of starpu to reinit the struct of darts. Used when multiple iteration of a same application are lauched in the same execution. */ +void starpu_darts_reinitialize_structures() +{ + _REFINED_MUTEX_LOCK(); + _LINEAR_MUTEX_LOCK(); + + /* Printing stats in files. Préciser PRINT_N dans les var d'env. 
*/ + _STARPU_SCHED_PRINT("############### Itération n°%d ###############\n", iteration_DARTS + 1); + +#ifdef STARPU_DARTS_STATS + printf("Nb \"random\" task selection: %d\n", number_random_selection); + printf("Nb \"index\" data selection: %d/%d\n", nb_data_selection_per_index, nb_data_selection); + if (iteration_DARTS == 11 || starpu_get_env_number_default("STARPU_SCHED_PRINT_TIME", 0) == 2) /* PRINT_TIME = 2 pour quand on a 1 seule itération. */ + { + { + int size = strlen(_output_directory) + strlen("/Data_DARTS_Nb_conflit_donnee.csv") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_DARTS_Nb_conflit_donnee.csv"); + FILE *f_new_iteration = fopen(path, "a"); + STARPU_ASSERT_MSG(f_new_iteration, "cannot open file <%s>\n", path); + fprintf(f_new_iteration , "%d,%d,%d\n", _print_n, number_data_conflict/11 + number_data_conflict%11, number_critical_data_conflict/11 + number_critical_data_conflict%11); + fclose(f_new_iteration); + } + + struct timeval time_end_createtolasttaskfinished; + gettimeofday(&time_end_createtolasttaskfinished, NULL); + time_total_createtolasttaskfinished += (time_end_createtolasttaskfinished.tv_sec - time_start_createtolasttaskfinished.tv_sec)*1000000LL + time_end_createtolasttaskfinished.tv_usec - time_start_createtolasttaskfinished.tv_usec; + + { + int size = strlen(_output_directory) + strlen("/Data_DARTS_time.csv") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_DARTS_time.csv"); + FILE *f_new_iteration = fopen(path, "a"); + STARPU_ASSERT_MSG(f_new_iteration, "cannot open file <%s>\n", path); + fprintf(f_new_iteration, "%d,%lld,%lld,%lld,%lld,%lld,%lld,%lld,%lld,%lld,%lld,%lld\n", _print_n, time_total_selector/11 + time_total_selector%11, time_total_evicted/11 + time_total_evicted%11, time_total_belady/11 + time_total_belady%11, time_total_schedule/11 + time_total_schedule%11, time_total_choose_best_data/11 + time_total_choose_best_data%11, time_total_fill_planned_task_list/11 + time_total_fill_planned_task_list%11, time_total_initialisation/11 + time_total_initialisation%11, time_total_randomize/11 + time_total_randomize%11, time_total_pick_random_task/11 + time_total_pick_random_task%11, time_total_least_used_data_planned_task/11 + time_total_least_used_data_planned_task%11, time_total_createtolasttaskfinished/11 + time_total_createtolasttaskfinished%11); + fclose(f_new_iteration); + } + + { + int size = strlen(_output_directory) + strlen("/Data_DARTS_Choice_during_scheduling.csv") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_DARTS_Choice_during_scheduling.csv"); + FILE *f_new_iteration = fopen(path, "a"); + STARPU_ASSERT_MSG(f_new_iteration, "cannot open file <%s>\n", path); + fprintf(f_new_iteration, "%d,%d,%d,%d,%d,%d,%d,%d\n", _print_n, nb_return_null_after_scheduling/11 + nb_return_null_after_scheduling%11, nb_return_task_after_scheduling/11 + nb_return_task_after_scheduling%11, nb_return_null_because_main_task_list_empty/11 + nb_return_null_because_main_task_list_empty%11, number_random_selection/11 + number_random_selection%11, nb_1_from_free_task_not_found/11 + nb_1_from_free_task_not_found%11, nb_free_choice/11 + nb_free_choice%11, nb_1_from_free_choice/11 + nb_1_from_free_choice%11); + fclose(f_new_iteration); + } + + { + int size = strlen(_output_directory) + strlen("/Data_DARTS_Choice_victim_selector.csv") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_DARTS_Choice_victim_selector.csv"); + FILE *f_new_iteration = 
fopen(path, "a"); + STARPU_ASSERT_MSG(f_new_iteration, "cannot open file <%s>\n", path); + fprintf(f_new_iteration, "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n", _print_n, victim_selector_refused_not_on_node/11 + victim_selector_refused_not_on_node%11, victim_selector_refused_cant_evict/11 + victim_selector_refused_cant_evict%11, victim_selector_return_refused/11 + victim_selector_return_refused%11, victim_selector_return_unvalid/11 + victim_selector_return_unvalid%11, victim_selector_return_data_not_in_planned_and_pulled/11 + victim_selector_return_data_not_in_planned_and_pulled%11, victim_evicted_compteur/11 + victim_evicted_compteur%11, victim_selector_compteur/11 + victim_selector_compteur%11, victim_selector_return_no_victim/11 + victim_selector_return_no_victim%11, victim_selector_belady/11 + victim_selector_belady%11); + fclose(f_new_iteration); + } + + { + int size = strlen(_output_directory) + strlen("/Data_DARTS_DARTS_Misc.csv") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_DARTS_DARTS_Misc.csv"); + FILE *f_new_iteration = fopen(path, "a"); + STARPU_ASSERT_MSG(f_new_iteration, "cannot open file <%s>\n", path); + fprintf(f_new_iteration, "%d,%d,%d\n", _print_n, nb_refused_task/11 + nb_refused_task%11, nb_new_task_initialized/11 + nb_new_task_initialized%11); + fclose(f_new_iteration); + } + } +#endif + + /* Re-init for the next iteration of the application */ + free(tab_gpu_planned_task); + iteration_DARTS++; /* Used to know if a data must be added again in the list of data of each planned task. */ + tab_gpu_planned_task = calloc(_nb_gpus, sizeof(struct _starpu_darts_gpu_planned_task)); + _starpu_darts_tab_gpu_planned_task_init(); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); +} + +static void print_task_info(struct starpu_task *task) +{ + (void)task; +#ifdef PRINT + printf("Task %p has %d data:", task, get_nbuffer_without_scratch(task)); + + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + printf(" %p", STARPU_TASK_GET_HANDLE(task, i)); + } + printf("\n"); +#endif +} + +static void print_task_list(struct starpu_task_list *l, char *s) +{ + (void)l; (void)s; +#ifdef PRINT + printf("%s :\n", s); fflush(stdout); + struct starpu_task *task; + for (task = starpu_task_list_begin(l); task != starpu_task_list_end(l); task = starpu_task_list_next(task)) + { + printf("%p (prio: %d):", task, task->priority); fflush(stdout); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + printf(" %p", STARPU_TASK_GET_HANDLE(task, i)); fflush(stdout); + } + printf("\n"); fflush(stdout); + } +#endif +} + +static void print_data_not_used_yet() +{ +#ifdef PRINT + int i; + for (i = 0; i < _nb_gpus; i++) + { + printf("On GPU %d, there are %d data not used yet:", i, _starpu_darts_gpu_data_not_used_list_size(tab_gpu_planned_task[i].gpu_data)); fflush(stdout); + struct _starpu_darts_gpu_data_not_used *e; + for (e = _starpu_darts_gpu_data_not_used_list_begin(tab_gpu_planned_task[i].gpu_data); + e != _starpu_darts_gpu_data_not_used_list_end(tab_gpu_planned_task[i].gpu_data); + e = _starpu_darts_gpu_data_not_used_list_next(e)) + { + printf(" %p", e->D); fflush(stdout); + } + printf("\n"); fflush(stdout); + } + printf("\n"); fflush(stdout); +#endif +} + +static void print_pulled_task_one_gpu(struct _starpu_darts_gpu_pulled_task *g, int current_gpu) +{ + (void)g; (void)current_gpu; +#ifdef PRINT + printf("Pulled task for GPU %d:\n", current_gpu); 
fflush(stdout); + struct _starpu_darts_pulled_task *p; + for (p = _starpu_darts_pulled_task_list_begin(tab_gpu_pulled_task[current_gpu].ptl); p != _starpu_darts_pulled_task_list_end(tab_gpu_pulled_task[current_gpu].ptl); p = _starpu_darts_pulled_task_list_next(p)) + { + printf("%p\n", p->pointer_to_pulled_task); fflush(stdout); + } +#endif +} + +static void print_data_not_used_yet_one_gpu(struct _starpu_darts_gpu_planned_task *g, int current_gpu) +{ + (void)g; (void)current_gpu; +#ifdef PRINT + printf("Data not used yet on GPU %d are:\n", current_gpu); fflush(stdout); + if (g->gpu_data != NULL) + { + struct _starpu_darts_gpu_data_not_used *e; + for (e = _starpu_darts_gpu_data_not_used_list_begin(tab_gpu_planned_task[current_gpu].gpu_data); + e != _starpu_darts_gpu_data_not_used_list_end(tab_gpu_planned_task[current_gpu].gpu_data); + e = _starpu_darts_gpu_data_not_used_list_next(e)) + { + printf(" %p", e->D); fflush(stdout); + } + } + printf("\n"); fflush(stdout); +#endif +} + +#ifdef PRINT +static void check_double_in_data_not_used_yet(struct _starpu_darts_gpu_planned_task *g, int current_gpu) +{ + (void)g; + printf("Careful you are using check_double_in_data_not_used_yet it cost time!\n"); fflush(stdout); + struct _starpu_darts_gpu_data_not_used *e1; + for (e1 = _starpu_darts_gpu_data_not_used_list_begin(tab_gpu_planned_task[current_gpu].gpu_data); + e1 != _starpu_darts_gpu_data_not_used_list_end(tab_gpu_planned_task[current_gpu].gpu_data); + e1 = _starpu_darts_gpu_data_not_used_list_next(e1)) + { + struct _starpu_darts_gpu_data_not_used *e2; + for (e2 = _starpu_darts_gpu_data_not_used_list_next(e1); + e2 != _starpu_darts_gpu_data_not_used_list_end(tab_gpu_planned_task[current_gpu].gpu_data); + e2 = _starpu_darts_gpu_data_not_used_list_next(e2)) + { + if (e1->D == e2->D) + { + printf("Data %p is in double on GPU %d!\n", e1->D, current_gpu); fflush(stdout); + print_data_not_used_yet_one_gpu(&tab_gpu_planned_task[current_gpu], current_gpu); fflush(stdout); + exit(1); + } + } + } +} +#endif + +/* Looking if the task can freely be computed by looking at the memory and the data associated from free task in planned task */ +static bool is_my_task_free(int current_gpu, struct starpu_task *task) +{ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + if (STARPU_TASK_GET_HANDLE(task, i)->user_data == NULL) + { + return false; + } + struct _starpu_darts_handle_user_data *hud; + hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(task, i), memory_nodes[current_gpu]) && hud->nb_task_in_planned_task[current_gpu] == 0) + //~ if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(task, i), memory_nodes[current_gpu]) && hud->nb_task_in_planned_task[current_gpu] == 0 && hud->nb_task_in_pulled_task[current_gpu] == 0) + { + return false; + } + } + return true; +} + +/* Initialize for: + * tasks -> pointer to the data it uses, pointer to the pointer of task list in the data, + * pointer to the cell in the main task list (main_task_list). + * data -> pointer to the tasks using this data. + * GPUs -> data not used yet by this GPU. + * In the case with dependencies I have to check if a data need to be added again even if it's struct is empty. 
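+ * Concretely, the wiring set up below is: task->sched_data points to a struct _starpu_darts_pointer_in_task,
+ * each handle->sched_data holds the _starpu_darts_task_using_data_list of the tasks using that data,
+ * and each handle->user_data holds the per-GPU counters of struct _starpu_darts_handle_user_data.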
+ */ +/* Version when you don't have dependencies */ +static void initialize_task_data_gpu_single_task_no_dependencies(struct starpu_task *task, int also_add_data_in_not_used_yet_list) +{ + if (also_add_data_in_not_used_yet_list == 1) + { + /* Adding the data not used yet in all the GPU(s). */ + int i; + for (i = 0; i < _nb_gpus; i++) + { + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(task); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, j); + struct _starpu_darts_gpu_data_not_used *e = _starpu_darts_gpu_data_not_used_new(); + e->D = STARPU_TASK_GET_HANDLE(task, j); + + /* If the data already has an existing structure */ + if (STARPU_TASK_GET_HANDLE(task, j)->user_data != NULL) + { + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, j)->user_data; + + if (hud->last_iteration_DARTS != iteration_DARTS || hud->is_present_in_data_not_used_yet[i] == 0) /* It is a new iteration of the same application, so the data must be re-initialized. */ + { + if (data_order == 1) + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[i].new_gpu_data, e); + } + else + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[i].gpu_data, e); + } + } + } + else + { + if (data_order == 1) + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[i].new_gpu_data, e); + } + else + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[i].gpu_data, e); + } + } + } + } + } + + /* Adding the pointer in the task. */ + struct _starpu_darts_pointer_in_task *pt = malloc(sizeof(*pt)); + pt->pointer_to_cell = task; + pt->pointer_to_D = malloc(get_nbuffer_without_scratch(task)*sizeof(STARPU_TASK_GET_HANDLE(task, 0))); + pt->tud = malloc(get_nbuffer_without_scratch(task)*sizeof(_starpu_darts_task_using_data_new())); + + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + /* Pointer toward the main task list in the handles. */ + struct _starpu_darts_task_using_data *e = _starpu_darts_task_using_data_new(); + e->pointer_to_T = task; + + /* Adding the task in the list of task using the data */ + if (STARPU_TASK_GET_HANDLE(task, i)->sched_data == NULL) + { + struct _starpu_darts_task_using_data_list *tl = _starpu_darts_task_using_data_list_new(); + _starpu_darts_task_using_data_list_push_front(tl, e); + STARPU_TASK_GET_HANDLE(task, i)->sched_data = tl; + } + else + { + _starpu_darts_task_using_data_list_push_front(STARPU_TASK_GET_HANDLE(task, i)->sched_data, e); + } + + /* Init hud in the data containing a way to track the number of task in + * planned and pulled_task but also a way to check last iteration_DARTS for this data and last check for CHOOSE_FROM_MEM=1 + * so we don't look twice at the same data. 
*/ + if (STARPU_TASK_GET_HANDLE(task, i)->user_data == NULL) + { + struct _starpu_darts_handle_user_data *hud = malloc(sizeof(*hud)); + hud->last_iteration_DARTS = iteration_DARTS; + + /* Need to init them with the number of GPU */ + hud->nb_task_in_pulled_task = malloc(_nb_gpus*sizeof(int)); + hud->nb_task_in_planned_task = malloc(_nb_gpus*sizeof(int)); + hud->last_check_to_choose_from = malloc(_nb_gpus*sizeof(int)); + hud->is_present_in_data_not_used_yet = malloc(_nb_gpus*sizeof(int)); + hud->sum_remaining_task_expected_length = starpu_task_expected_length(task, perf_arch, 0); + + int j; + for (j = 0; j < _nb_gpus; j++) + { + hud->nb_task_in_pulled_task[j] = 0; + hud->nb_task_in_planned_task[j] = 0; + hud->last_check_to_choose_from[j] = 0; + hud->is_present_in_data_not_used_yet[j] = 1; + } + + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + else + { + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->sum_remaining_task_expected_length += starpu_task_expected_length(task, perf_arch, 0); + if (hud->last_iteration_DARTS != iteration_DARTS || hud->is_present_in_data_not_used_yet[i] == 0) /* Re-init values in hud. */ + { + int j; + for (j = 0; j < _nb_gpus; j++) + { + hud->nb_task_in_pulled_task[j] = 0; + hud->nb_task_in_planned_task[j] = 0; + hud->last_check_to_choose_from[j] = 0; + hud->is_present_in_data_not_used_yet[j] = 1; + } + hud->last_iteration_DARTS = iteration_DARTS; + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + } + + /* Adding the pointer in the task toward the data. */ + pt->pointer_to_D[i] = STARPU_TASK_GET_HANDLE(task, i); + pt->tud[i] = e; + } + task->sched_data = pt; +} + +/* V3 used for dependencies */ +static void initialize_task_data_gpu_single_task_dependencies(struct starpu_task *task, int also_add_data_in_not_used_yet_list) +{ + unsigned i; + + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + + if (STARPU_TASK_GET_HANDLE(task, i)->user_data == NULL) + { + struct _starpu_darts_handle_user_data *hud = malloc(sizeof(*hud)); + hud->last_iteration_DARTS = iteration_DARTS; + hud->nb_task_in_pulled_task = malloc(_nb_gpus*sizeof(int)); + hud->nb_task_in_planned_task = malloc(_nb_gpus*sizeof(int)); + hud->last_check_to_choose_from = malloc(_nb_gpus*sizeof(int)); + hud->is_present_in_data_not_used_yet = malloc(_nb_gpus*sizeof(int)); + hud->sum_remaining_task_expected_length = starpu_task_expected_length(task, perf_arch, 0); + + _STARPU_SCHED_PRINT("Data is new. 
Expected length in data %p: %f\n", STARPU_TASK_GET_HANDLE(task, i), hud->sum_remaining_task_expected_length); + + int j; + for (j = 0; j < _nb_gpus; j++) + { + struct _starpu_darts_gpu_data_not_used *e = _starpu_darts_gpu_data_not_used_new(); + e->D = STARPU_TASK_GET_HANDLE(task, i); + + hud->nb_task_in_pulled_task[j] = 0; + hud->nb_task_in_planned_task[j] = 0; + hud->last_check_to_choose_from[j] = 0; + hud->is_present_in_data_not_used_yet[j] = 0; + + if (also_add_data_in_not_used_yet_list == 1 && (can_a_data_be_in_mem_and_in_not_used_yet == 1 || !starpu_data_is_on_node(e->D, memory_nodes[j]))) + { + hud->is_present_in_data_not_used_yet[j] = 1; + if (data_order == 1) + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].new_gpu_data, e); + } + else + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].gpu_data, e); + } + } + } + _STARPU_SCHED_PRINT("%p gets 1 at is_present_in_data_not_used_yet from NULL struct hud\n", STARPU_TASK_GET_HANDLE(task, i)); + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + else + { + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + _STARPU_SCHED_PRINT("New task. Expected length in data %p: %f\n", STARPU_TASK_GET_HANDLE(task, i), hud->sum_remaining_task_expected_length); + + if (hud->last_iteration_DARTS != iteration_DARTS) + { + hud->last_iteration_DARTS = iteration_DARTS; + hud->sum_remaining_task_expected_length = starpu_task_expected_length(task, perf_arch, 0); + + int j; + for (j = 0; j < _nb_gpus; j++) + { + struct _starpu_darts_gpu_data_not_used *e = _starpu_darts_gpu_data_not_used_new(); + e->D = STARPU_TASK_GET_HANDLE(task, i); + + hud->nb_task_in_pulled_task[j] = 0; + hud->nb_task_in_planned_task[j] = 0; + hud->last_check_to_choose_from[j] = 0; + hud->is_present_in_data_not_used_yet[j] = 0; + + if (also_add_data_in_not_used_yet_list == 1 && (can_a_data_be_in_mem_and_in_not_used_yet == 1 || !starpu_data_is_on_node(e->D, memory_nodes[j]))) + { + hud->is_present_in_data_not_used_yet[j] = 1; + if (data_order == 1) + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].new_gpu_data, e); + } + else + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].gpu_data, e); + } + print_data_not_used_yet(); + } + } + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + else + { + hud->sum_remaining_task_expected_length += starpu_task_expected_length(task, perf_arch, 0); + + int j; + for (j = 0; j < _nb_gpus; j++) + { + struct _starpu_darts_gpu_data_not_used *e = _starpu_darts_gpu_data_not_used_new(); + e->D = STARPU_TASK_GET_HANDLE(task, i); + + if (hud->is_present_in_data_not_used_yet[j] == 0 && also_add_data_in_not_used_yet_list == 1 && (can_a_data_be_in_mem_and_in_not_used_yet == 1 || !starpu_data_is_on_node(e->D, memory_nodes[j]))) + { + _STARPU_SCHED_PRINT("%p gets 1 at is_present_in_data_not_used_yet on GPU %d\n", STARPU_TASK_GET_HANDLE(task, i), j); + hud->is_present_in_data_not_used_yet[j] = 1; + if (data_order == 1) + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].new_gpu_data, e); + } + else + { + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[j].gpu_data, e); + } + } + } + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + } + } + + /* Adding the pointer in the task. 
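+ * The _starpu_darts_pointer_in_task structure stored in task->sched_data records, for each
+ * non-scratch handle of the task, the handle itself and the cell of its task_using_data list,
+ * so that both can be removed directly when the task gets scheduled.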
*/ + struct _starpu_darts_pointer_in_task *pt = malloc(sizeof(*pt)); + pt->pointer_to_cell = task; + pt->pointer_to_D = malloc(get_nbuffer_without_scratch(task)*sizeof(STARPU_TASK_GET_HANDLE(task, 0))); + pt->tud = malloc(get_nbuffer_without_scratch(task)*sizeof(_starpu_darts_task_using_data_new())); + + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + /* Pointer toward the main task list in the handles. */ + struct _starpu_darts_task_using_data *e = _starpu_darts_task_using_data_new(); + e->pointer_to_T = task; + + /* Adding the task in the list of task using the data */ + if (STARPU_TASK_GET_HANDLE(task, i)->sched_data == NULL) + { + struct _starpu_darts_task_using_data_list *tl = _starpu_darts_task_using_data_list_new(); + _starpu_darts_task_using_data_list_push_front(tl, e); + STARPU_TASK_GET_HANDLE(task, i)->sched_data = tl; + } + else + { + _starpu_darts_task_using_data_list_push_front(STARPU_TASK_GET_HANDLE(task, i)->sched_data, e); + } + + //printf("Adding in tab at position %d out of %d\n", i, get_nbuffer_without_scratch(task) - 1); fflush(stdout); + + /* Adding the pointer in the task toward the data. */ + pt->pointer_to_D[i] = STARPU_TASK_GET_HANDLE(task, i); + pt->tud[i] = e; + } + task->sched_data = pt; +} + +// Merges two subarrays of arr[]. +// First subarray is arr[l..m] +// Second subarray is arr[m+1..r] +static void merge_tab_of_int(int arr[], int l, int m, int r, int tab_of_int[]) +{ + int i, j, k; + int n1 = m - l + 1; + int n2 = r - m; + + /* create temp arrays */ + int L[n1], R[n2]; + int L_task_tab[n1]; + int R_task_tab[n2]; + + /* Copy data to temp arrays L[] and R[] */ + for (i = 0; i < n1; i++) + { + L[i] = arr[l + i]; + L_task_tab[i] = tab_of_int[l + i]; + } + for (j = 0; j < n2; j++) + { + R[j] = arr[m + 1 + j]; + R_task_tab[j] = tab_of_int[m + 1 + j]; + } + + /* Merge the temp arrays back into arr[l..r]*/ + i = 0; // Initial index of first subarray + j = 0; // Initial index of second subarray + k = l; // Initial index of merged subarray + while (i < n1 && j < n2) + { + if (L[i] <= R[j]) + { + arr[k] = L[i]; + tab_of_int[k] = L_task_tab[i]; + i++; + } + else + { + arr[k] = R[j]; + tab_of_int[k] = R_task_tab[j]; + j++; + } + k++; + } + + /* Copy the remaining elements of L[], if there + are any */ + while (i < n1) + { + arr[k] = L[i]; + tab_of_int[k] = L_task_tab[i]; + i++; + k++; + } + + /* Copy the remaining elements of R[], if there + are any */ + while (j < n2) + { + arr[k] = R[j]; + tab_of_int[k] = R_task_tab[j]; + j++; + k++; + } +} + +/* l is for left index and r is right index of the +sub-array of arr to be sorted */ +static void mergeSort_tab_of_int(int *arr, int l, int r, int *tab_of_int) +{ + if (l < r) + { + // Same as (l+r)/2, but avoids overflow for + // large l and h + int m = l + (r - l) / 2; + + // Sort first and second halves + mergeSort_tab_of_int(arr, l, m, tab_of_int); + mergeSort_tab_of_int(arr, m + 1, r, tab_of_int); + + merge_tab_of_int(arr, l, m, r, tab_of_int); + } +} + +static void _starpu_darts_increment_planned_task_data(struct starpu_task *task, int current_gpu) +{ + /* Careful, we do not want duplicates here */ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->nb_task_in_planned_task[current_gpu] = hud->nb_task_in_planned_task[current_gpu] + 1; + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } +} + +/* Pushing 
the tasks. Each time a new task enter here, we initialize it. */ +static int darts_push_task(struct starpu_sched_component *component, struct starpu_task *task) +{ +#ifdef PRINT + printf("New task %p (%s, prio: %d, length: %f) in push_task with data(s):", task, starpu_task_get_name(task), task->priority, starpu_task_expected_length(task, perf_arch, 0)); fflush(stdout); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if (STARPU_TASK_GET_MODE(task, i) & STARPU_SCRATCH) + { + printf(" %p mode is STARPU_SCRATCH\n", STARPU_TASK_GET_HANDLE(task, i)); fflush(stdout); + } + else if (STARPU_TASK_GET_MODE(task, i) & STARPU_REDUX) + { + printf(" %p mode is STARPU_REDUX\n", STARPU_TASK_GET_HANDLE(task, i)); fflush(stdout); + } + else + { + printf(" %p mode is R-RW-W", STARPU_TASK_GET_HANDLE(task, i)); fflush(stdout); + } + } + printf("\n"); fflush(stdout); +#endif + + _REFINED_MUTEX_LOCK(); + _LINEAR_MUTEX_LOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_initialisation; + gettimeofday(&time_start_initialisation, NULL); +#endif + +#ifdef PRINT + int x; + for (x = 0; x < _nb_gpus; x++) + { + check_double_in_data_not_used_yet(&tab_gpu_planned_task[x], x); + } +#endif + + /* If push_free_task_on_gpu_with_least_task_in_planned_task is not set to 1, these two variables are not useful */ + int *sorted_gpu_list_by_nb_task_in_planned_task = NULL; + int *planned_task_sizes = NULL; + + /* Pushing free task directly in a gpu's planned task. */ + if (push_free_task_on_gpu_with_least_task_in_planned_task == 1) /* Getting the gpu with the least tasks in planned task */ + { + sorted_gpu_list_by_nb_task_in_planned_task = malloc(_nb_gpus*sizeof(int)); + planned_task_sizes = malloc(_nb_gpus*sizeof(int)); + int j; + for (j = 0; j < _nb_gpus; j++) + { + sorted_gpu_list_by_nb_task_in_planned_task[j] = j; + planned_task_sizes[j] = starpu_task_list_size(&tab_gpu_planned_task[j].planned_task); + } + mergeSort_tab_of_int(planned_task_sizes, 0, _nb_gpus - 1, sorted_gpu_list_by_nb_task_in_planned_task); + } + + if (push_free_task_on_gpu_with_least_task_in_planned_task == 2) + { + round_robin_free_task++; + } + + int j; + for (j = 0; j < _nb_gpus; j++) + { + int gpu_looked_at; + if (push_free_task_on_gpu_with_least_task_in_planned_task == 1) + { + gpu_looked_at = sorted_gpu_list_by_nb_task_in_planned_task[j]; + } + else if (push_free_task_on_gpu_with_least_task_in_planned_task == 2) + { + gpu_looked_at = (j + round_robin_free_task)%_nb_gpus; + } + else + { + gpu_looked_at = j; + } + + _STARPU_SCHED_PRINT("gpu_looked_at = %d\n", gpu_looked_at); + + if (is_my_task_free(gpu_looked_at, task)) + { + _STARPU_SCHED_PRINT("Task %p is free from push_task\n", task); + if (dependances == 1) + { + initialize_task_data_gpu_single_task_dependencies(task, 0); + } + else + { + initialize_task_data_gpu_single_task_no_dependencies(task, 0); + } + + _starpu_darts_increment_planned_task_data(task, gpu_looked_at); + struct _starpu_darts_pointer_in_task *pt = task->sched_data; + unsigned y; + for (y = 0; y < STARPU_TASK_GET_NBUFFERS(task); y++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, y); + if (pt->tud[y] != NULL) + { + _starpu_darts_task_using_data_list_erase(pt->pointer_to_D[y]->sched_data, pt->tud[y]); + pt->tud[y] = NULL; + } + } + + _STARPU_SCHED_PRINT("Free task from push %p is put in planned task\n", task); + + /* Now push this free task into planned task. This can be done at the beginning of the list or after the last free task in planned task. 
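+ * With free_pushed_task_position == 0 the task is pushed at the head of planned_task;
+ * otherwise it is inserted just before the first task whose data are not all on the target
+ * memory node (i.e. after the last free task), or at the back if every planned task is free.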
*/ + if (free_pushed_task_position == 0) + { + starpu_task_list_push_front(&tab_gpu_planned_task[gpu_looked_at].planned_task, task); + } + else + { + /* Après la dernière tâche gratuite de planned task. */ + struct starpu_task *checked_task; + for (checked_task = starpu_task_list_begin(&tab_gpu_planned_task[gpu_looked_at].planned_task); checked_task != starpu_task_list_end(&tab_gpu_planned_task[gpu_looked_at].planned_task); checked_task = starpu_task_list_next(checked_task)) + { + for (y = 0; y < STARPU_TASK_GET_NBUFFERS(checked_task); y++) + { + STARPU_IGNORE_UTILITIES_HANDLES(checked_task, y); + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(checked_task, y), memory_nodes[gpu_looked_at])) + { + starpu_task_list_insert_before(&tab_gpu_planned_task[gpu_looked_at].planned_task, task, checked_task); + + if (push_free_task_on_gpu_with_least_task_in_planned_task == 1) + { + free(sorted_gpu_list_by_nb_task_in_planned_task); + free(planned_task_sizes); + } + + /* End now push task, no push in main task list or GPUs data */ + starpu_push_task_end(task); + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + component->can_pull(component); + return 0; + } + } + } + + /* Else push back */ + starpu_task_list_push_back(&tab_gpu_planned_task[gpu_looked_at].planned_task, task); + } + + if (push_free_task_on_gpu_with_least_task_in_planned_task == 1) + { + free(sorted_gpu_list_by_nb_task_in_planned_task); + free(planned_task_sizes); + } + + /* End now push task, no push in main task list or GPUs data */ + starpu_push_task_end(task); + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + component->can_pull(component); + return 0; + } + } + + if (push_free_task_on_gpu_with_least_task_in_planned_task == 1) + { + free(sorted_gpu_list_by_nb_task_in_planned_task); + free(planned_task_sizes); + } + + new_tasks_initialized = true; + + if (dependances == 1) + { + initialize_task_data_gpu_single_task_dependencies(task, 1); + } + else + { + initialize_task_data_gpu_single_task_no_dependencies(task, 1); + } + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_initialisation; + gettimeofday(&time_end_initialisation, NULL); + time_total_initialisation += (time_end_initialisation.tv_sec - time_start_initialisation.tv_sec)*1000000LL + time_end_initialisation.tv_usec - time_start_initialisation.tv_usec; +#endif + + /* Pushing the task in sched_list. It's this list that will be randomized + * and put in main_task_list in pull_task. + */ + struct _starpu_darts_sched_data *data = component->data; + if (task_order == 2 && dependances == 1) /* Cas ordre naturel mais avec dépendances. Pas de points de départs différents. Je met dans le back de la liste de tâches principales. 
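+ * (Natural task order with dependencies: there is no per-GPU starting point, the task is
+ * simply pushed at the back of the main task list.)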
*/ + { + starpu_task_list_push_back(&data->main_task_list, task); + } + else + { + starpu_task_list_push_front(&data->sched_list, task); + NT_DARTS++; + } + starpu_push_task_end(task); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + component->can_pull(component); + return 0; +} + +static void merge(int arr[], int l, int m, int r, struct starpu_task **task_tab) +{ + int i, j, k; + int n1 = m - l + 1; + int n2 = r - m; + + /* create temp arrays */ + int L[n1], R[n2]; + struct starpu_task *L_task_tab[n1]; + struct starpu_task *R_task_tab[n2]; + + /* Copy data to temp arrays L[] and R[] */ + for (i = 0; i < n1; i++) + { + L[i] = arr[l + i]; + L_task_tab[i] = task_tab[l + i]; + } + for (j = 0; j < n2; j++) + { + R[j] = arr[m + 1 + j]; + R_task_tab[j] = task_tab[m + 1 + j]; + } + + /* Merge the temp arrays back into arr[l..r]*/ + i = 0; // Initial index of first subarray + j = 0; // Initial index of second subarray + k = l; // Initial index of merged subarray + while (i < n1 && j < n2) + { + if (L[i] <= R[j]) + { + arr[k] = L[i]; + task_tab[k] = L_task_tab[i]; + i++; + } + else + { + arr[k] = R[j]; + task_tab[k] = R_task_tab[j]; + j++; + } + k++; + } + + /* Copy the remaining elements of L[], if there + are any */ + while (i < n1) + { + arr[k] = L[i]; + task_tab[k] = L_task_tab[i]; + i++; + k++; + } + + /* Copy the remaining elements of R[], if there + are any */ + while (j < n2) + { + arr[k] = R[j]; + task_tab[k] = R_task_tab[j]; + j++; + k++; + } +} + +/* l is for left index and r is right index of the +sub-array of arr to be sorted */ +static void mergeSort(int *arr, int l, int r, struct starpu_task **task_tab) +{ + if (l < r) + { + // Same as (l+r)/2, but avoids overflow for + // large l and h + int m = l + (r - l) / 2; + + // Sort first and second halves + mergeSort(arr, l, m, task_tab); + mergeSort(arr, m + 1, r, task_tab); + + merge(arr, l, m, r, task_tab); + } +} + +/* Randomize only sched_data, so new task that were made available. */ +static void randomize_new_task_list(struct _starpu_darts_sched_data *d) +{ + int i = 0; + struct starpu_task *task_tab[NT_DARTS]; /* NT_DARTS is the number of "new" available tasks. */ + + for (i = 0; i < NT_DARTS; i++) + { + task_tab[i] = starpu_task_list_pop_front(&d->sched_list); + } + for (i = 0; i < NT_DARTS; i++) + { + int random = rand()%(NT_DARTS - i); + starpu_task_list_push_back(&d->main_task_list, task_tab[random]); + task_tab[random] = task_tab[NT_DARTS - i - 1]; + } +} + +/* Randomize the full set of tasks. */ +static void randomize_full_task_list(struct _starpu_darts_sched_data *d) +{ + /* Version where I choose random numbers for each task, then sort + * at the same time with a merge sort of the array of random integers and the array of + * tasks. Then I run through the main task list, inserting 1 by 1 the + * tasks at their positions. */ + int i; + int size_main_task_list = starpu_task_list_size(&d->main_task_list); + struct starpu_task *task_tab[NT_DARTS]; + struct starpu_task *task = NULL; + int random_number[NT_DARTS]; + int avancement_main_task_list = 0; + + /* Fill in a table with the new tasks + draw a random number + * for each task. */ + for (i = 0; i < NT_DARTS; i++) + { + task_tab[i] = starpu_task_list_pop_front(&d->sched_list); + random_number[i] = rand()%size_main_task_list; + } + + /* Appel du tri fusion. */ + mergeSort(random_number, 0, NT_DARTS - 1, task_tab); + + /* Filling the main task list in order and according to the number drawn. 
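+ * The random keys were sorted together with the tasks, so a single forward walk through
+ * main_task_list is enough: each task is inserted before the element reached at its key.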
*/ + task = starpu_task_list_begin(&d->main_task_list); + for (i = 0; i < NT_DARTS; i++) + { + int j; + for (j = avancement_main_task_list; j < random_number[i]; j++) + { + task = starpu_task_list_next(task); + avancement_main_task_list++; + } + starpu_task_list_insert_before(&d->main_task_list, task_tab[i], task); + } +} + +/* Each GPU has a pointer to its first task to pop. + * Then in scheduling, when you pop for the first time, that's the one. + * In addition, the first pop is managed directly outside random + * thanks to the first_task attribute of struct planned_task. */ +static void natural_order_task_list(struct _starpu_darts_sched_data *d) +{ + int j=0; + int i; + for (i = 0; i < NT_DARTS; i++) + { + if (i == (NT_DARTS/_nb_gpus)*j && j < _nb_gpus) + { + struct starpu_task *task; + task = starpu_task_list_pop_front(&d->sched_list); + tab_gpu_planned_task[j].first_task_to_pop = task; + starpu_task_list_push_back(&d->main_task_list, task); + j++; + } + else + { + starpu_task_list_push_back(&d->main_task_list, starpu_task_list_pop_front(&d->sched_list)); + } + } +} + +/* Randomize the full list of data not used yet for all the GPU. */ +static void randomize_full_data_not_used_yet() +{ + int i; + for (i = 0; i < _nb_gpus; i++) + { + int number_of_data = _starpu_darts_gpu_data_not_used_list_size(tab_gpu_planned_task[i].gpu_data); + struct _starpu_darts_gpu_data_not_used *data_tab[number_of_data]; + int j; + for (j = 0; j < number_of_data; j++) + { + data_tab[j] = _starpu_darts_gpu_data_not_used_list_pop_front(tab_gpu_planned_task[i].gpu_data); + } + struct _starpu_darts_gpu_data_not_used_list *randomized_list = _starpu_darts_gpu_data_not_used_list_new(); + + for (j = 0; j < number_of_data; j++) + { + int random = rand()%(number_of_data - j); + _starpu_darts_gpu_data_not_used_list_push_back(randomized_list, data_tab[random]); + + /* I replace the box with the last task in the table */ + data_tab[random] = data_tab[number_of_data - j - 1]; + } + /* Then replace the list with it. */ + tab_gpu_planned_task[i].gpu_data = randomized_list; + } +} + +/* Randomize the new data and put them at the end of datanotused for all the GPU. */ +static void randomize_new_data_not_used_yet() +{ + int i; + for (i = 0; i < _nb_gpus; i++) + { + if (!_starpu_darts_gpu_data_not_used_list_empty(tab_gpu_planned_task[i].new_gpu_data)) + { + int number_new_data = _starpu_darts_gpu_data_not_used_list_size(tab_gpu_planned_task[i].new_gpu_data); + struct _starpu_darts_gpu_data_not_used *data_tab[number_new_data]; + int j; + for (j = 0; j < number_new_data; j++) + { + data_tab[j] = _starpu_darts_gpu_data_not_used_list_pop_front(tab_gpu_planned_task[i].new_gpu_data); + } + for (j = 0; j < number_new_data; j++) + { + int random = rand()%(number_new_data - j); + _starpu_darts_gpu_data_not_used_list_push_back(tab_gpu_planned_task[i].gpu_data, data_tab[random]); + + data_tab[random] = data_tab[number_new_data - j - 1]; + } + } + } +} + +/* The set of task is not randomized. + * To make GPU work on different part of the applications they all have a version of the task list that start at a different position. + * GPU1 starts at the first task, GPU2 at the n/NGPUth task etc... */ +static void natural_order_data_not_used_yet() +{ + /* I need this for the %random. 
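+ * number_of_data also gives the size of the rotation applied below: GPU i starts its
+ * gpu_data list (number_of_data/_nb_gpus)*i entries further than GPU 0.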
*/
+ int number_of_data = _starpu_darts_gpu_data_not_used_list_size(tab_gpu_planned_task[0].gpu_data);
+
+ struct _starpu_darts_gpu_data_not_used *data_tab[number_of_data];
+ int i;
+ for (i = 1; i < _nb_gpus; i++)
+ {
+ int j;
+ for (j = 0; j < (number_of_data/_nb_gpus)*i; j++)
+ {
+ data_tab[j] = _starpu_darts_gpu_data_not_used_list_pop_front(tab_gpu_planned_task[i].gpu_data);
+ }
+ struct _starpu_darts_gpu_data_not_used_list *natural_order_list = _starpu_darts_gpu_data_not_used_list_new();
+ for (j = 0; j < number_of_data - ((number_of_data/_nb_gpus)*i); j++)
+ {
+ _starpu_darts_gpu_data_not_used_list_push_back(natural_order_list, _starpu_darts_gpu_data_not_used_list_pop_front(tab_gpu_planned_task[i].gpu_data));
+ }
+ for (j = 0; j < (number_of_data/_nb_gpus)*i; j++)
+ {
+ _starpu_darts_gpu_data_not_used_list_push_back(natural_order_list, data_tab[j]);
+ }
+
+ /* Then replace the list with it. */
+ tab_gpu_planned_task[i].gpu_data = natural_order_list;
+ }
+}
+
+/* Update the "best" data if the candidate data has better values. */
+static void update_best_data_single_decision_tree(int *number_free_task_max, double *remaining_expected_length_max, starpu_data_handle_t *handle_popped, int *priority_max, int *number_1_from_free_task_max, int nb_free_task_candidate, double remaining_expected_length_candidate, starpu_data_handle_t handle_candidate, int priority_candidate, int number_1_from_free_task_candidate, int *data_chosen_index, int i, struct starpu_task* *best_1_from_free_task, struct starpu_task *best_1_from_free_task_candidate, double transfer_min_candidate, double *transfer_min, double temp_length_free_tasks_max, double *ratio_transfertime_freetask_min)
+{
+ (void)data_chosen_index;
+ (void)i;
+ double ratio_transfertime_freetask_candidate = 0;
+
+ /* There are more returns than updates, so the conditions are reversed: if none of them returns, the candidate becomes the new "best" data. 
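+ * dopt_selection_order (STARPU_DARTS_DOPT_SELECTION_ORDER) selects in which order the
+ * tie-breaking criteria are applied: number of free tasks, task priority, number of
+ * 1-from-free tasks, expected transfer time and remaining expected task length.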
*/ + if (dopt_selection_order == 0) + { + /* First tiebreak with most free task */ + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + /* Then with number of 1 from free */ + else if (nb_free_task_candidate == *number_free_task_max) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if (number_1_from_free_task_candidate == *number_1_from_free_task_max) + { + /* Then with priority */ + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + /* Then with time of task in the list of task using this data */ + else if ((*priority_max == priority_candidate || prio == 0) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + else if (dopt_selection_order == 1) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if ((number_1_from_free_task_candidate == *number_1_from_free_task_max) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + else if (dopt_selection_order == 2) + { + if (transfer_min_candidate > *transfer_min) + { + return; + } + else if (transfer_min_candidate == *transfer_min) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if ((number_1_from_free_task_candidate == *number_1_from_free_task_max) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + } + else if (dopt_selection_order == 3) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (transfer_min_candidate > *transfer_min) + { + return; + } + else if (transfer_min_candidate == *transfer_min) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if ((number_1_from_free_task_candidate == *number_1_from_free_task_max) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + } + else if (dopt_selection_order == 4) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && 
*priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (transfer_min_candidate > *transfer_min) + { + return; + } + else if (transfer_min_candidate == *transfer_min) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if ((number_1_from_free_task_candidate == *number_1_from_free_task_max) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + } + else if (dopt_selection_order == 5) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if (number_1_from_free_task_candidate == *number_1_from_free_task_max) + { + if (transfer_min_candidate > *transfer_min) + { + return; + } + else if ((transfer_min_candidate == *transfer_min) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + } + else if (dopt_selection_order == 6) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if (number_1_from_free_task_candidate == *number_1_from_free_task_max) + { + if (remaining_expected_length_candidate < *remaining_expected_length_max) + { + return; + } + else if ((remaining_expected_length_candidate == *remaining_expected_length_max) && transfer_min_candidate >= *transfer_min) + { +#ifdef STARPU_DARTS_STATS + if (transfer_min_candidate == *transfer_min) + { + data_choice_per_index = true; + } +#endif + return; + } + } + } + } + } + else if (dopt_selection_order == 7) + { + if (temp_length_free_tasks_max == 0) + { + ratio_transfertime_freetask_candidate = DBL_MAX; + } + else + { + ratio_transfertime_freetask_candidate = transfer_min_candidate/temp_length_free_tasks_max; + } + if (ratio_transfertime_freetask_candidate > *ratio_transfertime_freetask_min) + { + return; + } + else if (ratio_transfertime_freetask_candidate == *ratio_transfertime_freetask_min) + { + if (nb_free_task_candidate < *number_free_task_max) + { + return; + } + else if (nb_free_task_candidate == *number_free_task_max) + { + if (prio == 1 && *priority_max > priority_candidate) + { + return; + } + else if (*priority_max == priority_candidate) + { + if (number_1_from_free_task_candidate < *number_1_from_free_task_max) + { + return; + } + else if ((number_1_from_free_task_candidate == *number_1_from_free_task_max) && remaining_expected_length_candidate <= *remaining_expected_length_max) + { +#ifdef STARPU_DARTS_STATS + if (remaining_expected_length_candidate == *remaining_expected_length_max) + { + data_choice_per_index = true; + } +#endif + + return; + } + } + } + } + } + else + { + printf("Wrong value for 
STARPU_DARTS_DOPT_SELECTION_ORDER\n"); fflush(stdout); + exit(EXIT_FAILURE); + } + +#ifdef STARPU_DARTS_STATS + data_choice_per_index = false; +#endif + + /* We have a new "best" data! pdate */ + *number_free_task_max = nb_free_task_candidate; + *remaining_expected_length_max = remaining_expected_length_candidate; + *number_1_from_free_task_max = number_1_from_free_task_candidate; + *handle_popped = handle_candidate; + *priority_max = priority_candidate; + *best_1_from_free_task = best_1_from_free_task_candidate; + *transfer_min = transfer_min_candidate; + *ratio_transfertime_freetask_min = ratio_transfertime_freetask_candidate; + +#ifdef STARPU_DARTS_STATS + *data_chosen_index = i + 1; +#endif +} + +static struct starpu_task *get_highest_priority_task(struct starpu_task_list *l) +{ + int max_priority = INT_MIN; + struct starpu_task *highest_priority_task = starpu_task_list_begin(l); + struct starpu_task *t; + for (t = starpu_task_list_begin(l); t != starpu_task_list_end(l); t = starpu_task_list_next(t)) + { + if (t->priority > max_priority) + { + max_priority = t->priority; + highest_priority_task = t; + } + } + return highest_priority_task; +} + +/* Erase a task from the main task list. + * Also erase pointer in the data. + * Only for one GPU. + * Also update the expected length of task using this data. + */ +static void _starpu_darts_erase_task_and_data_pointer(struct starpu_task *task, struct starpu_task_list *l) +{ + struct _starpu_darts_pointer_in_task *pt = task->sched_data; + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(task); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, j); + if (pt->tud[j] != NULL) + { + _starpu_darts_task_using_data_list_erase(pt->pointer_to_D[j]->sched_data, pt->tud[j]); + pt->tud[j] = NULL; + } + + /* Reduce expected length of task using this data */ + struct _starpu_darts_handle_user_data *hud = pt->pointer_to_D[j]->user_data; + hud->sum_remaining_task_expected_length -= starpu_task_expected_length(task, perf_arch, 0); + _STARPU_SCHED_PRINT("Adding in planned task. Expected length in data %p: %f\n", STARPU_TASK_GET_HANDLE(task, j), hud->sum_remaining_task_expected_length); + + pt->pointer_to_D[j]->user_data = hud; + } + starpu_task_list_erase(l, pt->pointer_to_cell); +} + +/* Main function of DARTS scheduling. + * Takes the set of available task, the GPU (or CPU) asking for work as an input. + * Chooses the best data that is not yet in memory of the PU and fill a buffer of task with task associated with this data. */ +static void _starpu_darts_scheduling_3D_matrix(struct starpu_task_list *main_task_list, int current_gpu, struct _starpu_darts_gpu_planned_task *g) +{ + _STARPU_SCHED_PRINT("Début de sched 3D avec GPU %d.\n", current_gpu); + + Dopt[current_gpu] = NULL; + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_schedule; + gettimeofday(&time_start_schedule, NULL); +#endif + double remaining_expected_length_max = 0; + struct starpu_task *best_1_from_free_task = NULL; + struct starpu_task *temp_best_1_from_free_task = NULL; + + /* Values used to know if the currently selected data is better that the pne already chosen */ + int number_free_task_max = 0; /* Number of free task with selected data */ + int temp_number_free_task_max = 0; + int number_1_from_free_task_max = 0; /* Number of task on from free with selected data */ + int temp_number_1_from_free_task_max = 0; + int priority_max = INT_MIN; /* Highest priority of a task using the chosen data that we know will be pushed to planned task. 
*/
+ int temp_priority_max = INT_MIN; /* Highest priority of a task using the chosen data that we know will be pushed to planned task. */
+
+ /* To try and use starpu_data_expected_transfer_time */
+ double transfer_time_min = DBL_MAX;
+ double temp_transfer_time_min = DBL_MAX;
+
+ /* For the case where DOPT_SELECTION_ORDER >= 7: the ratio transfer time / length of the free tasks is used, so we need to keep track of the length of the free tasks. */
+ double ratio_transfertime_freetask_min = DBL_MAX;
+ double temp_length_free_tasks_max = 0;
+
+ starpu_data_handle_t handle_popped = NULL; /* Pointer to chosen best data */
+
+ struct _starpu_darts_handle_user_data *hud = NULL;
+
+ int data_chosen_index = 0; /* Forced to declare it here because of the update function */
+#ifdef STARPU_DARTS_STATS
+ int nb_data_looked_at = 0; /* Only for the case where we choose from memory */
+#endif
+
+ _STARPU_SCHED_PRINT("There are %d data to choose from for GPU %d.\n", _starpu_darts_gpu_data_not_used_list_size(g->gpu_data), current_gpu);
+ /* If it's the first task of the GPU, no need to schedule anything, just return it. */
+ if (g->first_task == true)
+ {
+ _STARPU_SCHED_PRINT("This is the first task of GPU %d.\n", current_gpu);
+#ifdef STARPU_DARTS_STATS
+ if (iteration_DARTS == 1)
+ {
+ FILE *f = NULL;
+ int size = strlen(_output_directory) + strlen("/Data_DARTS_data_chosen_stats_GPU_.csv") + 3;
+ char path[size];
+ snprintf(path, size, "%s%s%d%s", _output_directory, "/Data_DARTS_data_chosen_stats_GPU_", current_gpu, ".csv");
+ f = fopen(path, "a");
+ fprintf(f, "%d,%d,%d,%d\n", g->number_data_selection, 0, 0, 0);
+ fclose(f);
+ }
+#endif
+
+ g->first_task = false;
+
+ if (task_order == 2 && dependances == 0) /* Natural task and data order without dependencies: each GPU gets a different starting point through first_task_to_pop. */
+ {
+ struct starpu_task *task = NULL;
+ task = g->first_task_to_pop;
+ g->first_task_to_pop = NULL;
+ if (!starpu_task_list_ismember(main_task_list, task))
+ {
+ goto random;
+ }
+
+ unsigned x;
+ for (x = 0; x < STARPU_TASK_GET_NBUFFERS(task); x++)
+ {
+ STARPU_IGNORE_UTILITIES_HANDLES(task, x);
+ if (!_starpu_darts_gpu_data_not_used_list_empty(g->gpu_data))
+ {
+ struct _starpu_darts_gpu_data_not_used *e;
+ for (e = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); e != _starpu_darts_gpu_data_not_used_list_end(g->gpu_data); e = _starpu_darts_gpu_data_not_used_list_next(e))
+ {
+ if (e->D == STARPU_TASK_GET_HANDLE(task, x))
+ {
+ _starpu_darts_gpu_data_not_used_list_erase(g->gpu_data, e);
+ hud = e->D->user_data;
+ _STARPU_SCHED_PRINT("%p gets 0 at is_present_in_data_not_used_yet GPU is %d\n", e->D, current_gpu);
+ hud->is_present_in_data_not_used_yet[current_gpu] = 0;
+ e->D->user_data = hud;
+
+ _starpu_darts_gpu_data_not_used_delete(e);
+ break;
+ }
+ }
+ }
+ }
+
+ _REFINED_MUTEX_LOCK();
+
+ /* Update the planned task counters of the task's data. */
+ _starpu_darts_increment_planned_task_data(task, current_gpu);
+
+ _starpu_darts_erase_task_and_data_pointer(task, main_task_list);
+ starpu_task_list_push_back(&g->planned_task, task);
+
+ _REFINED_MUTEX_UNLOCK();
+
+ goto end_scheduling;
+ }
+ else
+ {
+ goto random;
+ }
+ }
+
+ if (_starpu_darts_gpu_data_not_used_list_empty(g->gpu_data))
+ {
+ _STARPU_SCHED_PRINT("Random selection because the list of unused data is empty.\n");
+ goto random;
+ }
+
+ /* To know whether all the data needed by a task are loaded in memory. 
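+ * data_not_available counts, for each candidate task, how many of its other handles are
+ * missing from the target node (with simulate_memory == 1, handles already counted in
+ * planned or pulled tasks are considered present): 0 means the task is free, 1 means it is
+ * one data away from being free.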
*/ + int data_not_available = 0; + bool data_available = true; + + /* If threshold is different than 0 (set with env var STARPU_DARTS_THRESHOLD), less data are explored to reduce complexity. */ + int choose_best_data_threshold = INT_MAX; + if (threshold == 1) + { + if (app == 0) + { + if (NT_DARTS > 14400) + { + choose_best_data_threshold = 110; + } + } + else if (NT_DARTS > 1599 && dependances == 0) + { + choose_best_data_threshold = 200; + } + else if (dependances == 1) + { + choose_best_data_threshold = 200; + } + } + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_choose_best_data; + gettimeofday(&time_start_choose_best_data, NULL); +#endif + +#ifdef STARPU_DARTS_STATS + data_choice_per_index = false; +#endif + + if (choose_best_data_from == 0) /* We explore all unused data. In the else case we look at data from the task associated with the data already in memory. */ + { +#ifdef STARPU_DARTS_STATS + g->number_data_selection++; +#endif + + int i=0; + struct _starpu_darts_gpu_data_not_used *e; + for (e = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); e != _starpu_darts_gpu_data_not_used_list_end(g->gpu_data) && i != choose_best_data_threshold; e = _starpu_darts_gpu_data_not_used_list_next(e), i++) + { + temp_transfer_time_min = starpu_data_expected_transfer_time(e->D, current_gpu, STARPU_R); + _STARPU_SCHED_PRINT("Temp transfer time is %f\n", temp_transfer_time_min); + temp_number_free_task_max = 0; + temp_number_1_from_free_task_max = 0; + temp_priority_max = INT_MIN; + temp_best_1_from_free_task = NULL; + temp_length_free_tasks_max = 0; + +#ifdef STARPU_DARTS_STATS + nb_data_looked_at++; +#endif + + if (e->D->sched_data != NULL) + { + struct _starpu_darts_task_using_data *t; + for (t = _starpu_darts_task_using_data_list_begin(e->D->sched_data); t != _starpu_darts_task_using_data_list_end(e->D->sched_data); t = _starpu_darts_task_using_data_list_next(t)) + { + /* I put it at false if at least one data is missing. */ + data_not_available = 0; + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(t->pointer_to_T); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(t->pointer_to_T, j); + /* I test if the data is on memory */ + if (STARPU_TASK_GET_HANDLE(t->pointer_to_T, j) != e->D) + { + if (simulate_memory == 0) + { + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu])) + { + data_not_available++; + } + } + else if (simulate_memory == 1) + { + hud = STARPU_TASK_GET_HANDLE(t->pointer_to_T, j)->user_data; + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu]) && hud->nb_task_in_pulled_task[current_gpu] == 0 && hud->nb_task_in_planned_task[current_gpu] == 0) + { + data_not_available++; + } + } + } + } + + if (data_not_available == 0) + { + temp_number_free_task_max++; + + /* With threshold == 2, we stop as soon as we find a data that allow at least one fee task. */ + if (threshold == 2) + { + hud = e->D->user_data; + update_best_data_single_decision_tree(&number_free_task_max, &remaining_expected_length_max, &handle_popped, &priority_max, &number_1_from_free_task_max, temp_number_free_task_max, hud->sum_remaining_task_expected_length, e->D, temp_priority_max, temp_number_1_from_free_task_max, &data_chosen_index, i, &best_1_from_free_task, temp_best_1_from_free_task, temp_transfer_time_min, &transfer_time_min, temp_length_free_tasks_max, &ratio_transfertime_freetask_min); + + goto end_choose_best_data; + } + + /* For the first one I want to forget priority of one from free tasks. 
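+ * i.e. when the first free task is found, the priority recorded so far (possibly coming from
+ * a 1-from-free task) is overwritten by the priority of that free task.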
*/ + if (temp_number_free_task_max == 1) + { + temp_priority_max = t->pointer_to_T->priority; + } + else if (t->pointer_to_T->priority > temp_priority_max) + { + temp_priority_max = t->pointer_to_T->priority; + } + + temp_length_free_tasks_max += starpu_task_expected_length(t->pointer_to_T, perf_arch, 0); + } + else if (data_not_available == 1) + { + temp_number_1_from_free_task_max++; + + /* Getting the max priority */ + if (t->pointer_to_T->priority > temp_priority_max) + { + temp_priority_max = t->pointer_to_T->priority; + temp_best_1_from_free_task = t->pointer_to_T; + } + } + } + + /* Checking if current data is better */ + hud = e->D->user_data; + update_best_data_single_decision_tree(&number_free_task_max, &remaining_expected_length_max, &handle_popped, &priority_max, &number_1_from_free_task_max, temp_number_free_task_max, hud->sum_remaining_task_expected_length, e->D, temp_priority_max, temp_number_1_from_free_task_max, &data_chosen_index, i, &best_1_from_free_task, temp_best_1_from_free_task, temp_transfer_time_min, &transfer_time_min, temp_length_free_tasks_max, &ratio_transfertime_freetask_min); + } + } + } + else if (choose_best_data_from == 1) /* The case where I only look at data (not yet in memory) from tasks using data in memory. */ + { + /* To avoid looking at the same data twice in the same iteration. */ + struct _starpu_darts_handle_user_data *hud_last_check = NULL; + + /* Be careful here it's useful not to put it between ifdef! Because I use it to know if I haven't already looked at the data */ + g->number_data_selection++; + + /* Récupération des données en mémoire */ + starpu_data_handle_t *data_on_node; + unsigned nb_data_on_node = 0; + int *valid; + starpu_data_get_node_data(current_gpu, &data_on_node, &valid, &nb_data_on_node); + + /* I put myself on a data in memory. */ + unsigned x; + for (x = 0; x < nb_data_on_node; x++) + { + STARPU_IGNORE_UTILITIES_HANDLES_FROM_DATA(data_on_node[x]); + _STARPU_SCHED_PRINT("On data nb %d/%d from memory\n", x, nb_data_on_node); + + /* Je me met sur une tâche de cette donnée en question. */ + struct _starpu_darts_task_using_data *t2; + for (t2 = _starpu_darts_task_using_data_list_begin(data_on_node[x]->sched_data); t2 != _starpu_darts_task_using_data_list_end(data_on_node[x]->sched_data); t2 = _starpu_darts_task_using_data_list_next(t2)) + { + _STARPU_SCHED_PRINT("On task %p from this data\n", t2); + + /* I set myself to a data item of this task (which is not the one in memory). */ + unsigned k; + for (k = 0; k < STARPU_TASK_GET_NBUFFERS(t2->pointer_to_T); k++) + { + STARPU_IGNORE_UTILITIES_HANDLES(t2->pointer_to_T, k); + _STARPU_SCHED_PRINT("On data %p from this task\n", STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)); + hud_last_check = STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->user_data; + + /* Here you should not look at the same data twice if possible. It can happen. */ + if (STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k) != data_on_node[x] && hud_last_check->last_check_to_choose_from[current_gpu] != g->number_data_selection && !starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k), memory_nodes[current_gpu])) + { + _STARPU_SCHED_PRINT("Data %p is being looked at\n", STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)); +#ifdef STARPU_DARTS_STATS + nb_data_looked_at++; +#endif + + /* Update the iteration for the data so as not to look at it twice at that iteration. 
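+ * last_check_to_choose_from[current_gpu] is set to the current g->number_data_selection, so
+ * the same handle reached again through another data in memory is skipped during this round.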
*/ + hud_last_check->last_check_to_choose_from[current_gpu] = g->number_data_selection; + STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->user_data = hud_last_check; + + temp_number_free_task_max = 0; + temp_number_1_from_free_task_max = 0; + temp_priority_max = INT_MIN; + temp_best_1_from_free_task = NULL; + temp_length_free_tasks_max = 0; + +#ifdef STARPU_DARTS_STATS + nb_data_looked_at++; +#endif + + struct _starpu_darts_task_using_data *t; + for (t = _starpu_darts_task_using_data_list_begin(STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->sched_data); t != _starpu_darts_task_using_data_list_end(STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->sched_data); t = _starpu_darts_task_using_data_list_next(t)) + { + _STARPU_SCHED_PRINT("Task %p is using this data\n", t); + + /* I put it at 1 if at least one data is missing. */ + data_not_available = 0; + + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(t->pointer_to_T); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(t->pointer_to_T, j); + if (STARPU_TASK_GET_HANDLE(t->pointer_to_T, j) != STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)) + { + if (simulate_memory == 0) + { + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu])) + { + data_not_available++; + } + } + else if (simulate_memory == 1) + { + hud = STARPU_TASK_GET_HANDLE(t->pointer_to_T, j)->user_data; + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu]) && hud->nb_task_in_pulled_task[current_gpu] == 0 && hud->nb_task_in_planned_task[current_gpu] == 0) + { + data_not_available++; + } + } + } + } + + if (data_not_available == 0) + { + temp_number_free_task_max++; + + /* Version where I stop as soon as I get a free task. */ + if (threshold == 2) + { + hud = STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->user_data; + update_best_data_single_decision_tree(&number_free_task_max, &remaining_expected_length_max, &handle_popped, &priority_max, &number_1_from_free_task_max, temp_number_free_task_max, hud->sum_remaining_task_expected_length, STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k), temp_priority_max, temp_number_1_from_free_task_max, &data_chosen_index, x, &best_1_from_free_task, temp_best_1_from_free_task, temp_transfer_time_min, &transfer_time_min, temp_length_free_tasks_max, &ratio_transfertime_freetask_min); + + goto end_choose_best_data; + } + + /* For the first one I want to forget priority of one from free tasks. 
*/ + if (temp_number_free_task_max == 1) + { + temp_priority_max = t->pointer_to_T->priority; + } + else if (t->pointer_to_T->priority > temp_priority_max) + { + temp_priority_max = t->pointer_to_T->priority; + } + temp_length_free_tasks_max += starpu_task_expected_length(t->pointer_to_T, perf_arch, 0); + } + else if (data_not_available == 1) + { + temp_number_1_from_free_task_max++; + if (t->pointer_to_T->priority > temp_priority_max) + { + temp_priority_max = t->pointer_to_T->priority; + temp_best_1_from_free_task = t->pointer_to_T; + } + } + } + + temp_transfer_time_min = starpu_data_expected_transfer_time(STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k), current_gpu ,STARPU_R); + + /* Update best data if needed */ + hud = STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k)->user_data; + update_best_data_single_decision_tree(&number_free_task_max, &remaining_expected_length_max, &handle_popped, &priority_max, &number_1_from_free_task_max, temp_number_free_task_max, hud->sum_remaining_task_expected_length, STARPU_TASK_GET_HANDLE(t2->pointer_to_T, k), temp_priority_max, temp_number_1_from_free_task_max, &data_chosen_index, x, &best_1_from_free_task, temp_best_1_from_free_task, temp_transfer_time_min, &transfer_time_min, temp_length_free_tasks_max, &ratio_transfertime_freetask_min); + } + } + } + } + } + + _STARPU_SCHED_PRINT("Best data is = %p: %d free tasks and %d 1 from free tasks. Transfer time %f\n", handle_popped, number_free_task_max, number_1_from_free_task_max, transfer_time_min); + + end_choose_best_data : ; + +#ifdef STARPU_DARTS_STATS + if (data_choice_per_index == true) + { + nb_data_selection_per_index++; + } + nb_data_selection++; +#endif + + /* Look at data conflict. If there is one I need to re-start the schedule for one of the GPU. */ + data_conflict[current_gpu] = false; + Dopt[current_gpu] = handle_popped; + int i; + for (i = 0; i < _nb_gpus; i++) + { + if (i != current_gpu) + { + if (Dopt[i] == handle_popped && handle_popped != NULL) + { + _STARPU_SCHED_PRINT("Iteration %d. Same data between GPU %d and GPU %d: %p.\n", iteration_DARTS, current_gpu, i + 1, handle_popped); +#ifdef STARPU_DARTS_STATS + number_data_conflict++; +#endif + + data_conflict[current_gpu] = true; + } + } + } + +#ifdef STARPU_DARTS_STATS + if (iteration_DARTS == 1) + { + FILE *f = NULL; + int size = strlen(_output_directory) + strlen("/Data_DARTS_data_chosen_stats_GPU_.csv") + 3; + char path[size]; + snprintf(path, size, "%s%s%d%s", _output_directory, "/Data_DARTS_data_chosen_stats_GPU_", current_gpu, ".csv"); + f = fopen(path, "a"); + if (number_free_task_max != 0) + { + nb_task_added_in_planned_task = number_free_task_max; + } + else + { + nb_task_added_in_planned_task = 1; + } + fprintf(f, "%d,%d,%d,%d\n", g->number_data_selection, data_chosen_index, nb_data_looked_at - data_chosen_index, nb_task_added_in_planned_task); + fclose(f); + } + struct timeval time_end_choose_best_data; + gettimeofday(&time_end_choose_best_data, NULL); + time_total_choose_best_data += (time_end_choose_best_data.tv_sec - time_start_choose_best_data.tv_sec)*1000000LL + time_end_choose_best_data.tv_usec - time_start_choose_best_data.tv_usec; +#endif + + if (number_free_task_max != 0) + { +#ifdef STARPU_DARTS_STATS + struct timeval time_start_fill_planned_task_list; + gettimeofday(&time_start_fill_planned_task_list, NULL); + nb_free_choice++; +#endif + + /* I erase the data from the list of data not used. 
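+ * This is only done when choose_best_data_from == 0, since that is the only mode that browses
+ * the gpu_data list; the handle's is_present_in_data_not_used_yet flag for this GPU is cleared
+ * at the same time.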
*/ + if (choose_best_data_from == 0) + { + struct _starpu_darts_gpu_data_not_used *e; + e = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); + while (e->D != handle_popped) + { + e = _starpu_darts_gpu_data_not_used_list_next(e); + } + + _starpu_darts_gpu_data_not_used_list_erase(g->gpu_data, e); + hud = e->D->user_data; + hud->is_present_in_data_not_used_yet[current_gpu] = 0; + e->D->user_data = hud; + + _starpu_darts_gpu_data_not_used_delete(e); + + _STARPU_SCHED_PRINT("Erased data %p\n", e->D); + print_data_not_used_yet_one_gpu(g->gpu_data, current_gpu); + } + + _STARPU_SCHED_PRINT("The data adding the most free tasks is %p and %d task.\n", handle_popped, number_free_task_max); + + _REFINED_MUTEX_LOCK(); + + struct _starpu_darts_task_using_data *t; + for (t = _starpu_darts_task_using_data_list_begin(handle_popped->sched_data); t != _starpu_darts_task_using_data_list_end(handle_popped->sched_data); t = _starpu_darts_task_using_data_list_next(t)) + { + data_available = true; + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(t->pointer_to_T); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(t->pointer_to_T, j); + if (STARPU_TASK_GET_HANDLE(t->pointer_to_T, j) != handle_popped) + { + if (simulate_memory == 0) + { + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu])) + { + data_available = false; + break; + } + } + else if (simulate_memory == 1) + { + hud = STARPU_TASK_GET_HANDLE(t->pointer_to_T, j)->user_data; + if (!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(t->pointer_to_T, j), memory_nodes[current_gpu]) && hud->nb_task_in_pulled_task[current_gpu] == 0 && hud->nb_task_in_planned_task[current_gpu] == 0) + { + data_available = false; + break; + } + } + } + } + if (data_available == true) + { + _starpu_darts_increment_planned_task_data(t->pointer_to_T, current_gpu); + +#ifdef PRINT + printf("Pushing free %p in planned_task of GPU %d :", t->pointer_to_T, current_gpu); fflush(stdout); + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(t->pointer_to_T); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(t->pointer_to_T, j); + printf(" %p", STARPU_TASK_GET_HANDLE(t->pointer_to_T, j)); + fflush(stdout); + } + printf("\n"); fflush(stdout); +#endif + + _starpu_darts_erase_task_and_data_pointer(t->pointer_to_T, main_task_list); + starpu_task_list_push_back(&g->planned_task, t->pointer_to_T); + } + } + + _REFINED_MUTEX_UNLOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_fill_planned_task_list; + gettimeofday(&time_end_fill_planned_task_list, NULL); + time_total_fill_planned_task_list += (time_end_fill_planned_task_list.tv_sec - time_start_fill_planned_task_list.tv_sec)*1000000LL + time_end_fill_planned_task_list.tv_usec - time_start_fill_planned_task_list.tv_usec; +#endif + } + else if (number_1_from_free_task_max != 0 && app != 0) + { +#ifdef STARPU_DARTS_STATS + struct timeval time_start_fill_planned_task_list; + gettimeofday(&time_start_fill_planned_task_list, NULL); + nb_1_from_free_choice++; +#endif + _STARPU_SCHED_PRINT("The data adding the most (%d) 1_from_free tasks is %p.\n", number_1_from_free_task_max, handle_popped); + + _REFINED_MUTEX_LOCK(); + + if (!starpu_task_list_ismember(main_task_list, best_1_from_free_task)) + { + _REFINED_MUTEX_UNLOCK(); + goto random; + } + + if (best_1_from_free_task == NULL) + { +#ifdef STARPU_DARTS_STATS + nb_1_from_free_task_not_found++; +#endif + _REFINED_MUTEX_UNLOCK(); + goto random; + } + + /* Removing the data from datanotused of the GPU. 
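+ * All the handles of the chosen 1-from-free task are removed from the list, not only the
+ * selected data, since the whole task is about to be put in planned_task.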
*/ + if (choose_best_data_from == 0) + { + print_task_info(best_1_from_free_task); + unsigned x; + for (x = 0; x < STARPU_TASK_GET_NBUFFERS(best_1_from_free_task); x++) + { + STARPU_IGNORE_UTILITIES_HANDLES(best_1_from_free_task, x); + if (!_starpu_darts_gpu_data_not_used_list_empty(g->gpu_data)) /* TODO : utile ? */ + { + struct _starpu_darts_gpu_data_not_used *e1; + for (e1 = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); e1 != _starpu_darts_gpu_data_not_used_list_end(g->gpu_data); e1 = _starpu_darts_gpu_data_not_used_list_next(e1)) + { + if (e1->D == STARPU_TASK_GET_HANDLE(best_1_from_free_task, x)) + { + _starpu_darts_gpu_data_not_used_list_erase(g->gpu_data, e1); + hud = e1->D->user_data; + _STARPU_SCHED_PRINT("%p gets 0 at is_present_in_data_not_used_yet GPU is %d\n", e1->D, current_gpu); + + hud->is_present_in_data_not_used_yet[current_gpu] = 0; + e1->D->user_data = hud; + + _starpu_darts_gpu_data_not_used_delete(e1); + + break; + } + } + } + } + } + + _starpu_darts_increment_planned_task_data(best_1_from_free_task, current_gpu); + _STARPU_SCHED_PRINT("Pushing 1_from_free task %p in planned_task of GPU %d\n", best_1_from_free_task, current_gpu); + + _starpu_darts_erase_task_and_data_pointer(best_1_from_free_task, main_task_list); + starpu_task_list_push_back(&g->planned_task, best_1_from_free_task); + _REFINED_MUTEX_UNLOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_fill_planned_task_list; + gettimeofday(&time_end_fill_planned_task_list, NULL); + time_total_fill_planned_task_list += (time_end_fill_planned_task_list.tv_sec - time_start_fill_planned_task_list.tv_sec)*1000000LL + time_end_fill_planned_task_list.tv_usec - time_start_fill_planned_task_list.tv_usec; +#endif + + } + else + { + goto random; + } + + /* If no task have been added to the list. */ + if (starpu_task_list_empty(&g->planned_task)) + { + /* If there was a conflict (two PU chose the same data to load), then we restart the schedule from one of them. */ + if (data_conflict[current_gpu] == true) + { +#ifdef STARPU_DARTS_STATS + number_critical_data_conflict++; + number_data_conflict--; +#endif + _STARPU_SCHED_PRINT("Critical data conflict.\n"); + _starpu_darts_scheduling_3D_matrix(main_task_list, current_gpu, g); + } + + random: ; /* We pop a task from the main task list. Either the head (from a randomized list or not depending on STARPU_DARTS_TASK_ORDER) or the highest priority task. */ + + Dopt[current_gpu] = NULL; + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_pick_random_task; + gettimeofday(&time_start_pick_random_task, NULL); + number_random_selection++; +#endif + + struct starpu_task *task = NULL; + + _REFINED_MUTEX_LOCK(); + + if (!starpu_task_list_empty(main_task_list)) + { + if (highest_priority_task_returned_in_default_case == 1) /* Highest priority task is returned. */ + { + task = get_highest_priority_task(main_task_list); + } + else /* Head of the task list is returned. 
*/ + { + task = starpu_task_list_pop_front(main_task_list); + } + + _STARPU_SCHED_PRINT("\"Random\" task for GPU %d is %p.\n", current_gpu, task); + } + else + { + _STARPU_SCHED_PRINT("Return void in scheduling for GPU %d.\n", current_gpu); + _REFINED_MUTEX_UNLOCK(); + return; + } + + if (choose_best_data_from == 0) + { + unsigned x; + for (x= 0; x < STARPU_TASK_GET_NBUFFERS(task); x++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, x); + if (!_starpu_darts_gpu_data_not_used_list_empty(g->gpu_data)) + { + struct _starpu_darts_gpu_data_not_used *e; + for (e = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); e != _starpu_darts_gpu_data_not_used_list_end(g->gpu_data); e = _starpu_darts_gpu_data_not_used_list_next(e)) + { + if (e->D == STARPU_TASK_GET_HANDLE(task, x)) + { + _starpu_darts_gpu_data_not_used_list_erase(g->gpu_data, e); + hud = e->D->user_data; + _STARPU_SCHED_PRINT("%p gets 0 at is_present_in_data_not_used_yet GPU is %d\n", e->D, current_gpu); + hud->is_present_in_data_not_used_yet[current_gpu] = 0; + e->D->user_data = hud; + + _starpu_darts_gpu_data_not_used_delete(e); + break; + } + } + } + } + } + + /* Add it from planned task compteur */ + _starpu_darts_increment_planned_task_data(task, current_gpu); + _STARPU_SCHED_PRINT("For GPU %d, returning head of the randomized main task list in planned_task: %p.\n", current_gpu, task); + _starpu_darts_erase_task_and_data_pointer(task, main_task_list); + starpu_task_list_push_back(&g->planned_task, task); + + _REFINED_MUTEX_UNLOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_pick_random_task; + gettimeofday(&time_end_pick_random_task, NULL); + time_total_pick_random_task += (time_end_pick_random_task.tv_sec - time_start_pick_random_task.tv_sec)*1000000LL + time_end_pick_random_task.tv_usec - time_start_pick_random_task.tv_usec; +#endif + return; + } + + end_scheduling: ; + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_schedule; + gettimeofday(&time_end_schedule, NULL); + time_total_schedule += (time_end_schedule.tv_sec - time_start_schedule.tv_sec)*1000000LL + time_end_schedule.tv_usec - time_start_schedule.tv_usec; +#endif + + /* TODO: Do we need this at the top and at the end of this function? */ + Dopt[current_gpu] = NULL; +} + +static void _starpu_darts_add_task_to_pulled_task(int current_gpu, struct starpu_task *task) +{ + /* We increment for each data using the task, the number of task in pulled task associated with this data. */ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->nb_task_in_pulled_task[current_gpu] += 1; + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + + struct _starpu_darts_pulled_task *p = _starpu_darts_pulled_task_new(); + p->pointer_to_pulled_task = task; + _starpu_darts_pulled_task_list_push_back(tab_gpu_pulled_task[current_gpu].ptl, p); +} + +/* + * Get a task to return to pull_task. + * In multi GPU it allows me to return a task from the right element in the + * linked list without having another GPU comme and ask a task in pull_task. + */ +static struct starpu_task *get_task_to_return_pull_task_darts(int current_gpu, struct starpu_task_list *l) +{ + /* If there are still tasks either in the packages, the main task list or the refused task, + * I enter here to return a task or start darts_scheduling. Else I return NULL. + */ + /* If one or more task have been refused. 
Need to update planned task but not pulled task as it was already done before. */ + if (!starpu_task_list_empty(&tab_gpu_planned_task[current_gpu].refused_fifo_list)) + { + struct starpu_task *task = starpu_task_list_pop_front(&tab_gpu_planned_task[current_gpu].refused_fifo_list); + _STARPU_SCHED_PRINT("Return refused task %p.\n", task); + return task; + } + + _REFINED_MUTEX_LOCK(); + /* If the package is not empty I can return the head of the task list. */ + if (!starpu_task_list_empty(&tab_gpu_planned_task[current_gpu].planned_task)) + { + struct starpu_task *task = starpu_task_list_pop_front(&tab_gpu_planned_task[current_gpu].planned_task); + /* Remove it from planned task compteur. Could be done in an external function as I use it two times */ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->nb_task_in_planned_task[current_gpu] = hud->nb_task_in_planned_task[current_gpu] - 1; + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + _starpu_darts_add_task_to_pulled_task(current_gpu, task); + + /* For visualisation in python. */ + _sched_visu_print_data_to_load_prefetch(task, current_gpu, 1); + _STARPU_SCHED_PRINT("Task: %p is getting out of pull_task from planned task not empty on GPU %d.\n", task, current_gpu); + _REFINED_MUTEX_UNLOCK(); + return task; + } + + /* Else if there are still tasks in the main task list I call dynamic outer algorithm. */ + if (!starpu_task_list_empty(l)) + { + _REFINED_MUTEX_UNLOCK(); + _starpu_darts_scheduling_3D_matrix(l, current_gpu, &tab_gpu_planned_task[current_gpu]); + _REFINED_MUTEX_LOCK(); + struct starpu_task *task; + if (!starpu_task_list_empty(&tab_gpu_planned_task[current_gpu].planned_task)) + { + task = starpu_task_list_pop_front(&tab_gpu_planned_task[current_gpu].planned_task); + _starpu_darts_add_task_to_pulled_task(current_gpu, task); + + /* Remove it from planned task compteur */ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->nb_task_in_planned_task[current_gpu] = hud->nb_task_in_planned_task[current_gpu] - 1; + + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + } + else + { + _REFINED_MUTEX_UNLOCK(); + _STARPU_SCHED_PRINT("Return NULL after scheduling call.\n"); +#ifdef STARPU_DARTS_STATS + nb_return_null_after_scheduling++; +#endif + return NULL; + } + + /* For visualisation in python. */ + _sched_visu_print_data_to_load_prefetch(task, current_gpu, 1); + _STARPU_SCHED_PRINT("Return task %p from the scheduling call GPU %d.\n", task, current_gpu); +#ifdef STARPU_DARTS_STATS + nb_return_task_after_scheduling++; +#endif + _REFINED_MUTEX_UNLOCK(); + return task; + } + else + { + _STARPU_SCHED_PRINT("Return NULL because main task list is empty.\n"); +#ifdef STARPU_DARTS_STATS + nb_return_null_because_main_task_list_empty++; +#endif + _REFINED_MUTEX_UNLOCK(); + return NULL; + } +} + +static bool graph_read = false; /* TODO: a suppr si j'utilise pas graph_descendants == 1 */ + +/* Pull tasks. When it receives new task it either append tasks to the task list or randomize + * the task list woth the new task, depending on the parameters of STARPU_DARTS_TASK_ORDER. + * Similarly, data can be appended or randomized with STARPU_DARTS_DATA_ORDER. + * By default, task and data are just appended. 
+ * This function return task from the head of planned task. + * If it is empty it calls darts scheduling. */ +static struct starpu_task *darts_pull_task(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + (void)to; + _STARPU_SCHED_PRINT("Début de pull_task.\n"); + + _LINEAR_MUTEX_LOCK(); + + struct _starpu_darts_sched_data *data = component->data; + + /* If GRAPHE!=0, we compute descendants of new tasks. */ + if (graph_descendants == 1 && graph_read == false && new_tasks_initialized == true) + { + graph_read = true; + _starpu_graph_compute_descendants(); + _starpu_graph_foreach(set_priority, data); + } + + _REFINED_MUTEX_LOCK(); + + if (new_tasks_initialized == true) + { + if (graph_descendants == 2) + { + _starpu_graph_compute_descendants(); + _starpu_graph_foreach(set_priority, data); + } + +#ifdef STARPU_DARTS_STATS + nb_new_task_initialized++; +#endif + + _STARPU_SCHED_PRINT("New tasks in pull_task.\n"); + new_tasks_initialized = false; + + _STARPU_SCHED_PRINT("\n-----\nPrinting GPU's data list and NEW task list before randomization:\n"); + print_data_not_used_yet(); + print_task_list(&data->sched_list, "Main task list"); + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_randomize; + gettimeofday(&time_start_randomize, NULL); +#endif + + /* Randomizing or not the task in the set of task to compute. */ + if (task_order == 0) /* Randomize every task. */ + { + if (!starpu_task_list_empty(&data->main_task_list)) + { + randomize_full_task_list(data); + } + else + { + randomize_new_task_list(data); + } + } + else if (task_order == 1) /* Randomize new tasks. */ + { + randomize_new_task_list(data); + } + else if (dependances == 0) /* Do not randomize. */ + { + natural_order_task_list(data); + } + + /* Randomize or not data order in datanorusedyet. */ + if (choose_best_data_from != 1) /* If we use this parametere with CHOOSE_FROM_MEM=1, we do not need to randomize data as we directly choose them from memory. */ + { + if (data_order == 0) /* Randomize all data. */ + { + randomize_full_data_not_used_yet(); + } + else if (data_order == 1) /* Randomize only new data. */ + { + randomize_new_data_not_used_yet(); + } + else if (dependances == 0) /* Do not randomize data. 
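The randomize_* helpers called above are defined elsewhere in this file and are not shown in this hunk; conceptually, each of them boils down to an in-place shuffle of the new tasks or data. The sketch below illustrates that step with a Fisher-Yates shuffle over a plain array of handles; the function name is hypothetical and the real helpers operate on DARTS-internal linked lists rather than an array.

#include <stdlib.h>
#include <starpu.h>

// Sketch: in-place Fisher-Yates shuffle of an array of data handles. The real
// helpers shuffle DARTS-internal linked lists; rand() is also what
// push_data_not_used_yet_random_spot() uses to pick an insertion spot.
static void sketch_shuffle_handles(starpu_data_handle_t *tab, int n)
{
	int i;
	for (i = n - 1; i > 0; i--)
	{
		int j = rand() % (i + 1);
		starpu_data_handle_t tmp = tab[i];
		tab[i] = tab[j];
		tab[j] = tmp;
	}
}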
*/ + { + natural_order_data_not_used_yet(); + } + } + + NT_DARTS = 0; + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_randomize; + gettimeofday(&time_end_randomize, NULL); + time_total_randomize += (time_end_randomize.tv_sec - time_start_randomize.tv_sec)*1000000LL + time_end_randomize.tv_usec - time_start_randomize.tv_usec; +#endif + + _STARPU_SCHED_PRINT("Il y a %d tâches.\n", NT_DARTS); + _STARPU_SCHED_PRINT("Printing GPU's data list and main task list after randomization (STARPU_DARTS_TASK_ORDER = %d, STARPU_DARTS_DATA_ORDER = %d):\n", task_order, data_order); + print_task_list(&data->main_task_list, "Main task list"); + _STARPU_SCHED_PRINT("-----\n\n"); + } + + _REFINED_MUTEX_UNLOCK(); + + int current_gpu; /* Index in tabs of structs */ + if (cpu_only == 1) + { + current_gpu = 0; + } + else if (cpu_only == 2) + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + } + else + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + } + + struct starpu_task *task = get_task_to_return_pull_task_darts(current_gpu, &data->main_task_list); + + _LINEAR_MUTEX_UNLOCK(); + + _STARPU_SCHED_PRINT("Pulled %stask %p on GPU %d.\n", task?"":"NO ", task, current_gpu); + return task; +} + +static void push_data_not_used_yet_random_spot(starpu_data_handle_t h, struct _starpu_darts_gpu_planned_task *g, int gpu_id) +{ + struct _starpu_darts_gpu_data_not_used *new_element = _starpu_darts_gpu_data_not_used_new(); + new_element->D = h; + + _STARPU_SCHED_PRINT("%p gets 1 at is_present_in_data_not_used_yet with random push\n", h); + struct _starpu_darts_handle_user_data *hud = h->user_data; + hud->is_present_in_data_not_used_yet[gpu_id] = 1; + h->user_data = hud; + + if (_starpu_darts_gpu_data_not_used_list_empty(g->gpu_data)) + { + _starpu_darts_gpu_data_not_used_list_push_back(g->gpu_data, new_element); + return; + } + int random = rand()%_starpu_darts_gpu_data_not_used_list_size(g->gpu_data); + struct _starpu_darts_gpu_data_not_used *ptr; + ptr = _starpu_darts_gpu_data_not_used_list_begin(g->gpu_data); + int i = 0; + for (i = 0; i < random; i++) + { + ptr = _starpu_darts_gpu_data_not_used_list_next(ptr); + } + _starpu_darts_gpu_data_not_used_list_insert_before(g->gpu_data, new_element, ptr); +} + +/* If an eviction was refused, we will try to evict it if possible at the next eviction call. To do this we retrieve the data here. */ +static void darts_victim_eviction_failed(starpu_data_handle_t victim, void *component) +{ + (void)component; + _REFINED_MUTEX_LOCK(); + _LINEAR_MUTEX_LOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_evicted; + gettimeofday(&time_start_evicted, NULL); + victim_evicted_compteur++; +#endif + + int current_gpu; /* Index in tabs of structs */ + if (cpu_only == 1) + { + current_gpu = 0; + } + else if (cpu_only == 2) + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + } + else + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + } + + tab_gpu_planned_task[current_gpu].data_to_evict_next = victim; + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_evicted; + gettimeofday(&time_end_evicted, NULL); + time_total_evicted += (time_end_evicted.tv_sec - time_start_evicted.tv_sec)*1000000LL + time_end_evicted.tv_usec - time_start_evicted.tv_usec; +#endif + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); +} + +/* Applying Belady on tasks from pulled task. 
*/ +static starpu_data_handle_t _starpu_darts_belady_on_pulled_task(starpu_data_handle_t *data_tab, int nb_data_on_node, unsigned node, enum starpu_is_prefetch is_prefetch, struct _starpu_darts_gpu_pulled_task *g) +{ +#ifdef STARPU_DARTS_STATS + struct timeval time_start_belady; + gettimeofday(&time_start_belady, NULL); +#endif + int index_next_use = 0; + int max_next_use = -1; + starpu_data_handle_t returned_handle = NULL; + + int i; + for (i = 0; i < nb_data_on_node; i++) + { + STARPU_IGNORE_UTILITIES_HANDLES_FROM_DATA(data_tab[i]); + if (starpu_data_can_evict(data_tab[i], node, is_prefetch)) /* TODO: This could be replaced by just looking in a tab of int as we already call this function on all data in victim_selector. */ + { + index_next_use = 0; + struct _starpu_darts_pulled_task *p; + for (p = _starpu_darts_pulled_task_list_begin(g->ptl); p != _starpu_darts_pulled_task_list_end(g->ptl); p = _starpu_darts_pulled_task_list_next(p)) + { + unsigned j; + for (j = 0; j < STARPU_TASK_GET_NBUFFERS(p->pointer_to_pulled_task); j++) + { + STARPU_IGNORE_UTILITIES_HANDLES(p->pointer_to_pulled_task, j); + index_next_use++; + if (STARPU_TASK_GET_HANDLE(p->pointer_to_pulled_task, j) == data_tab[i]) + { + if (max_next_use < index_next_use) + { + max_next_use = index_next_use; + returned_handle = data_tab[i]; + } + goto break_nested_for_loop; + } + } + } + break_nested_for_loop : ; + } + } + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_belady; + gettimeofday(&time_end_belady, NULL); + time_total_belady += (time_end_belady.tv_sec - time_start_belady.tv_sec)*1000000LL + time_end_belady.tv_usec - time_start_belady.tv_usec; +#endif + + return returned_handle; +} + +static starpu_data_handle_t _starpu_darts_least_used_data_on_planned_task(starpu_data_handle_t *data_tab, int nb_data_on_node, int *nb_task_in_pulled_task, int current_gpu) +{ +#ifdef STARPU_DARTS_STATS + struct timeval time_start_least_used_data_planned_task; + gettimeofday(&time_start_least_used_data_planned_task, NULL); +#endif + + int min_nb_task_in_planned_task = INT_MAX; + starpu_data_handle_t returned_handle = NULL; + + struct _starpu_darts_handle_user_data *hud = malloc(sizeof(struct _starpu_darts_handle_user_data)); + int i; + for (i = 0; i < nb_data_on_node; i++) + { + STARPU_IGNORE_UTILITIES_HANDLES_FROM_DATA(data_tab[i]); + if (nb_task_in_pulled_task[i] == 0) + { + hud = data_tab[i]->user_data; + + if (hud->nb_task_in_planned_task[current_gpu] < min_nb_task_in_planned_task) + { + min_nb_task_in_planned_task = hud->nb_task_in_planned_task[current_gpu]; + returned_handle = data_tab[i]; + } + } + } + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_least_used_data_planned_task; + gettimeofday(&time_end_least_used_data_planned_task, NULL); + time_total_least_used_data_planned_task += (time_end_least_used_data_planned_task.tv_sec - time_start_least_used_data_planned_task.tv_sec)*1000000LL + time_end_least_used_data_planned_task.tv_usec - time_start_least_used_data_planned_task.tv_usec; +#endif + + return returned_handle; +} + +/* Return a data to evict following Least Used in the Future eviction policy. + * This function is called each time a PU's memory is full and it needs to load a data, either as a fetch or prefetch. 
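The "furthest next use" criterion applied by _starpu_darts_belady_on_pulled_task() can be stated independently of the StarPU data structures: among the evictable candidates, pick the one whose next occurrence in the already committed (pulled) task stream is the furthest away. The sketch below is a stand-alone restatement of that rule over plain arrays; the function name and the encoding of "never reused" are assumptions.

// Sketch of the Belady rule used on the pulled-task stream: next_use[i] is the
// position at which candidate i is needed again by an already committed task,
// and a negative value means "never needed again".
static int sketch_belady_pick(const int *next_use, int n)
{
	int best = -1, best_next = -1;
	int i;
	for (i = 0; i < n; i++)
	{
		if (next_use[i] < 0)
			return i; // never reused: evict it right away
		if (next_use[i] > best_next)
		{
			best_next = next_use[i];
			best = i; // reused furthest in the future so far
		}
	}
	return best; // index of the victim, or -1 when n == 0
}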
*/ +static starpu_data_handle_t darts_victim_selector(starpu_data_handle_t toload, unsigned node, enum starpu_is_prefetch is_prefetch, void *component) +{ + (void)toload; + _REFINED_MUTEX_LOCK(); + _LINEAR_MUTEX_LOCK(); + +#ifdef STARPU_DARTS_STATS + struct timeval time_start_selector; + victim_selector_compteur++; + gettimeofday(&time_start_selector, NULL); +#endif + + int current_gpu; /* Index in tabs of structs */ + if (cpu_only == 1) + { + current_gpu = 0; + } + else if (cpu_only == 2) + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + } + else + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + } + + /* If an eviction was refused we try to evict it again. */ + if (tab_gpu_planned_task[current_gpu].data_to_evict_next != NULL) + { + starpu_data_handle_t temp_handle = tab_gpu_planned_task[current_gpu].data_to_evict_next; + tab_gpu_planned_task[current_gpu].data_to_evict_next = NULL; + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_selector; + gettimeofday(&time_end_selector, NULL); + time_total_selector += (time_end_selector.tv_sec - time_start_selector.tv_sec)*1000000LL + time_end_selector.tv_usec - time_start_selector.tv_usec; +#endif + + if (!starpu_data_is_on_node(temp_handle, node)) + { + _STARPU_SCHED_PRINT("Refused %p is not on node %d. Restart eviction\n", temp_handle, node); +#ifdef STARPU_DARTS_STATS + victim_selector_refused_not_on_node++; +#endif + + goto debuteviction; + } + if (!starpu_data_can_evict(temp_handle, node, is_prefetch)) + { + _STARPU_SCHED_PRINT("Refused data can't be evicted. Restart eviction selection.\n"); +#ifdef STARPU_DARTS_STATS + victim_selector_refused_cant_evict++; +#endif + goto debuteviction; + } + + _STARPU_SCHED_PRINT("Evict refused data %p for GPU %d.\n", temp_handle, current_gpu); +#ifdef STARPU_DARTS_STATS + victim_selector_return_refused++; +#endif + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + return temp_handle; + } + + debuteviction: ; + + /* Getting the set of data on node. */ + starpu_data_handle_t *data_on_node; + unsigned nb_data_on_node = 0; + int *valid; + starpu_data_handle_t returned_handle = STARPU_DATA_NO_VICTIM; + starpu_data_get_node_data(node, &data_on_node, &valid, &nb_data_on_node); + + int min_number_task_in_pulled_task = INT_MAX; + int nb_task_in_pulled_task[nb_data_on_node]; + + unsigned i; + for (i = 0; i < nb_data_on_node; i++) + { + nb_task_in_pulled_task[i] = 0; + } + + /* Compute the number of task in pulled_task associated with each data. 
*/ + for (i = 0; i < nb_data_on_node; i++) + { + STARPU_IGNORE_UTILITIES_HANDLES_FROM_DATA(data_on_node[i]); + if (starpu_data_can_evict(data_on_node[i], node, is_prefetch)) + { + struct _starpu_darts_handle_user_data *hud = data_on_node[i]->user_data; + nb_task_in_pulled_task[i] = hud->nb_task_in_pulled_task[current_gpu]; + _STARPU_SCHED_PRINT("%d task in pulled_task for %p.\n", hud->nb_task_in_pulled_task[current_gpu], data_on_node[i]); + + if (hud->nb_task_in_pulled_task[current_gpu] == 0 && hud->nb_task_in_planned_task[current_gpu] == 0) + { +#ifdef STARPU_DARTS_STATS + victim_selector_return_data_not_in_planned_and_pulled++; +#endif + _STARPU_SCHED_PRINT("%d task in planned task as well for %p.\n", hud->nb_task_in_pulled_task[current_gpu], data_on_node[i]); + returned_handle = data_on_node[i]; + goto deletion_in_victim_selector; + } + + if (hud->nb_task_in_pulled_task[current_gpu] < min_number_task_in_pulled_task) + { + min_number_task_in_pulled_task = hud->nb_task_in_pulled_task[current_gpu]; + } + } + else + { + /* - 1 means that the associated data in the tab of data cannot be evicted. */ + nb_task_in_pulled_task[i] = -1; + } + } + + if (min_number_task_in_pulled_task == INT_MAX) + { +#ifdef STARPU_DARTS_STATS + struct timeval time_end_selector; + gettimeofday(&time_end_selector, NULL); + time_total_selector += (time_end_selector.tv_sec - time_start_selector.tv_sec)*1000000LL + time_end_selector.tv_usec - time_start_selector.tv_usec; + victim_selector_return_no_victim++; +#endif + _STARPU_SCHED_PRINT("Evict NO_VICTIM because min_number_task_in_pulled_task == INT_MAX.\n"); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + return STARPU_DATA_NO_VICTIM; + } + else if (min_number_task_in_pulled_task == 0) + { + /* At least 1 data is not used in pulled_task */ + returned_handle = _starpu_darts_least_used_data_on_planned_task(data_on_node, nb_data_on_node, nb_task_in_pulled_task, current_gpu); + } + else /* At least 1 data is necessary in pulled_task */ + { + /* If a prefetch is requesting the eviction, we return NO_VICTIM because we don't want to favor prefetch over task that are in pulled_task. */ + if (is_prefetch >= 1) + { +#ifdef STARPU_DARTS_STATS + struct timeval time_end_selector; + gettimeofday(&time_end_selector, NULL); + time_total_selector += (time_end_selector.tv_sec - time_start_selector.tv_sec)*1000000LL + time_end_selector.tv_usec - time_start_selector.tv_usec; + victim_selector_return_no_victim++; +#endif + _STARPU_SCHED_PRINT("Evict NO_VICTIM because is_prefetch >= 1.\n"); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + return STARPU_DATA_NO_VICTIM; + } + +#ifdef STARPU_DARTS_STATS + victim_selector_belady++; +#endif + + returned_handle = _starpu_darts_belady_on_pulled_task(data_on_node, nb_data_on_node, node, is_prefetch, &tab_gpu_pulled_task[current_gpu]); + } + + /* TODO: DOes it really happens sometimes ? To check. 
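Taken together, the branches of darts_victim_selector() form a small decision ladder over two per-data counters. The sketch below summarizes that ladder with plain arrays and an integer return code; it illustrates the logic rather than the actual StarPU calls, and the function name and return-code convention are assumptions.

// Sketch of the victim-selection ladder, given for each candidate data the
// number of tasks that still need it in pulled_task and in planned_task.
// Returns the index of the victim, -1 for STARPU_DATA_NO_VICTIM, or -2 when
// the caller should fall back to the Belady rule on the pulled-task stream.
static int sketch_pick_victim(const int *in_pulled, const int *in_planned,
			      int n, int is_prefetch)
{
	int i, best = -1, best_planned = 0;
	// 1. A data needed by no pulled and no planned task is evicted first.
	for (i = 0; i < n; i++)
		if (in_pulled[i] == 0 && in_planned[i] == 0)
			return i;
	// 2. Otherwise prefer a data unused by pulled tasks, breaking ties by the
	//    smallest number of planned tasks that still need it.
	for (i = 0; i < n; i++)
		if (in_pulled[i] == 0 && (best < 0 || in_planned[i] < best_planned))
		{
			best = i;
			best_planned = in_planned[i];
		}
	if (best >= 0)
		return best;
	// 3. Every candidate is still needed by a pulled task: never evict for a
	//    prefetch; otherwise let the caller apply the Belady rule.
	return is_prefetch ? -1 : -2;
}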
*/ + if (returned_handle == NULL) + { +#ifdef STARPU_DARTS_STATS + struct timeval time_end_selector; + gettimeofday(&time_end_selector, NULL); + time_total_selector += (time_end_selector.tv_sec - time_start_selector.tv_sec)*1000000LL + time_end_selector.tv_usec - time_start_selector.tv_usec; + victim_selector_return_no_victim++; +#endif + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + return STARPU_DATA_NO_VICTIM; + } + + deletion_in_victim_selector : ; + + struct starpu_sched_component *temp_component = component; + struct _starpu_darts_sched_data *data = temp_component->data; + + struct starpu_task *task; + for (task = starpu_task_list_begin(&tab_gpu_planned_task[current_gpu].planned_task); task != starpu_task_list_end(&tab_gpu_planned_task[current_gpu].planned_task); task = starpu_task_list_next(task)) + { + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + if (STARPU_TASK_GET_HANDLE(task, i) == returned_handle) + { + /* Removing task using the handle selected for eviction from the planned_task list. */ + struct _starpu_darts_pointer_in_task *pt = task->sched_data; + starpu_task_list_erase(&tab_gpu_planned_task[current_gpu].planned_task, pt->pointer_to_cell); + + pt->pointer_to_cell = task; + pt->pointer_to_D = malloc(get_nbuffer_without_scratch(task)*sizeof(STARPU_TASK_GET_HANDLE(task, 0))); + pt->tud = malloc(get_nbuffer_without_scratch(task)*sizeof(_starpu_darts_task_using_data_new())); + + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + /* Pointer toward the main task list in the handles. */ + struct _starpu_darts_task_using_data *e = _starpu_darts_task_using_data_new(); + e->pointer_to_T = task; + + if (STARPU_TASK_GET_HANDLE(task, i)->sched_data == NULL) + { + struct _starpu_darts_task_using_data_list *tl = _starpu_darts_task_using_data_list_new(); + _starpu_darts_task_using_data_list_push_front(tl, e); + STARPU_TASK_GET_HANDLE(task, i)->sched_data = tl; + } + else + { + _starpu_darts_task_using_data_list_push_front(STARPU_TASK_GET_HANDLE(task, i)->sched_data, e); + } + + /* Adding the pointer in the task. */ + pt->pointer_to_D[i] = STARPU_TASK_GET_HANDLE(task, i); + pt->tud[i] = e; + + /* Increase expected length of task using this data */ + struct _starpu_darts_handle_user_data *hud = pt->pointer_to_D[i]->user_data; + hud->sum_remaining_task_expected_length += starpu_task_expected_length(task, perf_arch, 0); + + _STARPU_SCHED_PRINT("Eviction of data %p.\n", STARPU_TASK_GET_HANDLE(task, i)); + + pt->pointer_to_D[i]->user_data = hud; + + } + task->sched_data = pt; + + starpu_task_list_push_back(&data->main_task_list, task); + break; + } + } + } + + /* Pushing the evicted data in datanotusedyet if it is still usefull to some tasks or if we are in a case with dependencies. */ + if (choose_best_data_from == 0) + { + if (!_starpu_darts_task_using_data_list_empty(returned_handle->sched_data)) + { + if (dependances == 1) /* Checking if other PUs have this handle in datanotusedtyet. 
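The dependencies branch that follows re-advertises the evicted handle to the not-used-yet list of each GPU that does not already track it and, unless STARPU_DARTS_CAN_A_DATA_BE_IN_MEM_AND_IN_NOT_USED_YET is set, that does not already hold it in memory. Below is a stripped-down sketch of that guard, with the per-GPU state reduced to plain arrays and the push kept as a callback standing in for push_data_not_used_yet_random_spot(); all names here are assumptions.

#include <stdbool.h>

// Sketch: re-advertise an evicted data to each GPU's not-used-yet list.
// present[g] is 1 when GPU g already tracks the data in that list, on_node[g]
// is true when the data currently sits on GPU g's memory node, allow_in_mem is
// the STARPU_DARTS_CAN_A_DATA_BE_IN_MEM_AND_IN_NOT_USED_YET knob, and push()
// stands in for push_data_not_used_yet_random_spot().
static void sketch_readvertise(int evicting_gpu, int ngpus, int *present,
			       const bool *on_node, int allow_in_mem,
			       void (*push)(int gpu))
{
	int g;
	for (g = 0; g < ngpus; g++)
	{
		if (present[g])
			continue; // this GPU already tracks the data
		if (g != evicting_gpu && !allow_in_mem && on_node[g])
			continue; // already resident there and duplicates are disallowed
		present[g] = 1;
		push(g); // insert at a random spot of GPU g's list
	}
}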
*/ + { + struct _starpu_darts_handle_user_data *hud = returned_handle->user_data; + if (hud->is_present_in_data_not_used_yet[current_gpu] == 0) + { + push_data_not_used_yet_random_spot(returned_handle, &tab_gpu_planned_task[current_gpu], current_gpu); + } + + int x; + for (x = 0; x < _nb_gpus; x++) + { + if (x != current_gpu) + { + if (hud->is_present_in_data_not_used_yet[x] == 0 && (can_a_data_be_in_mem_and_in_not_used_yet == 1 || !starpu_data_is_on_node(returned_handle, memory_nodes[x]))) + { + push_data_not_used_yet_random_spot(returned_handle, &tab_gpu_planned_task[x], i); + } + } + } + } + else + { + push_data_not_used_yet_random_spot(returned_handle, &tab_gpu_planned_task[current_gpu], current_gpu); + } + } + } + +#ifdef STARPU_DARTS_STATS + struct timeval time_end_selector; + gettimeofday(&time_end_selector, NULL); + time_total_selector += (time_end_selector.tv_sec - time_start_selector.tv_sec)*1000000LL + time_end_selector.tv_usec - time_start_selector.tv_usec; +#endif + _STARPU_SCHED_PRINT("Evict %p on GPU %d.\n", returned_handle, current_gpu); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + + return returned_handle; +} + +static int darts_can_push(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + int didwork = 0; + struct starpu_task *task; + task = starpu_sched_component_pump_to(component, to, &didwork); + if (task) + { + /* If a task is refused it is pushed it in the refused fifo list of the associated processing unit. + * This list is looked at first when a PU is asking for a task. */ + + _REFINED_MUTEX_LOCK(); + _LINEAR_MUTEX_LOCK(); + + _STARPU_SCHED_PRINT("Refused %p in can_push.\n", task); +#ifdef STARPU_DARTS_STATS + nb_refused_task++; +#endif + + int current_gpu; /* Index in tabs of structs */ + if (cpu_only == 1) + { + current_gpu = 0; + } + else if (cpu_only == 2) + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + } + else + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + } + + starpu_task_list_push_back(&tab_gpu_planned_task[current_gpu].refused_fifo_list, task); + + _REFINED_MUTEX_UNLOCK(); + _LINEAR_MUTEX_UNLOCK(); + } + + /* There is room now */ + return didwork || starpu_sched_component_can_push(component, to); +} + +static int darts_can_pull(struct starpu_sched_component *component) +{ + return starpu_sched_component_can_pull(component); +} + +struct starpu_sched_component *starpu_sched_component_darts_create(struct starpu_sched_tree *tree, void *params STARPU_ATTRIBUTE_UNUSED) +{ + /* Global variables extracted from environement variables. 
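The constructor that follows reads every DARTS knob from the environment, so the variables have to be set before StarPU is initialized. A minimal host program selecting the scheduler and a few of its options could look like the sketch below; the chosen values are examples only, not recommendations.

#include <stdlib.h>
#include <starpu.h>

int main(void)
{
	// Select the scheduler and a few DARTS knobs before starpu_init().
	setenv("STARPU_SCHED", "darts", 1);
	setenv("STARPU_DARTS_EVICTION_STRATEGY_DARTS", "1", 1); // DARTS-specific eviction
	setenv("STARPU_DARTS_TASK_ORDER", "2", 1);              // no randomization of tasks
	setenv("STARPU_DARTS_DATA_ORDER", "2", 1);              // no randomization of data

	int ret = starpu_init(NULL);
	if (ret != 0)
		return 1;

	// ... submit tasks as usual ...

	starpu_shutdown();
	return 0;
}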
*/ + eviction_strategy_darts = starpu_get_env_number_default("STARPU_DARTS_EVICTION_STRATEGY_DARTS", 1); + threshold = starpu_get_env_number_default("STARPU_DARTS_THRESHOLD", 0); + app = starpu_get_env_number_default("STARPU_DARTS_APP", 1); + choose_best_data_from = starpu_get_env_number_default("STARPU_DARTS_CHOOSE_BEST_DATA_FROM", 0); + simulate_memory = starpu_get_env_number_default("STARPU_DARTS_SIMULATE_MEMORY", 0); + task_order = starpu_get_env_number_default("STARPU_DARTS_TASK_ORDER", 2); + data_order = starpu_get_env_number_default("STARPU_DARTS_DATA_ORDER", 2); + dependances = starpu_get_env_number_default("STARPU_DARTS_DEPENDANCES", 1); + prio = starpu_get_env_number_default("STARPU_DARTS_PRIO", 1); + free_pushed_task_position = starpu_get_env_number_default("STARPU_DARTS_FREE_PUSHED_TASK_POSITION", 1); + graph_descendants = starpu_get_env_number_default("STARPU_DARTS_GRAPH_DESCENDANTS", 0); + dopt_selection_order = starpu_get_env_number_default("STARPU_DARTS_DOPT_SELECTION_ORDER", 7); + highest_priority_task_returned_in_default_case = starpu_get_env_number_default("STARPU_DARTS_HIGHEST_PRIORITY_TASK_RETURNED_IN_DEFAULT_CASE", 1); + can_a_data_be_in_mem_and_in_not_used_yet = starpu_get_env_number_default("STARPU_DARTS_CAN_A_DATA_BE_IN_MEM_AND_IN_NOT_USED_YET", 0); + push_free_task_on_gpu_with_least_task_in_planned_task = starpu_get_env_number_default("STARPU_DARTS_PUSH_FREE_TASK_ON_GPU_WITH_LEAST_TASK_IN_PLANNED_TASK", 2); + + if (starpu_cpu_worker_get_count() > 0 && starpu_cuda_worker_get_count() == 0 && starpu_hip_worker_get_count() == 0 && starpu_opencl_worker_get_count() == 0 && starpu_mpi_ms_worker_get_count() == 0 && starpu_tcpip_ms_worker_get_count() == 0) + { + cpu_only = 1; // Only CPUs + } + else if (starpu_cpu_worker_get_count() > 0 && (starpu_cuda_worker_get_count() > 0 || starpu_hip_worker_get_count() == 0 || starpu_opencl_worker_get_count() == 0 || starpu_mpi_ms_worker_get_count() == 0 || starpu_tcpip_ms_worker_get_count() == 0)) + { + cpu_only = 2; // Both GPUs and CPUs + } + else + { + cpu_only = 0; // Only GPUs + } + + _STARPU_SCHED_PRINT("-----\nSTARPU_DARTS_EVICTION_STRATEGY_DARTS = %d\nSTARPU_DARTS_THRESHOLD = %d\nSTARPU_DARTS_APP = %d\nSTARPU_DARTS_CHOOSE_BEST_DATA_FROM = %d\nSTARPU_DARTS_SIMULATE_MEMORY = %d\nSTARPU_DARTS_TASK_ORDER = %d\nSTARPU_DARTS_DATA_ORDER = %d\nSTARPU_DARTS_DEPENDANCES = %d\nSTARPU_DARTS_PRIO = %d\nSTARPU_DARTS_FREE_PUSHED_TASK_POSITION = %d\nSTARPU_DARTS_GRAPH_DESCENDANTS = %d\nSTARPU_DARTS_DOPT_SELECTION_ORDER = %d\nSTARPU_DARTS_HIGHEST_PRIORITY_TASK_RETURNED_IN_DEFAULT_CASE = %d\nSTARPU_DARTS_CAN_A_DATA_BE_IN_MEM_AND_IN_NOT_USED_YET = %d\nSTARPU_DARTS_PUSH_FREE_TASK_ON_GPU_WITH_LEAST_TASK_IN_PLANNED_TASK = %d\n-----\n", eviction_strategy_darts, threshold, app, choose_best_data_from, simulate_memory, task_order, data_order, dependances, prio, free_pushed_task_position, graph_descendants, dopt_selection_order, highest_priority_task_returned_in_default_case, can_a_data_be_in_mem_and_in_not_used_yet, push_free_task_on_gpu_with_least_task_in_planned_task); + + _nb_gpus = _get_number_GPU(); + if (cpu_only == 2) + { + _nb_gpus += starpu_memory_nodes_get_count_by_kind(STARPU_CPU_RAM); /* Adding one to account for the NUMA node because we are in a CPU+GPU case. */ + } + NT_DARTS = 0; + new_tasks_initialized = false; + round_robin_free_task = -1; /* Starts at -1 because it is updated at the beginning and not the end of push_task. Thus to start at 0 on the first task you need to init it at -1. 
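The same worker-to-index mapping is recomputed in darts_pull_task(), in the eviction callbacks and in get_task_done(); it depends only on the cpu_only mode selected above. The sketch below restates that mapping as a small helper; the helper name is an assumption, and the real code inlines the computation at each call site.

#include <starpu.h>

// Sketch: index of the calling worker in the per-GPU tables. cpu_only is the
// mode computed in the constructor: 0 = GPUs only (memory nodes 1..NGPU),
// 1 = CPUs only (single entry 0), 2 = CPUs and GPUs (memory nodes used as-is).
static int sketch_current_index(int cpu_only)
{
	if (cpu_only == 1)
		return 0;
	unsigned node = starpu_worker_get_memory_node(starpu_worker_get_id());
	return (cpu_only == 2) ? (int) node : (int) node - 1;
}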
*/ + + /* Initialize memory node of each GPU or CPU */ + memory_nodes = malloc(sizeof(int)*_nb_gpus); + int i; + for (i = 0; i < _nb_gpus; i++) + { + if (cpu_only == 0) /* Using GPUs so memory nodes are 1->Ngpu */ + { + memory_nodes[i] = i + 1; + } + else if (cpu_only == 2) /* Using GPUs and CPUs so memory nodes are 0->Ngpu+Ncpu */ + { + memory_nodes[i] = i; + } + else /* Using CPUs so memory nodes are 0->N_numa_nodes */ + { + memory_nodes[i] = i; + } + } + +#ifdef STARPU_DARTS_STATS + int size = strlen(_output_directory) + strlen("/Data_DARTS_data_chosen_stats_GPU_.csv") + 3; + for (i = 0; i < _nb_gpus; i++) + { + char path[size]; + snprintf(path, size, "%s%s%d%s", _output_directory, "/Data_DARTS_data_chosen_stats_GPU_", i+1, ".csv"); + FILE *f = fopen(path, "w"); + STARPU_ASSERT_MSG(f, "cannot open file <%s>\n", path); + fprintf(f, "Data selection,Data chosen,Number of data read,Number of task added in planned_task\n"); + fclose(f); + } + + gettimeofday(&time_start_createtolasttaskfinished, NULL); + nb_return_null_after_scheduling = 0; + nb_return_task_after_scheduling = 0; + nb_data_selection = 0; + nb_return_null_because_main_task_list_empty = 0; + nb_new_task_initialized = 0; + nb_refused_task = 0; + victim_selector_refused_not_on_node = 0; + victim_selector_refused_cant_evict = 0; + victim_selector_return_refused = 0; + victim_selector_return_unvalid = 0; + victim_selector_return_data_not_in_planned_and_pulled = 0; + number_data_conflict = 0; + number_critical_data_conflict = 0; + victim_evicted_compteur = 0; + victim_selector_compteur = 0; + victim_selector_return_no_victim = 0; + victim_selector_belady = 0; + number_random_selection = 0; + nb_free_choice = 0; + nb_1_from_free_choice = 0; + nb_data_selection_per_index = 0; + nb_task_added_in_planned_task = 0; + nb_1_from_free_task_not_found = 0; + time_total_selector = 0; + time_total_evicted = 0; + time_total_belady = 0; + time_total_schedule = 0; + time_total_choose_best_data = 0; + time_total_fill_planned_task_list = 0; + time_total_initialisation = 0; + time_total_randomize = 0; + time_total_pick_random_task = 0; + time_total_least_used_data_planned_task = 0; + time_total_createtolasttaskfinished = 0; + data_choice_per_index = false; +#endif + + _sched_visu_init(_nb_gpus); + + struct starpu_sched_component *component = starpu_sched_component_create(tree, "darts"); + starpu_srand48(starpu_get_env_number_default("SEED", 0)); + + struct _starpu_darts_sched_data *data; + _STARPU_MALLOC(data, sizeof(*data)); + starpu_task_list_init(&data->sched_list); + starpu_task_list_init(&data->main_task_list); + + tab_gpu_planned_task = malloc(_nb_gpus*sizeof(struct _starpu_darts_gpu_planned_task)); + _starpu_darts_tab_gpu_planned_task_init(); + tab_gpu_pulled_task = malloc(_nb_gpus*sizeof(struct _starpu_darts_gpu_pulled_task)); + _starpu_darts_tab_gpu_pulled_task_init(); + + _REFINED_MUTEX_INIT(); + _LINEAR_MUTEX_INIT(); + + Dopt = calloc(_nb_gpus, sizeof(starpu_data_handle_t)); + data_conflict = malloc(_nb_gpus*sizeof(bool)); + + component->data = data; + component->push_task = darts_push_task; + component->pull_task = darts_pull_task; + component->can_push = darts_can_push; + component->can_pull = darts_can_pull; + + if (eviction_strategy_darts == 1) + { + starpu_data_register_victim_selector(darts_victim_selector, darts_victim_eviction_failed, component); + } + + return component; +} + +static void initialize_darts_center_policy(unsigned sched_ctx_id) +{ + _output_directory = _sched_visu_get_output_directory(); + 
starpu_sched_component_initialize_simple_scheduler((starpu_sched_component_create_t) starpu_sched_component_darts_create, NULL, + STARPU_SCHED_SIMPLE_DECIDE_MEMNODES | + STARPU_SCHED_SIMPLE_DECIDE_ALWAYS | + STARPU_SCHED_SIMPLE_FIFOS_BELOW | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY_FIRST | + STARPU_SCHED_SIMPLE_IMPL, sched_ctx_id); + + + /* To avoid systematic prefetch in sched_policy.c */ + struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id); + sched_ctx->sched_policy->prefetches = 1; + + perf_arch = starpu_worker_get_perf_archtype(0, sched_ctx_id); /* Getting the perfmodel. Used to get the expected length of a task to tiebreak when choosing Dopt. We use 0 here because we assume all processing units to be homogeneous. TODO: use the mean performance of each processing unit or each time the performance model is needed, use the one of the selected processing unit to get the exepected task length or data transfer duration. */ + + if (prio != 0) + { + if (graph_descendants != 0) + { + _starpu_graph_record = 1; + } + + /* To get the priority of each task. */ + starpu_sched_ctx_set_min_priority(sched_ctx_id, INT_MIN); + starpu_sched_ctx_set_max_priority(sched_ctx_id, INT_MAX); + } +} + +static void deinitialize_darts_center_policy(unsigned sched_ctx_id) +{ + struct starpu_sched_tree *tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id); + starpu_sched_tree_destroy(tree); +} + +/* Get the task that was last executed. Used to update the task list of pulled task. */ +static void get_task_done(struct starpu_task *task, unsigned sci) +{ + _LINEAR_MUTEX_LOCK(); + + int current_gpu; + if (cpu_only == 1) + { + current_gpu = 0; + } + else if (cpu_only == 2) + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + } + else + { + current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()) - 1; + } + + if (eviction_strategy_darts == 1) + { + _REFINED_MUTEX_LOCK(); + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + STARPU_IGNORE_UTILITIES_HANDLES(task, i); + struct _starpu_darts_handle_user_data *hud = STARPU_TASK_GET_HANDLE(task, i)->user_data; + hud->nb_task_in_pulled_task[current_gpu] -= 1; + STARPU_TASK_GET_HANDLE(task, i)->user_data = hud; + } + _REFINED_MUTEX_UNLOCK(); + } + + int trouve = 0; + + if (!_starpu_darts_pulled_task_list_empty(tab_gpu_pulled_task[current_gpu].ptl)) + { + struct _starpu_darts_pulled_task *temp; + for (temp = _starpu_darts_pulled_task_list_begin(tab_gpu_pulled_task[current_gpu].ptl); temp != _starpu_darts_pulled_task_list_end(tab_gpu_pulled_task[current_gpu].ptl); temp = _starpu_darts_pulled_task_list_next(temp)) + { + if (temp->pointer_to_pulled_task == task) + { + trouve = 1; + break; + } + } + if (trouve == 1) + { + _starpu_darts_pulled_task_list_erase(tab_gpu_pulled_task[current_gpu].ptl, temp); + + if (cpu_only == 0) + { + _starpu_darts_pulled_task_delete(temp); + } + } + } + + _LINEAR_MUTEX_UNLOCK(); + + starpu_sched_component_worker_pre_exec_hook(task, sci); +} + +struct starpu_sched_policy _starpu_sched_darts_policy = +{ + .init_sched = initialize_darts_center_policy, + .deinit_sched = deinitialize_darts_center_policy, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .push_task = starpu_sched_tree_push_task, + //~ .pop_task = _sched_visu_get_data_to_load, /* Modified from starpu_sched_tree_pop_task */ + .pop_task = starpu_sched_tree_pop_task, /* Modified from 
starpu_sched_tree_pop_task */ + //~ .pre_exec_hook = _sched_visu_get_current_tasks, /* Modified from starpu_sched_component_worker_pre_exec_hook */ + .pre_exec_hook = starpu_sched_component_worker_pre_exec_hook, /* Modified from starpu_sched_component_worker_pre_exec_hook */ + .reset_scheduler = starpu_darts_reinitialize_structures, + .post_exec_hook = get_task_done, + .policy_name = "darts", + .policy_description = "Dynamic scheduler that selects data so as to maximize the number of tasks that can be computed without any additional data load", + .worker_type = STARPU_WORKER_LIST, +}; diff --git a/src/sched_policies/darts.h b/src/sched_policies/darts.h new file mode 100644 index 0000000000..b279ecb167 --- /dev/null +++ b/src/sched_policies/darts.h @@ -0,0 +1,55 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#ifndef __DARTS_H__ +#define __DARTS_H__ + +/** @file */ + +#include + +#pragma GCC visibility push(hidden) + +#define STARPU_DARTS_GRAPH_DESCENDANTS /* To use the graph as a way to add priorities. 0: Not used. 1: With a graph reading descendants in DARTS, use a pause in the task submit. 2: With a graph reading descendants in DARTS, but don't use pause and the graph is read at each new batch of tasks in pull_task. */ +// 0 by default for all the following global variables +#define STARPU_DARTS_EVICTION_STRATEGY_DARTS /* 0: LRU, 1: special eviction for DARTS */ +#define STARPU_DARTS_THRESHOLD /* To stop searching the data list earlier. 0 = no threshold, 1 = threshold at 14400 tasks for a 2D matrix (that is, STARPU_DARTS_APP == 0) and at 1599 tasks for a 3D matrix (that is, STARPU_DARTS_APP == 1), 2 = stop as soon as a data allowing at least one free task has been found, or, failing that, a data allowing at least one task that is one data away from being free. 0 by default. */ +#define STARPU_DARTS_APP /* 0: 2D matrix. 1: 3D matrix. With 1 we also look at tasks that are one data away from being free, but no further. */ +#define STARPU_DARTS_CHOOSE_BEST_DATA_FROM /* Where we look to choose the best data. 0: we look at the list of data not used yet. 1: we look at the data in memory and, from the tasks using these data, we search for a data not yet in memory that allows the most free (or one-from-free) tasks. */ +#define STARPU_DARTS_SIMULATE_MEMORY /* Default 0, means we use starpu_data_is_on_node, 1 we also look at the number of tasks in planned and pulled task. */ +#define STARPU_DARTS_TASK_ORDER /* 0: the task list is fully randomized. 1: only the new tasks are randomized among themselves and appended at the end of the task lists. 2: no randomization; each GPU is put on a different m/NGPU portion so that they start at different places in the task list.
In the case with dependencies there are no different starting points; we simply do not randomize. */ +#define STARPU_DARTS_DATA_ORDER /* 0: the data list is fully randomized. 1: only the new data are randomized among themselves and appended at the end of the data lists. 2: no randomization; each GPU is put on a different Ndata/NGPU portion so that they start at different places in the data list. */ +#define STARPU_DARTS_DEPENDANCES /* 0: no, 1: yes; used to know whether we use different starting points in the main task list (we do not do it when there are dependencies). TODO: not necessarily useful in the future; consider removing it. */ +#define STARPU_DARTS_PRIO /* 0: no, 1: tiebreak the data selection with the data that has the highest priority task */ +#define STARPU_DARTS_FREE_PUSHED_TASK_POSITION /* To detail where a free task from push_task is pushed in planned_task. 0: at the top of planned task, 1: after the last free task of planned task. */ +#define STARPU_DARTS_DOPT_SELECTION_ORDER /* In which order do I tiebreak when choosing the optimal data: +* 0: Nfree N1fromfree Prio Timeremaining +* 1: Nfree Prio N1fromfree Timeremaining +* 2: TransferTime NFree Prio N1FromFree TimeRemaining +* 3: NFree TransferTime Prio N1FromFree TimeRemaining +* 4: NFree Prio TransferTime N1FromFree TimeRemaining +* 5: NFree Prio N1FromFree TransferTime TimeRemaining +* 6: NFree Prio N1FromFree TimeRemaining TransferTime +* 7: Ratio_Transfer/Free_Task_Time NFree Prio N1FromFree TimeRemaining +*/ +#define STARPU_DARTS_HIGHEST_PRIORITY_TASK_RETURNED_IN_DEFAULT_CASE /* 0: no, 1: return the highest priority task of the list when no data giving a free or one-from-free task was found. Only makes sense if STARPU_DARTS_TASK_ORDER is set to 2; otherwise it defeats the purpose of the randomization done with TASK_ORDER set to 0 or 1. */ +#define STARPU_DARTS_CAN_A_DATA_BE_IN_MEM_AND_IN_NOT_USED_YET /* 0: no, 1: yes */ +#define STARPU_DARTS_PUSH_FREE_TASK_ON_GPU_WITH_LEAST_TASK_IN_PLANNED_TASK /* 0: no, 1: yes, 2: round robin */ +#define STARPU_DARTS_CPU_ONLY /* 0: we use only GPUs, 1: we use only CPUs, 2: we use both (not functional) */ + +#pragma GCC visibility pop + +#endif // __DARTS_H__ diff --git a/src/sched_policies/mst_policy.c b/src/sched_policies/mst_policy.c new file mode 100644 index 0000000000..f474becc69 --- /dev/null +++ b/src/sched_policies/mst_policy.c @@ -0,0 +1,441 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2013 Simon Archipoff + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +/* Building a maximum spanning tree from the set of tasks using Prim's algorithm. + * The task processing order is then the order in which tasks are added to the tree. + * hMETIS can be used before executing MST on each package with the environment variable HMETIS=1.
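The ordering computed below is Prim's algorithm run in its maximum-weight variant on a graph whose edge weights count the data shared by two tasks; a vertex is emitted each time it joins the tree. The sketch below is a compact stand-alone version of that ordering step over a plain adjacency matrix; the function name and the array-based interface are assumptions.

#include <stdbool.h>
#include <string.h>

// Sketch: order[0..n-1] receives the vertices in the order Prim's algorithm
// (maximum-weight variant) adds them to the spanning tree, starting from 0.
// adj is the n*n matrix of shared-data counts between tasks.
static void sketch_prim_max_order(int n, const int adj[n][n], int order[n])
{
	bool in_tree[n];
	int key[n];
	memset(in_tree, 0, sizeof(in_tree));
	memset(key, 0, sizeof(key));
	key[0] = 1; // force vertex 0 to be picked first

	int step;
	for (step = 0; step < n; step++)
	{
		int u = -1, v;
		// pick the vertex outside the tree with the strongest connection to it
		for (v = 0; v < n; v++)
			if (!in_tree[v] && (u < 0 || key[v] > key[u]))
				u = v;
		in_tree[u] = true;
		order[step] = u;
		// strengthen the connection of the remaining vertices to the tree
		for (v = 0; v < n; v++)
			if (!in_tree[v] && adj[u][v] > key[v])
				key[v] = adj[u][v];
	}
}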
+ */ + +#include +#include +#include +#include + +static int _nb_gpus; + +static int mst_push_task(struct starpu_sched_component *component, struct starpu_task *task) +{ + struct _starpu_HFP_sched_data *data = component->data; + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_front(&data->sched_list, task); + starpu_push_task_end(task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + component->can_pull(component); + return 0; +} + +static struct starpu_task_list mst(struct starpu_task_list task_list, int number_task) +{ + struct starpu_task_list SIGMA; + starpu_task_list_init(&SIGMA); + int i = 0; + int count = 0; + int j = 0; + int v = 0; + unsigned int i_bis = 0; + unsigned int j_bis = 0; + int tab_runner = 0; + struct starpu_task *temp_task_1 = NULL; + struct starpu_task *temp_task_2 = NULL; + int matrice_adjacence[number_task][number_task]; + for (i = 0; i < number_task; i++) + { + for (j = 0; j < number_task; j++) + { + matrice_adjacence[i][j] = 0; + } + } + temp_task_1 = starpu_task_list_begin(&task_list); + temp_task_2 = starpu_task_list_begin(&task_list); + temp_task_2 = starpu_task_list_next(temp_task_2); + /* Building the adjacency matrix */ + for (i = 0; i < number_task; i++) + { + for (j = i + 1; j < number_task; j++) + { + for (i_bis = 0; i_bis < STARPU_TASK_GET_NBUFFERS(temp_task_1); i_bis++) + { + for (j_bis = 0; j_bis < STARPU_TASK_GET_NBUFFERS(temp_task_2); j_bis++) + { + if (STARPU_TASK_GET_HANDLE(temp_task_1, i_bis) == STARPU_TASK_GET_HANDLE(temp_task_2, j_bis)) + { + matrice_adjacence[i][j]++; + } + } + } + temp_task_2 = starpu_task_list_next(temp_task_2); + } + temp_task_1 = starpu_task_list_next(temp_task_1); + temp_task_2 = temp_task_1; + if (i + 1 != number_task) + { + temp_task_2 = starpu_task_list_next(temp_task_2); + } + } + + /* Printing the adjacency matrix */ + _sched_visu_print_matrix((int **)matrice_adjacence, number_task, number_task, "Matrice d'adjacence :\n"); + + /* Struct of packages to have one task by package and thus being able to number each task. + * We need to number them to recognize them later on. */ + struct _starpu_HFP_my_list *temp_sub_list = malloc(sizeof(*temp_sub_list)); + struct _starpu_HFP_paquets *temp_paquets = malloc(sizeof(*temp_paquets)); + starpu_task_list_init(&temp_sub_list->sub_list); + temp_sub_list->next = NULL; + temp_paquets->temp_pointer_1 = temp_sub_list; + temp_paquets->first_link = temp_paquets->temp_pointer_1; + int do_not_add_more = 0; + while (!starpu_task_list_empty(&task_list)) + { + starpu_task_list_push_back(&temp_paquets->temp_pointer_1->sub_list, starpu_task_list_pop_front(&task_list)); + temp_paquets->temp_pointer_1->index_package = do_not_add_more; + if (do_not_add_more != number_task - 1) + { + _starpu_HFP_insertion(temp_paquets); + } + do_not_add_more++; + } + temp_paquets->first_link = temp_paquets->temp_pointer_1; + + /* Start of the MST algorithm */ + // Key values used to pick minimum weight edge in cut + int key[number_task]; + // To represent set of vertices included in MST + bool mstSet[number_task]; + int tab_SIGMA[number_task]; + // Initialize all keys as 0 + for (i = 0; i < number_task; i++) + { + key[i] = 0, mstSet[i] = false; + } + + // Always include first 1st vertex in MST. + // Make key 0 so that this vertex is picked as first vertex. 
+ key[0] = 1; + for (count = 0; count < number_task - 1; count++) + { + // Pick the minimum key vertex from the + // set of vertices not yet included in MST + int max = -1, max_index = 0; + for (v = 0; v < number_task; v++) + if (mstSet[v] == false && key[v] > max) + max = key[v], max_index = v; + + int u = max_index; + // Add the picked vertex to the MST Set + mstSet[u] = true; + tab_SIGMA[tab_runner] = u; + tab_runner++; + + // Update key value and parent index of + // the adjacent vertices of the picked vertex. + // Consider only those vertices which are not + // yet included in MST + for (v = 0; v < number_task; v++) + // matrice_adjacence[u][v] is non zero only for adjacent vertices of m + // mstSet[v] is false for vertices not yet included in MST + // Update the key only if graph[u][v] is greater than key[v] + if (matrice_adjacence[u][v] && mstSet[v] == false && matrice_adjacence[u][v] > key[v]) + key[v] = matrice_adjacence[u][v]; + } + /* End of the MST algorithm */ + + /* Put last vertex in SIGMA */ + for (i = 0; i < number_task; i++) + { + if (mstSet[i] == false) + { + tab_SIGMA[number_task - 1] = i; + } + } + + _sched_visu_print_vector(tab_SIGMA, number_task, "tab_SIGMA[i] : "); + + i = 0; + + /* Filling our task list */ + temp_paquets->temp_pointer_1 = temp_paquets->first_link; + while (i != number_task) + { + if (tab_SIGMA[i] == temp_paquets->temp_pointer_1->index_package) + { + starpu_task_list_push_back(&SIGMA, starpu_task_list_pop_front(&temp_paquets->temp_pointer_1->sub_list)); + i++; + temp_paquets->temp_pointer_1 = temp_paquets->first_link; + } + else + { + temp_paquets->temp_pointer_1 = temp_paquets->temp_pointer_1->next; + } + } + + //Belady + //~ if (starpu_get_env_number_default("BELADY",0) == 1) + //~ { + //~ get_ordre_utilisation_donnee_mst(data, NB_TOTAL_DONNEES); + //~ } + + return SIGMA; +} + +static struct starpu_task *mst_pull_task(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + struct _starpu_HFP_sched_data *data = component->data; + int i = 0; + struct starpu_task *task = NULL; + + if (_starpu_HFP_do_schedule_done == true) + { + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + + /* If one or more task have been refused */ + data->p->temp_pointer_1 = data->p->first_link; + if (data->p->temp_pointer_1->next != NULL) + { + for (i = 0; i < _nb_gpus; i++) + { + if (to == component->children[i]) + { + break; + } + else + { + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + } + } + if (!starpu_task_list_empty(&data->p->temp_pointer_1->refused_fifo_list)) + { + task = starpu_task_list_pop_back(&data->p->temp_pointer_1->refused_fifo_list); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + _STARPU_SCHED_PRINT("Task %p is getting out of pull_task from fifo refused list on gpu %p\n",task, to); + return task; + } + /* If the linked list is empty, we can pull more tasks */ + if (_starpu_HFP_is_empty(data->p->first_link) == true) + { + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + return NULL; + } + else + { + task = _starpu_HFP_get_task_to_return(component, to, data->p, _nb_gpus); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + _STARPU_SCHED_PRINT("Task %p is getting out of pull_task from gpu %p\n", task, to); + return task; + } +} + return NULL; +} + +static int _get_number_GPU() +{ + int return_value = starpu_memory_nodes_get_count_by_kind(STARPU_CUDA_RAM); + + if (return_value == 0) /* We are not using GPUs so we are in an out-of-core case using CPUs. Need to return 1. 
If I want to deal with GPUs AND CPUs we need to adpt this function to return NGPU + 1 */ + { + return 1; + } + + return return_value; +} + +static int mst_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to) +{ + int i = 0; + struct _starpu_HFP_sched_data *data = component->data; + int didwork = 0; + + struct starpu_task *task; + task = starpu_sched_component_pump_to(component, to, &didwork); + if (task) + { + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + data->p->temp_pointer_1 = data->p->first_link; + int nb_gpu = _get_number_GPU(); + if (data->p->temp_pointer_1->next == NULL) + { + starpu_task_list_push_back(&data->p->temp_pointer_1->refused_fifo_list, task); + } + else + { + for (i = 0; i < nb_gpu; i++) + { + if (to == component->children[i]) + { + break; + } + else + { + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + } + starpu_task_list_push_back(&data->p->temp_pointer_1->refused_fifo_list, task); + } + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + } + else + { + /* Can I uncomment this part ? */ + //~ { + //~ if (didwork) + //~ fprintf(stderr, "pushed some tasks to %p\n", to); + //~ else + //~ fprintf(stderr, "I didn't have anything for %p\n", to); + //~ } + } + + /* There is room now */ + return didwork || starpu_sched_component_can_push(component, to); +} + +static int mst_can_pull(struct starpu_sched_component * component) +{ + return starpu_sched_component_can_pull(component); +} + +static void mst_do_schedule(struct starpu_sched_component *component) +{ + int i = 0; + struct starpu_task_list temp_task_list; + starpu_task_list_init(&temp_task_list); + int NB_TOTAL_DONNEES = 0; + struct _starpu_HFP_sched_data *data = component->data; + struct starpu_task *task = NULL; + _starpu_HFP_NT = 0; + int number_of_package_to_build = _get_number_GPU(); + _starpu_HFP_GPU_RAM_M = (starpu_memory_get_total(starpu_worker_get_memory_node(starpu_bitmap_first(&component->workers_in_ctx)))); + + /* If the linked list is empty, we can pull more tasks */ + if (_starpu_HFP_is_empty(data->p->first_link) == true) + { + if (!starpu_task_list_empty(&data->sched_list)) + { + _starpu_HFP_appli = starpu_task_get_name(starpu_task_list_begin(&data->sched_list)); + if (_starpu_HFP_hmetis != 0) + { + _starpu_hmetis_scheduling(data->p, &data->sched_list, number_of_package_to_build); + + /* Apply mst on each package */ + data->p->temp_pointer_1 = data->p->first_link; + for (i = 0; i < number_of_package_to_build; i++) + { + data->p->temp_pointer_1->sub_list = mst(data->p->temp_pointer_1->sub_list, data->p->temp_pointer_1->nb_task_in_sub_list); + data->p->temp_pointer_1 = data->p->temp_pointer_1->next; + } + _starpu_HFP_do_schedule_done = true; + return; + } + + /* Pulling all tasks and counting them */ + while (!starpu_task_list_empty(&data->sched_list)) + { + task = starpu_task_list_pop_front(&data->sched_list); + NB_TOTAL_DONNEES+=STARPU_TASK_GET_NBUFFERS(task); + _starpu_HFP_NT++; + _STARPU_SCHED_PRINT("%p\n",task); + starpu_task_list_push_back(&temp_task_list, task); + } + _STARPU_SCHED_PRINT("%d task(s) have been pulled\n", _starpu_HFP_NT); + //~ task = starpu_task_list_begin(&data->popped_task_list); + //~ printf("tache %p\n", task); + /* Apply mst on the task list */ + //~ data->p->temp_pointer_1->sub_list = mst(data->popped_task_list, NT, GPU_RAM_M); + data->p->temp_pointer_1->sub_list = mst(temp_task_list, _starpu_HFP_NT); + + _starpu_HFP_do_schedule_done = true; + } + } +} + +struct starpu_sched_component 
*starpu_sched_component_mst_create(struct starpu_sched_tree *tree, void *params STARPU_ATTRIBUTE_UNUSED) +{ + _starpu_HFP_hmetis = starpu_get_env_number_default("HMETIS", 0); + + struct starpu_sched_component *component = starpu_sched_component_create(tree, "mst"); + + struct _starpu_HFP_sched_data *data; + struct _starpu_HFP_my_list *my_data = malloc(sizeof(*my_data)); + struct _starpu_HFP_paquets *paquets_data = malloc(sizeof(*paquets_data)); + _STARPU_MALLOC(data, sizeof(*data)); + + _starpu_visu_init(); + + STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL); + starpu_task_list_init(&data->sched_list); + //~ starpu_task_list_init(&data->popped_task_list); + starpu_task_list_init(&my_data->sub_list); + starpu_task_list_init(&my_data->refused_fifo_list); + + my_data->next = NULL; + paquets_data->temp_pointer_1 = my_data; + paquets_data->first_link = paquets_data->temp_pointer_1; + data->p = paquets_data; + data->p->temp_pointer_1->nb_task_in_sub_list = 0; + data->p->temp_pointer_1->expected_time_pulled_out = 0; + + component->data = data; + component->do_schedule = mst_do_schedule; + component->push_task = mst_push_task; + component->pull_task = mst_pull_task; + component->can_push = mst_can_push; + component->can_pull = mst_can_pull; + + return component; +} + +static void initialize_mst_center_policy(unsigned sched_ctx_id) +{ + _nb_gpus = _get_number_GPU(); + starpu_sched_component_initialize_simple_scheduler((starpu_sched_component_create_t) starpu_sched_component_mst_create, NULL, + STARPU_SCHED_SIMPLE_DECIDE_MEMNODES | + STARPU_SCHED_SIMPLE_DECIDE_ALWAYS | + STARPU_SCHED_SIMPLE_FIFOS_BELOW | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_READY | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP | + STARPU_SCHED_SIMPLE_IMPL, sched_ctx_id); +} + +static void deinitialize_mst_center_policy(unsigned sched_ctx_id) +{ + struct starpu_sched_tree *tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id); + starpu_sched_tree_destroy(tree); +} + +static void get_current_tasks_mst(struct starpu_task *task, unsigned sci) +{ +#ifdef PRINT_PYTHON + task_currently_treated = task; +#endif + starpu_sched_component_worker_pre_exec_hook(task,sci); +} + +struct starpu_sched_policy _starpu_sched_mst_policy = +{ + .init_sched = initialize_mst_center_policy, + .deinit_sched = deinitialize_mst_center_policy, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .do_schedule = starpu_sched_tree_do_schedule, + .push_task = starpu_sched_tree_push_task, + .pop_task = starpu_sched_tree_pop_task, + .pre_exec_hook = get_current_tasks_mst, + .post_exec_hook = starpu_sched_component_worker_post_exec_hook, + .policy_name = "mst", + .policy_description = "Maximum Spanning Tree", + .worker_type = STARPU_WORKER_LIST, +}; diff --git a/src/sched_policies/random_order.c b/src/sched_policies/random_order.c new file mode 100644 index 0000000000..4157e23770 --- /dev/null +++ b/src/sched_policies/random_order.c @@ -0,0 +1,324 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * Copyright (C) 2013 Simon Archipoff + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. 
+ * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#define PRINTF /* O or 1 */ + +/* Structure used to store all the variable we need and the tasks of each package. Each link is a package */ +struct my_list +{ + int package_nb_data; + //~ int nb_task_in_sub_list; + int index_package; /* Used to write in Data_coordinates.txt and keep track of the initial index of the package */ + //~ starpu_data_handle_t * package_data; /* List of all the data in the packages. We don't put two times the duplicates */ + struct starpu_task_list sub_list; /* The list containing the tasks */ + struct my_list *next; +}; + +/* Structure used to access the struct my_list. There are also task's list */ +struct random_order_sched_data +{ + struct starpu_task_list popped_task_list; /* List used to store all the tasks at the beginning of the pull_task function */ + struct starpu_task_list random_list; /* List used to store all the tasks at the beginning of the pull_task function */ + struct starpu_task_list list_if_fifo_full; /* List used if the fifo list is not empty. It means that task from the last iteration haven't been pushed, thus we need to pop task from this list */ + + /* All the pointer use to navigate through the linked list */ + struct my_list *temp_pointer_1; + struct my_list *temp_pointer_2; + struct my_list *temp_pointer_3; + struct my_list *first_link; /* Pointer that we will use to point on the first link of the linked list */ + + struct starpu_task_list sched_list; + starpu_pthread_mutex_t policy_mutex; +}; + +/* Put a link at the beginning of the linked list */ +static void random_order_insertion(struct random_order_sched_data *a) +{ + struct my_list *new = malloc(sizeof(*new)); /* Creation of a new link */ + starpu_task_list_init(&new->sub_list); + new->next = a->temp_pointer_1; + a->temp_pointer_1 = new; +} + +/* Delete all the empty packages */ +static struct my_list* random_order_delete_link(struct random_order_sched_data* a) +{ + while (a->first_link != NULL && a->first_link->package_nb_data == 0) + { + a->temp_pointer_1 = a->first_link; + a->first_link = a->first_link->next; + free(a->temp_pointer_1); + } + if (a->first_link != NULL) + { + a->temp_pointer_2 = a->first_link; + a->temp_pointer_3 = a->first_link->next; + while (a->temp_pointer_3 != NULL) + { + while (a->temp_pointer_3 != NULL && a->temp_pointer_3->package_nb_data == 0) + { + a->temp_pointer_1 = a->temp_pointer_3; + a->temp_pointer_3 = a->temp_pointer_3->next; + a->temp_pointer_2->next = a->temp_pointer_3; + free(a->temp_pointer_1); + } + if (a->temp_pointer_3 != NULL) + { + a->temp_pointer_2 = a->temp_pointer_3; + a->temp_pointer_3 = a->temp_pointer_3->next; + } + } + } + return a->first_link; +} + +/* Pushing the tasks */ +static int random_order_push_task(struct starpu_sched_component *component, struct starpu_task *task) +{ + struct random_order_sched_data *data = component->data; + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_front(&data->sched_list, task); + starpu_push_task_end(task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + /* Tell below that they can now pull */ + component->can_pull(component); + return 0; +} + +/* The function that sort the 
tasks in packages */ +static struct starpu_task *random_order_pull_task(struct starpu_sched_component *component, struct starpu_sched_component *to) +{ + (void)to; + struct random_order_sched_data *data = component->data; + + int random_number = 0; + struct starpu_task *task1 = NULL; + struct starpu_task *temp_task_1 = NULL; + struct starpu_task *temp_task_2 = NULL; + + int NT = 0; int i = 0; int link_index = 0; int do_not_add_more = 0; + + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + + /* If one or more task have been refused */ + if (!starpu_task_list_empty(&data->list_if_fifo_full)) + { + task1 = starpu_task_list_pop_back(&data->list_if_fifo_full); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + return task1; + } + if (starpu_task_list_empty(&data->random_list)) + { + if (!starpu_task_list_empty(&data->sched_list)) + { + time_t start, end; time(&start); + while (!starpu_task_list_empty(&data->sched_list)) + { + task1 = starpu_task_list_pop_front(&data->sched_list); + NT++; + if (starpu_get_env_number_default("PRINTF",0) == 1) printf("%p\n",task1); + starpu_task_list_push_back(&data->popped_task_list,task1); + } + if (starpu_get_env_number_default("PRINTF",0) == 1) printf("%d task(s) have been pulled\n",NT); + + temp_task_1 = starpu_task_list_begin(&data->popped_task_list); + //~ data->temp_pointer_1->package_data = malloc(STARPU_TASK_GET_NBUFFERS(temp_task_1)*sizeof(data->temp_pointer_1->package_data[0])); + /* One task == one link in the linked list */ + do_not_add_more = NT - 1; + for (temp_task_1 = starpu_task_list_begin(&data->popped_task_list); temp_task_1 != starpu_task_list_end(&data->popped_task_list); temp_task_1 = temp_task_2) + { + //~ printf("ok0.5\n"); + temp_task_2 = starpu_task_list_next(temp_task_1); + temp_task_1 = starpu_task_list_pop_front(&data->popped_task_list); + //~ printf("ok0.6\n"); + data->temp_pointer_1->package_nb_data = 1; + //~ printf("ok0.7\n"); + /* We sort our data in the packages */ + /* Pushing the task and the number of the package in the package*/ + starpu_task_list_push_back(&data->temp_pointer_1->sub_list,temp_task_1); + //~ printf("ok0.8\n"); + data->temp_pointer_1->index_package = link_index; + /* Initialization of the lists last_packages */ + //~ printf("ok1\n"); + link_index++; + //~ data->temp_pointer_1->nb_task_in_sub_list ++; + + if(do_not_add_more != 0) + { + random_order_insertion(data); + } + do_not_add_more--; + } + data->first_link = data->temp_pointer_1; + int temp_NT = NT; + for (i = 0; i < temp_NT; i++) + { + data->temp_pointer_1 = data->first_link; + random_number = rand()%NT; + //~ printf("Il y a %d tâche, random = %d\n",NT,random_number); + while (random_number != 0) + { + data->temp_pointer_1 = data->temp_pointer_1->next; + random_number--; + } + data->temp_pointer_1->package_nb_data = 0; + starpu_task_list_push_back(&data->random_list,starpu_task_list_pop_front(&data->temp_pointer_1->sub_list)); + data->temp_pointer_1 = random_order_delete_link(data); + NT--; + //~ task1 = starpu_task_list_pop_front(&data->temp_pointer_1->sub_list); + } + //~ free(&data->temp_pointer_1->package_nb_data); + //~ free(data); + //~ random_order_free(data); + + time(&end); + int time_taken = end - start; + if (starpu_get_env_number_default("PRINTF",0) == 1) printf("Temps d'exec : %d secondes\n",time_taken); + FILE *f_time = fopen("Output_maxime/Execution_time_raw.txt","a"); + fprintf(f_time,"%d\n",time_taken); + fclose(f_time); + + task1 = starpu_task_list_pop_front(&data->random_list); + //~ free(data->temp_pointer_1->package_nb_data); + 
STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + if (starpu_get_env_number_default("PRINTF",0) == 1) printf("Task %p is getting out of pull_task\n",task1); + return task1; + } + else + { + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + //~ if (starpu_get_env_number_default("PRINTF",0) == 1) printf("Task %p is getting out of pull_task\n",task1); + return task1; + } + } + else + { + //~ task1 = starpu_task_list_pop_front(&data->popped_task_list); + task1 = starpu_task_list_pop_front(&data->random_list); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + if (starpu_get_env_number_default("PRINTF",0) == 1) printf("Task %p is getting out of pull_task\n",task1); + return task1; + } +} + +static int random_order_can_push(struct starpu_sched_component * component, struct starpu_sched_component * to) +{ + struct random_order_sched_data *data = component->data; + int didwork = 0; + + struct starpu_task *task; + task = starpu_sched_component_pump_to(component, to, &didwork); + + if (task) + { + //~ if (starpu_get_env_number_default("PRINTF",0) == 1) { fprintf(stderr, "oops, task %p got refused\n", task); } + /* Oops, we couldn't push everything, put back this task */ + STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex); + starpu_task_list_push_back(&data->list_if_fifo_full, task); + STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex); + } + else + { + /* Can I uncomment this part ? */ + //~ { + //~ if (didwork) + //~ fprintf(stderr, "pushed some tasks to %p\n", to); + //~ else + //~ fprintf(stderr, "I didn't have anything for %p\n", to); + //~ } + } + + /* There is room now */ + return didwork || starpu_sched_component_can_push(component, to); +} + +static int random_order_can_pull(struct starpu_sched_component * component) +{ + return starpu_sched_component_can_pull(component); +} + +struct starpu_sched_component *starpu_sched_component_random_order_create(struct starpu_sched_tree *tree, void *params STARPU_ATTRIBUTE_UNUSED) +{ + starpu_srand48(time(0)); + struct starpu_sched_component *component = starpu_sched_component_create(tree, "random_order"); + + struct random_order_sched_data *data; + _STARPU_CALLOC(data, 1, sizeof(*data)); + + STARPU_PTHREAD_MUTEX_INIT(&data->policy_mutex, NULL); + starpu_task_list_init(&data->sched_list); + starpu_task_list_init(&data->list_if_fifo_full); + starpu_task_list_init(&data->popped_task_list); + starpu_task_list_init(&data->random_list); + + struct my_list *my_data = malloc(sizeof(*my_data)); + my_data->next = NULL; + starpu_task_list_init(&my_data->sub_list); + data->temp_pointer_1 = my_data; + + component->data = data; + component->push_task = random_order_push_task; + component->pull_task = random_order_pull_task; + component->can_push = random_order_can_push; + component->can_pull = random_order_can_pull; + + return component; +} + +static void initialize_random_order_center_policy(unsigned sched_ctx_id) +{ + starpu_sched_component_initialize_simple_scheduler((starpu_sched_component_create_t) starpu_sched_component_random_order_create, NULL, + STARPU_SCHED_SIMPLE_DECIDE_MEMNODES | + STARPU_SCHED_SIMPLE_DECIDE_ALWAYS | + STARPU_SCHED_SIMPLE_FIFOS_BELOW | + STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP | + STARPU_SCHED_SIMPLE_IMPL, sched_ctx_id); +} + +static void deinitialize_random_order_center_policy(unsigned sched_ctx_id) +{ + struct starpu_sched_tree *tree = (struct starpu_sched_tree*)starpu_sched_ctx_get_policy_data(sched_ctx_id); + starpu_sched_tree_destroy(tree); +} + +struct starpu_sched_policy _starpu_sched_random_order_policy = +{ + .init_sched = 
initialize_random_order_center_policy, + .deinit_sched = deinitialize_random_order_center_policy, + .add_workers = starpu_sched_tree_add_workers, + .remove_workers = starpu_sched_tree_remove_workers, + .push_task = starpu_sched_tree_push_task, + .pop_task = starpu_sched_tree_pop_task, + .pre_exec_hook = starpu_sched_component_worker_pre_exec_hook, + .post_exec_hook = starpu_sched_component_worker_post_exec_hook, + .policy_name = "random_order", + .policy_description = "Description", + .worker_type = STARPU_WORKER_LIST, +}; diff --git a/src/sched_policies/sched_visu.c b/src/sched_policies/sched_visu.c new file mode 100644 index 0000000000..6a7c48b983 --- /dev/null +++ b/src/sched_policies/sched_visu.c @@ -0,0 +1,683 @@ +/* StarPU --- Runtime system for heterogeneous multicore architectures. + * + * Copyright (C) 2013-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria + * + * StarPU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or (at + * your option) any later version. + * + * StarPU is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * See the GNU Lesser General Public License in COPYING.LGPL for more details. + */ + +#include +#include +#include + +#ifdef PRINT_PYTHON +static int *_index_current_popped_task; +static int _index_current_popped_task_all_gpu; +static int *_index_current_popped_task_prefetch; +static int _index_current_popped_task_all_gpu_prefetch; +#endif +int _print3d; +int _print_in_terminal; +int _print_n; +#ifdef PRINT_PYTHON +static int index_task_currently_treated=0; +#endif +static int _index_current_task_for_visualization=0; +struct starpu_task *task_currently_treated = NULL; + +static char *_output_directory = NULL; +char *_sched_visu_get_output_directory() +{ + if (_output_directory == NULL) + { + _output_directory = starpu_getenv("STARPU_SCHED_OUTPUT"); + if (_output_directory == NULL) + _output_directory = "/tmp"; + _starpu_mkpath_and_check(_output_directory, S_IRWXU); + } + return _output_directory; +} + +void _sched_visu_init(int nb_gpus) +{ + (void)nb_gpus; +#ifdef PRINT_PYTHON + _index_current_popped_task = malloc(sizeof(int)*nb_gpus); + _index_current_popped_task_prefetch = malloc(sizeof(int)*nb_gpus); + _index_current_popped_task_all_gpu = 0; + _index_current_popped_task_all_gpu_prefetch = 0; +#endif + _print3d = starpu_get_env_number_default("STARPU_SCHED_PRINT3D", 0); + _print_in_terminal = starpu_get_env_number_default("STARPU_SCHED_PRINT_IN_TERMINAL", 0); + _print_n = starpu_get_env_number_default("STARPU_SCHED_PRINT_N", 0); +} + +/* Printing in a file the coordinates and the data loaded during prefetch for each task for visu python */ +void _sched_visu_print_data_to_load_prefetch(struct starpu_task *task, int gpu_id, int force) +{ + (void)task; + (void)gpu_id; + (void)force; +#ifdef PRINT_PYTHON + int current_gpu = gpu_id; + _index_current_popped_task_prefetch[current_gpu]++; /* Increment popped task on the right GPU */ + _index_current_popped_task_all_gpu_prefetch++; + int nb_data_to_load = 0; + int x_to_load = 0; + int y_to_load = 0; + int z_to_load = 0; + unsigned int i = 0; + /* Getting the number of data to load */ + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if(!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(task, i), gpu_id)) + { + 
nb_data_to_load++; + + /* To know if I load a line or a column */ + if (i == 0) + { + x_to_load = 1; + } + if (i == 1) + { + y_to_load = 1; + } + if (i == 2) + { + z_to_load = 1; + } + } + } + /* Printing the number of data to load */ + FILE *f2 = NULL; + int tab_coordinates[2]; + + if (strcmp(starpu_task_get_name(task), "starpu_sgemm_gemm") == 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + int size = strlen(_output_directory) + strlen("/Data_to_load_prefetch_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_prefetch_SCHEDULER.txt"); + if (_index_current_popped_task_all_gpu_prefetch == 1) + { + f2 = fopen(path, "w"); + } + else + { + f2 = fopen(path, "a"); + } + STARPU_ASSERT_MSG(f2, "cannot open file <%s>\n", path); + if (_print3d != 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + fprintf(f2, "%d %d", tab_coordinates[0], tab_coordinates[1]); + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 0), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d\n", tab_coordinates[0], x_to_load, y_to_load, z_to_load, current_gpu); + } + else + { + fprintf(f2, "%d %d %d %d %d\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, current_gpu); + } + } + else if (strcmp(starpu_task_get_name(task), "POTRF") == 0 || strcmp(starpu_task_get_name(task), "SYRK") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0 || strcmp(starpu_task_get_name(task), "GEMM") == 0) + { + /* Ouverture du fichier. */ + int size = strlen(_output_directory) + strlen("/Data_to_load_prefetch_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_prefetch_SCHEDULER.txt"); + if (_index_current_popped_task_all_gpu_prefetch == 1) + { + f2 = fopen(path, "w"); + fprintf(f2, "TASK COORDY COORDX XTOLOAD YTOLOAD ZTOLOAD GPU ITERATIONK\n"); + } + else + { + f2 = fopen(path, "a"); + } + + /* Impression du type de tâche. */ + if (strcmp(starpu_task_get_name(task), "POTRF") == 0) + { + fprintf(f2, "POTRF"); + } + else if (strcmp(starpu_task_get_name(task), "TRSM") == 0) + { + fprintf(f2, "TRSM"); + } + else + { + /* Cas SYRK et GEMM que je distingue avec la donnée en double pour SYRK. */ + if (STARPU_TASK_GET_HANDLE(task, 0) == STARPU_TASK_GET_HANDLE(task, 1)) + { + fprintf(f2, "SYRK"); + /* Attention pour SYRK il ne faut pas compter en double la donnée à charger. Donc je regarde si je l'a compté en double je fais --. */ + if(!starpu_data_is_on_node(STARPU_TASK_GET_HANDLE(task, 0), current_gpu)) + { + y_to_load = 0; + } + } + else + { + fprintf(f2, "GEMM"); + } + } + /* La je n'imprime que les coords de la dernière donnée de la tâche car c'est ce qui me donne la place dans le triangle de Cholesky. */ + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, STARPU_TASK_GET_NBUFFERS(task) - 1), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d %d %ld\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, z_to_load, current_gpu, task->iterations[0]); + + } + else + { + _STARPU_DISP("There is only support for GEMM and CHOLESKY currently. 
Task %s is not supported.\n", starpu_task_get_name(task)); + } + if (f2) fclose(f2); +#endif +} + +void _sched_visu_pop_ready_task(struct starpu_task *task) +{ + (void)task; +/* Getting the data we need to fetch for visualization */ +#ifdef PRINT_PYTHON + if (_index_current_task_for_visualization == 0) + { + _output_directory = _sched_visu_get_output_directory(); + } + + if (task != NULL) + { + int current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + _index_current_popped_task[current_gpu]++; /* Increment popped task on the right GPU */ + _index_current_popped_task_all_gpu++; + int nb_data_to_load = 0; + int x_to_load = 0; + int y_to_load = 0; + int z_to_load = 0; + unsigned i; + /* Getting the number of data to load */ + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if(!starpu_data_is_on_node_excluding_prefetch(STARPU_TASK_GET_HANDLE(task, i), current_gpu)) + { + nb_data_to_load++; + + /* To know if I load a line or a column. Attention ca marche pas si plus de 3 données dans la tâche. */ + if (i == 0) + { + x_to_load = 1; + } + else if (i == 1) + { + y_to_load = 1; + } + else if (i == 2) + { + z_to_load = 1; + } + else + { + perror("Cas pas géré dans get data to load.\n"); exit(0); + } + } + } + FILE *f2 = NULL; + int tab_coordinates[2]; + /* Cas 2D et 3D qui marche. */ + if (strcmp(starpu_task_get_name(task), "starpu_sgemm_gemm") == 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + int size = strlen(_output_directory) + strlen("/Data_to_load_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_SCHEDULER.txt"); + if (_index_current_popped_task_all_gpu == 1) + { + f2 = fopen(path, "w"); + } + else + { + f2 = fopen(path, "a"); + } + if (_print3d != 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + fprintf(f2, "%d %d", tab_coordinates[0], tab_coordinates[1]); + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 0), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d\n", tab_coordinates[0], x_to_load, y_to_load, z_to_load, current_gpu - 1); + } + else + { + fprintf(f2, "%d %d %d %d %d\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, current_gpu - 1); + } + } + else if (strcmp(starpu_task_get_name(task), "POTRF") == 0 || strcmp(starpu_task_get_name(task), "SYRK") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0 || strcmp(starpu_task_get_name(task), "GEMM") == 0) + { + /* Ouverture du fichier. */ + int size = strlen(_output_directory) + strlen("/Data_to_load_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_SCHEDULER.txt"); + + if (_index_current_popped_task_all_gpu == 1) + { + f2 = fopen(path, "w"); + fprintf(f2, "TASK COORDY COORDX XTOLOAD YTOLOAD ZTOLOAD GPU ITERATIONK\n"); + } + else + { + f2 = fopen(path, "a"); + } + + /* Impression du type de tâche. */ + if (strcmp(starpu_task_get_name(task), "chol_model_11") == 0 || strcmp(starpu_task_get_name(task), "POTRF") == 0) + { + fprintf(f2, "POTRF"); + } + else if (strcmp(starpu_task_get_name(task), "chol_model_21") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0) + { + fprintf(f2, "TRSM"); + } + else + { + /* Cas SYRK et GEMM que je distingue avec la donnée en double pour SYRK. 
*/ + if (STARPU_TASK_GET_HANDLE(task, 0) == STARPU_TASK_GET_HANDLE(task, 1)) + { + fprintf(f2, "SYRK"); + if(!starpu_data_is_on_node_excluding_prefetch(STARPU_TASK_GET_HANDLE(task, 0), current_gpu)) + { + y_to_load = 0; + } + } + else + { + fprintf(f2, "GEMM"); + } + } + + /* La je n'imprime que les coords de la dernière donnée de la tâche car c'est ce qui me donne la place dans le triangle de Cholesky. */ + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, STARPU_TASK_GET_NBUFFERS(task) - 1), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d %d %ld\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, z_to_load, current_gpu - 1, task->iterations[0]); + } + else + { + _STARPU_DISP("Dans get data to load je ne gère que GEMM et CHOLESKY. Task %s is not supported.\n", starpu_task_get_name(task)); + } + fclose(f2); + } +#endif +} + +/* Used for visualisation python */ +struct starpu_task *_sched_visu_get_data_to_load(unsigned sched_ctx) +{ +#ifndef PRINT_PYTHON + return starpu_sched_tree_pop_task(sched_ctx); +#else + struct starpu_task *task = starpu_sched_tree_pop_task(sched_ctx); + if (task != NULL) + { + //~ int current_gpu = starpu_worker_get_id(); + int current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id()); + //~ printf("Ngpu = %d current = %d task = %p\n", _nb_gpus, current_gpu, task); + _index_current_popped_task[current_gpu]++; /* Increment popped task on the right GPU */ + _index_current_popped_task_all_gpu++; + int nb_data_to_load = 0; + int x_to_load = 0; + int y_to_load = 0; + int z_to_load = 0; + /* Getting the number of data to load */ + unsigned i; + for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++) + { + if(!starpu_data_is_on_node_excluding_prefetch(STARPU_TASK_GET_HANDLE(task, i), current_gpu)) + { + nb_data_to_load++; + + /* To know if I load a line or a column. Attention ca marche pas si plus de 3 données dans la tâche. */ + if (i == 0) + { + x_to_load = 1; + } + else if (i == 1) + { + y_to_load = 1; + } + else if (i == 2) + { + z_to_load = 1; + } + else + { + perror("Cas pas géré dans get data to load.\n"); exit(0); + } + } + } + + //~ printf("%s in get_data_to_load.\n", starpu_task_get_name(task)); + + /* Printing the number of data to load */ + FILE *f2 = NULL; + + int tab_coordinates[2]; + + /* Cas 2D et 3D qui marche. 
*/ + if (strcmp(starpu_task_get_name(task), "starpu_sgemm_gemm") == 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + + int size = strlen(_output_directory) + strlen("/Data_to_load_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_SCHEDULER.txt"); + + if (_index_current_popped_task_all_gpu == 1) + { + f2 = fopen(path, "w"); + } + else + { + f2 = fopen(path, "a"); + } + if (_print3d != 0) + { + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates); + fprintf(f2, "%d %d", tab_coordinates[0], tab_coordinates[1]); + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 0), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d\n", tab_coordinates[0], x_to_load, y_to_load, z_to_load, current_gpu - 1); + } + else + { + fprintf(f2, "%d %d %d %d %d\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, current_gpu - 1); + } + } + else if (strcmp(starpu_task_get_name(task), "chol_model_11") == 0 || strcmp(starpu_task_get_name(task), "chol_model_21") == 0 || strcmp(starpu_task_get_name(task), "chol_model_22") == 0 || strcmp(starpu_task_get_name(task), "POTRF") == 0 || strcmp(starpu_task_get_name(task), "SYRK") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0 || strcmp(starpu_task_get_name(task), "GEMM") == 0) + { + /* Ouverture du fichier. */ + int size = strlen(_output_directory) + strlen("/Data_to_load_SCHEDULER.txt") + 1; + char path[size]; + snprintf(path, size, "%s%s", _output_directory, "/Data_to_load_SCHEDULER.txt"); + if (_index_current_popped_task_all_gpu == 1) + { + f2 = fopen(path, "w"); + fprintf(f2, "TASK COORDY COORDX XTOLOAD YTOLOAD ZTOLOAD GPU ITERATIONK\n"); + } + else + { + f2 = fopen(path, "a"); + } + + /* Impression du type de tâche. */ + if (strcmp(starpu_task_get_name(task), "chol_model_11") == 0 || strcmp(starpu_task_get_name(task), "POTRF") == 0) + { + fprintf(f2, "POTRF"); + } + else if (strcmp(starpu_task_get_name(task), "chol_model_21") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0) + { + fprintf(f2, "TRSM"); + } + else + { + /* Cas SYRK et GEMM que je distingue avec la donnée en double pour SYRK. */ + if (STARPU_TASK_GET_HANDLE(task, 0) == STARPU_TASK_GET_HANDLE(task, 1)) + { + fprintf(f2, "SYRK"); + /* Attention pour SYRK il ne faut pas compter en double la donnée à charger. Donc je regarde si je l'a compté en double je fais --. */ + if(!starpu_data_is_on_node_excluding_prefetch(STARPU_TASK_GET_HANDLE(task, 0), current_gpu)) + { + y_to_load = 0; + } + } + else + { + fprintf(f2, "GEMM"); + } + } + + /* La je n'imprime que les coords de la dernière donnée de la tâche car c'est ce qui me donne la place dans le triangle de Cholesky. */ + starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, STARPU_TASK_GET_NBUFFERS(task) - 1), 2, tab_coordinates); + fprintf(f2, " %d %d %d %d %d %d %ld\n", tab_coordinates[0], tab_coordinates[1], x_to_load, y_to_load, z_to_load, current_gpu - 1, task->iterations[0]); + } + else + { + printf("Dans get data to load je ne gère que GEMM et CHOLESKY. Task %s is not supported.\n", starpu_task_get_name(task)); + exit(0); + } + fclose(f2); + } + return task; +#endif +} + +/* Visu python. + * Print in a file the effective order + * (we do it from get_current_task because the ready heuristic + * can change our planned order). + * Also print in a file each task and it data to compute later the data needed + * to load at each iteration. 
+ */
+void _sched_visu_print_effective_order_in_file(struct starpu_task *task, int index_task)
+{
+	FILE *f = NULL;
+	int tab_coordinates[2];
+
+	int current_gpu = starpu_worker_get_memory_node(starpu_worker_get_id());
+	/* For the coordinates, it writes the coordinates (with Z for 3D), then the GPU and then the number of data needed to load for this task */
+	//~ if (_print_n != 0 && (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") == 0))
+	if (strcmp(_starpu_HFP_appli, "starpu_sgemm_gemm") == 0)
+	{
+		int size = strlen(_output_directory) + strlen("/Data_coordinates_order_last_SCHEDULER.txt") + 1;
+		char path[size];
+		snprintf(path, size, "%s%s", _output_directory, "/Data_coordinates_order_last_SCHEDULER.txt");
+		if (index_task == 0)
+		{
+			f = fopen(path, "w");
+		}
+		else
+		{
+			f = fopen(path, "a");
+		}
+		/* For a 3D matrix, also retrieve the Z coordinate */
+		if (_print3d != 0)
+		{
+			/* 3 for 3D no ? */
+			starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates);
+			fprintf(f, "%d %d", tab_coordinates[0], tab_coordinates[1]);
+
+			/* TODO: to be removed */
+			//~ printf("Tâche n°%d %p : x = %d | y = %d | ", index_task, task, temp_tab_coordinates[0], temp_tab_coordinates[1]);
+
+			starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 0), 2, tab_coordinates);
+			fprintf(f, " %d %d\n", tab_coordinates[0], current_gpu - 1);
+
+			/* TODO: to be removed */
+			//~ printf("z = %d\n", temp_tab_coordinates[0]);
+		}
+		else
+		{
+			starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, 2), 2, tab_coordinates);
+			fprintf(f, "%d %d %d\n", tab_coordinates[0], tab_coordinates[1], current_gpu - 1);
+		}
+		fclose(f);
+		//~ if (index_task == NT - 1)
+		//~ {
+			//~ if (_print3d == 0)
+			//~ {
+				//~ visualisation_tache_matrice_format_tex_with_data_2D();
+			//~ }
+		//~ }
+	}
+	else if (strcmp(starpu_task_get_name(task), "chol_model_11") == 0 || strcmp(starpu_task_get_name(task), "chol_model_21") == 0 || strcmp(starpu_task_get_name(task), "chol_model_22") == 0 || strcmp(starpu_task_get_name(task), "POTRF") == 0 || strcmp(starpu_task_get_name(task), "SYRK") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0 || strcmp(starpu_task_get_name(task), "GEMM") == 0)
+	{
+		int size = strlen(_output_directory) + strlen("/Data_coordinates_order_last_SCHEDULER.txt") + 1;
+		char path[size];
+		snprintf(path, size, "%s%s", _output_directory, "/Data_coordinates_order_last_SCHEDULER.txt");
+		if (index_task == 0)
+		{
+			/* Open the file. */
+			f = fopen(path, "w");
+			fprintf(f, "TASK COORDY COORDX GPU ITERATIONK\n");
+		}
+		else
+		{
+			f = fopen(path, "a");
+		}
+
+		/* Print the task type. */
+		if (strcmp(starpu_task_get_name(task), "chol_model_11") == 0 || strcmp(starpu_task_get_name(task), "POTRF") == 0)
+		{
+			fprintf(f, "POTRF");
+		}
+		else if (strcmp(starpu_task_get_name(task), "chol_model_21") == 0 || strcmp(starpu_task_get_name(task), "TRSM") == 0)
+		{
+			fprintf(f, "TRSM");
+		}
+		else
+		{
+			/* SYRK and GEMM cases, distinguished by the duplicated data handle of SYRK. */
+			if (STARPU_TASK_GET_HANDLE(task, 0) == STARPU_TASK_GET_HANDLE(task, 1))
+			{
+				fprintf(f, "SYRK");
+			}
+			else
+			{
+				fprintf(f, "GEMM");
+			}
+		}
+
+		/* Only print the coordinates of the last data of the task, since they give the position in the Cholesky triangle. */
+		starpu_data_get_coordinates_array(STARPU_TASK_GET_HANDLE(task, STARPU_TASK_GET_NBUFFERS(task) - 1), 2, tab_coordinates);
+		fprintf(f, " %d %d %d %ld\n", tab_coordinates[0], tab_coordinates[1], current_gpu - 1, task->iterations[0]);
+
+		fclose(f);
+	}
+	else
+	{
+		printf("print_effective_order_in_file only handles GEMM and CHOLESKY, task %s is not supported.\n", starpu_task_get_name(task));
+		exit(0);
+	}
+}
+
+/* Printing each package and its content for visualisation */
+void _sched_visu_print_packages_in_terminal(struct _starpu_HFP_paquets *a, int nb_of_loop, const char *msg)
+{
+	if (_print_in_terminal != 1) return;
+	fprintf(stderr, "%s\n", msg);
+	int link_index = 0;
+	struct starpu_task *task;
+	a->temp_pointer_1 = a->first_link;
+	while (a->temp_pointer_1 != NULL)
+	{
+		link_index++; a->temp_pointer_1 = a->temp_pointer_1->next;
+	}
+	a->temp_pointer_1 = a->first_link;
+	printf("-----\nWent through the while loop %d time(s) and built %d package(s)\n",nb_of_loop,link_index);
+	printf("-----\n");
+	link_index = 0;
+	while (a->temp_pointer_1 != NULL)
+	{
+		printf("Package %d contains %d task(s) and %d data, expected task time = %f, expected package time = %f, split last package = %d\n",link_index,a->temp_pointer_1->nb_task_in_sub_list, a->temp_pointer_1->package_nb_data,a->temp_pointer_1->expected_time, a->temp_pointer_1->expected_package_computation_time, a->temp_pointer_1->split_last_ij);
+		for (task = starpu_task_list_begin(&a->temp_pointer_1->sub_list); task != starpu_task_list_end(&a->temp_pointer_1->sub_list); task = starpu_task_list_next(task))
+		{
+			printf("%p : ",task);
+			unsigned i;
+			for (i = 0; i < STARPU_TASK_GET_NBUFFERS(task); i++)
+			{
+				printf("%p ", STARPU_TASK_GET_HANDLE(task, i));
+			}
+			printf("\n");
+		}
+		link_index++;
+		a->temp_pointer_1 = a->temp_pointer_1->next;
+		printf("-----\n");
+	}
+	a->temp_pointer_1 = a->first_link;
+}
+
+/* Called by DARTS and HFP for the python visualization. */
+void _sched_visu_get_current_tasks(struct starpu_task *task, unsigned sci)
+{
+#ifdef PRINT_PYTHON
+	if (index_task_currently_treated == 0)
+	{
+		_starpu_HFP_initialize_global_variable(task);
+	}
+	_sched_visu_print_effective_order_in_file(task, index_task_currently_treated);
+	task_currently_treated = task;
+	index_task_currently_treated++;
+#endif
+	starpu_sched_component_worker_pre_exec_hook(task, sci);
+}
+
+void _sched_visu_get_current_tasks_for_visualization(struct starpu_task *task, unsigned sci)
+{
+	(void)task;
+	(void)sci;
+#ifdef PRINT_PYTHON
+	if (_index_current_task_for_visualization == 0)
+	{
+		_starpu_HFP_initialize_global_variable(task);
+	}
+	_sched_visu_print_effective_order_in_file(task, _index_current_task_for_visualization);
+	task_currently_treated = task;
+	index_task_currently_treated++;
+	_index_current_task_for_visualization++;
+#endif
+}
+
+void _sched_visu_print_matrix(int **matrix, int x, int y, char *msg)
+{
+	(void)matrix;
+	(void)x;
+	(void)y;
+	(void)msg;
+#ifdef PRINT
+	_STARPU_SCHED_PRINT("%s", msg);
+	int i;
+	for (i = 0; i < x; i++)
+	{
+		int j;
+		for (j = 0; j < y; j++)
+		{
+			printf("%d ", matrix[i][j]);
+		}
+		printf("\n");
+	}
+#endif
+}
+
+void _sched_visu_print_vector(int *vector, int x, char *msg)
+{
+	(void)vector;
+	(void)x;
+	(void)msg;
+#ifdef PRINT
+	_STARPU_SCHED_PRINT("%s", msg);
+	int i;
+	for (i = 0; i < x; i++)
+	{
+		printf("%d ", vector[i]);
+	}
+	printf("\n");
+#endif
+}
+
+void _sched_visu_print_data_for_task(struct starpu_task *task, const char *msg)
+{
+	(void)task;
+	(void)msg;
+#ifdef PRINT
+	_STARPU_SCHED_PRINT(msg, task);
+	unsigned x;
+	for (x = 0; x < STARPU_TASK_GET_NBUFFERS(task); x++)
+	{
+		printf("%p ", STARPU_TASK_GET_HANDLE(task, x));
+	}
+	printf("\n");
+#endif
+}
diff --git a/src/sched_policies/sched_visu.h b/src/sched_policies/sched_visu.h
new file mode 100644
index 0000000000..6d1fb1a818
--- /dev/null
+++ b/src/sched_policies/sched_visu.h
@@ -0,0 +1,74 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __SCHED_VISU_H__
+#define __SCHED_VISU_H__
+
+// environment variables:
+// STARPU_SCHED_PRINT_IN_TERMINAL
+//     0: print nothing; 1: print in the terminal and also fill the data
+//     coordinate order, task order, etc., so it can take more time.
+// STARPU_SCHED_PRINT_N
+//     to specify the value of N for the visualization in schedulers
+//     that do not count the total number of tasks. Also use
+//     STARPU_SCHED_PRINT3D=1 or 2 so we know we are in 3D.
+// STARPU_SCHED_PRINT_TIME
+//     to display the execution time of the functions in HFP. With 1,
+//     print at the 11th iteration and compute the average. With 2,
+//     print at the first iteration. Useful so that it works with Grid5k.
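As a usage sketch only (the helper function below is hypothetical and not part of this patch; starpu_get_env_number_default() is the helper already used throughout sched_visu.c, and <stdio.h> is assumed), a scheduler would typically read these variables once at initialization:

static void read_visu_env_sketch(void)
{
	/* Hypothetical helper, for illustration only. */
	/* 0: nothing is printed; 1: print in the terminal and fill the visualization files */
	int print_in_terminal = starpu_get_env_number_default("STARPU_SCHED_PRINT_IN_TERMINAL", 0);
	/* N is the side of the tiled matrix, as passed to the application (e.g. -nblocks N) */
	int print_n = starpu_get_env_number_default("STARPU_SCHED_PRINT_N", 0);
	/* non-zero when the application is a 3D kernel such as GEMM */
	int print3d = starpu_get_env_number_default("STARPU_SCHED_PRINT3D", 0);
	if (print_in_terminal)
		fprintf(stderr, "scheduler visualization enabled: N = %d, 3D = %d\n", print_n, print3d);
}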
+
+#include
+#include
+#include
+
+#pragma GCC visibility push(hidden)
+
+#ifdef STARPU_DARTS_VERBOSE
+#define PRINT /* To be uncommented to display the printfs in the code, the time measurements and the writes to files. Meant to replace HFP's PRINTF environment variable. For now PRINTF=1 is still needed, for the visualizations for instance. Note that for DARTS, both PRINTF=1 and PRINT are needed for the visualizations at the moment. */
+#define PRINT_PYTHON /* Python visualization */
+#endif
+
+#ifdef PRINT
+# define _STARPU_SCHED_PRINT(fmt, ...) do { fprintf(stderr, fmt, ## __VA_ARGS__); fflush(stderr); } while(0)
+#else
+# define _STARPU_SCHED_PRINT(fmt, ...) do { } while (0)
+#endif
+
+extern int _print3d;
+extern int _print_in_terminal;
+extern int _print_n;
+extern struct starpu_task *task_currently_treated;
+
+void _sched_visu_init(int nb_gpus);
+char *_sched_visu_get_output_directory();
+void _sched_visu_print_data_to_load_prefetch(struct starpu_task *task, int gpu_id, int force);
+void _sched_visu_pop_ready_task(struct starpu_task *task);
+struct starpu_task *_sched_visu_get_data_to_load(unsigned sched_ctx);
+void _sched_visu_print_packages_in_terminal(struct _starpu_HFP_paquets *a, int nb_of_loop, const char *msg);
+void _sched_visu_print_effective_order_in_file(struct starpu_task *task, int index_task);
+void _sched_visu_get_current_tasks(struct starpu_task *task, unsigned sci);
+void _sched_visu_get_current_tasks_for_visualization(struct starpu_task *task, unsigned sci);
+
+void _sched_visu_print_matrix(int **matrix, int x, int y, char *msg);
+void _sched_visu_print_vector(int *vector, int x, char *msg);
+void _sched_visu_print_data_for_task(struct starpu_task *task, const char *msg);
+
+#pragma GCC visibility pop
+
+#endif // __SCHED_VISU_H__
diff --git a/tools/darts/color_darts.py b/tools/darts/color_darts.py
new file mode 100644
index 0000000000..82143fab28
--- /dev/null
+++ b/tools/darts/color_darts.py
@@ -0,0 +1,109 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+# + +# Return a color to each task when building a visualization + +import matplotlib.pyplot as plt +import numpy as np +import sys +import math +from colour import Color + +def gradiant_color(gpu, order, number_task_gpu): + r = 0 + g = 0 + b = 0 + if (gpu == 0): + r = 1 + elif (gpu == 1): + g = 1 + elif (gpu == 2): + r = 73/255 + g = 116/255 + b = 1 + elif (gpu == 3): + r = 1 + b = 1 + elif (gpu == 4): + g = 1 + b = 1 + elif (gpu == 5): + r = 1 + b = 1 + elif (gpu == 6): + r = 1 + g = 0.5 + b = 0.5 + elif (gpu == 7): + r = 0.5 + g = 1 + b = 0.5 + else: + r = 1/gpu + g = 1/gpu + b = 1/gpu + + if (r != 0): + r = r - (r*order)/(number_task_gpu*1.3) # The multiplier help avoid darker colors + if (g != 0): + g = g - (g*order)/(number_task_gpu*1.5) + if (b != 0): + b = b - (b*order)/(number_task_gpu*1.5) + + if (r != 0 and g == 0 and b == 0): + g = 0.3 - (r*order)/(number_task_gpu*1.2) + b = 0.3 - (r*order)/(number_task_gpu*1.2) + elif (r == 0 and g != 0 and b == 0): + r = 0.3 - (g*order)/(number_task_gpu*1.5) + b = 0.3 - (g*order)/(number_task_gpu*1.5) + elif (r == 0 and g == 0 and b != 0): + g = 0.3 - (b*order)/(number_task_gpu*1.5) + r = 0.3 - (b*order)/(number_task_gpu*1.5) + return(r, g, b) + +# Multiple colors for the same GPU +def gradiant_multiple_color(order, number_task_gpu, NGPU, current_gpu): + pas = 500 # The pas is used to use a new color every pas tasks + + color_list = ((1,0,0), (0,1,0), (0,0,1), (1,1,0), (1,0,1), (0,1,1), (1,0.5,0.5), (0.5,1,0.5), (0.5,0.5,1), (1,0.25,0.25), (0.25,1,0.25), (0.25,0.25,1), (1,0.75,0.75), (0.75,0.75,1), (0.75,1,0.75), (1,0.5,0), (1,0,0.5), (0.5,1,0), (0.5,0,1), (0,1,0.5), (0,0.5,1), (1,0.25,0), (1,0,0.25), (0.25,1,0), (0.25,0,1), (0,1,0.25), (0,0.25,1)) + + triplet_index = order//pas + if triplet_index >= len(color_list): + triplet_index = len(color_list) - 1 + + r, g, b = color_list[triplet_index] + + order = order%pas + + multiplier_to_lighten_up = 1.8 + if (r != 0): + r = r - (r*order)/(pas*multiplier_to_lighten_up) + if (g != 0): + g = g - (g*order)/(pas*multiplier_to_lighten_up) + if (b != 0): + b = b - (b*order)/(pas*multiplier_to_lighten_up) + + if (r != 0 and g == 0 and b == 0): + g = 0.3 - (r*order)/(pas*1.2) + b = 0.3 - (r*order)/(pas*1.2) + elif (r == 0 and g != 0 and b == 0): + r = 0.3 - (g*order)/(pas*1.5) + b = 0.3 - (g*order)/(pas*1.5) + elif (r == 0 and g == 0 and b != 0): + g = 0.3 - (b*order)/(pas*1.5) + r = 0.3 - (b*order)/(pas*1.5) + + return(r, g, b) diff --git a/tools/darts/example_script_visualization_darts.sh b/tools/darts/example_script_visualization_darts.sh new file mode 100644 index 0000000000..3a3ee88381 --- /dev/null +++ b/tools/darts/example_script_visualization_darts.sh @@ -0,0 +1,60 @@ +# StarPU --- Runtime system for heterogeneous multicore architectures. +# +# Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria +# +# StarPU is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# StarPU is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the GNU Lesser General Public License in COPYING.LGPL for more details. +# + +# Script that launch an experiment with darts and then plot a visualization of the execution. +# Requirements for a visualization: +# 1. 
Configure with the options --enable-darts-stats --enable-darts-verbose +# 2. Use the following environemnt variable: PRINT_N=$((N)) with N the side of the matrix used in the application +# 3. To launch this script from the starpu/ folder use: +# bash tools/darts/example_script_visualization_darts.sh $PATH_TO_STARPU_FOLDER N Application NGPU scheduler block_size memory_limitation_of_the_gpus $PATH_TO_PERFMODEL (optional, can also be left empty if the experiment is not done in simulation) hostname (optional if you are not using simulation) +# For instance it can be: bash tools/darts/example_script_visualization_darts.sh /home/name/ 15 Cholesky 1 darts 960 2000 /home/name/starpu/tools/perfmodels/sampling/ attila +# 4. If your targeted application is Cholesky, use -niter 1 +# 5. If your targeted application is Gemm, use -iter 1 +# The output image will be saved in the starpu/ folder + +make -j 6 +PATH_STARPU=$1 +N=$2 +DOSSIER=$3 +NGPU=$4 +ORDO=$5 +block_size=$6 +CM=$7 +OUTPUT_PATH="/tmp/" +SAVE_DIRECTORY="" +if (( $# > 7 )); +then + echo "simulation" + export STARPU_PERF_MODEL_DIR=$8 + HOST=$9 +else + echo "no simulation" +fi +ulimit -S -s 5000000 + +if [ $DOSSIER = "Matrice_ligne" ] +then + STARPU_SCHED_OUTPUT=${OUTPUT_PATH} STARPU_SCHED=${ORDO} PRINT_IN_TERMINAL=1 PRINT_N=$((N)) STARPU_NTASKS_THRESHOLD=30 STARPU_CUDA_PIPELINE=5 STARPU_SIMGRID_CUDA_MALLOC_COST=0 STARPU_MINIMUM_CLEAN_BUFFERS=0 STARPU_TARGET_CLEAN_BUFFERS=0 STARPU_LIMIT_CUDA_MEM=$((CM)) STARPU_NCPU=0 STARPU_NCUDA=$((NGPU)) STARPU_NOPENCL=0 STARPU_HOSTNAME=${HOST} ./examples/mult/sgemm -xy $((block_size*N)) -nblocks $((N)) -iter 1 + + python3 ${PATH_STARPU}/starpu/tools/darts/visualization_darts.py ${N} ${ORDO} ${NGPU} ${DOSSIER} 1 ${CM} ${block_size} ${OUTPUT_PATH} +fi + +if [ $DOSSIER = "Cholesky" ] +then + STARPU_SCHED_OUTPUT=${OUTPUT_PATH} STARPU_SIMGRID_CUDA_MALLOC_COST=0 STARPU_HOSTNAME=${HOST} PRINT_N=$((N)) DOPT_SELECTION_ORDER=7 STARPU_LIMIT_CUDA_MEM=$((CM)) DEPENDANCES=1 APP=1 EVICTION_STRATEGY_DYNAMIC_DATA_AWARE=1 STARPU_SCHED=${ORDO} STARPU_NTASKS_THRESHOLD=$((TH)) STARPU_CUDA_PIPELINE=$((CP)) STARPU_EXPECTED_TRANSFER_TIME_WRITEBACK=1 STARPU_MINIMUM_CLEAN_BUFFERS=0 STARPU_TARGET_CLEAN_BUFFERS=0 STARPU_NCPU=0 STARPU_NCUDA=$((NGPU)) STARPU_NOPENCL=0 ./examples/cholesky/cholesky_implicit -size $((block_size*N)) -nblocks $((N)) -niter 1 + + python3 ${PATH_STARPU}/starpu/tools/darts/visualization_darts.py ${N} ${ORDO} ${NGPU} ${DOSSIER} 1 ${CM} ${block_size} ${OUTPUT_PATH} +fi diff --git a/tools/darts/visualization_darts.py b/tools/darts/visualization_darts.py new file mode 100644 index 0000000000..d888d14450 --- /dev/null +++ b/tools/darts/visualization_darts.py @@ -0,0 +1,1023 @@ +# StarPU --- Runtime system for heterogeneous multicore architectures. +# +# Copyright (C) 2020-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria +# +# StarPU is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# StarPU is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See the GNU Lesser General Public License in COPYING.LGPL for more details. 
+# + +# Use the output of an execution using dmdar or darts with --enable-darts-stats --enable-darts-verbose enabled to build a visualization in svg, saved in the starpu/ repository + +import matplotlib.pyplot as plt +import numpy as np +import sys +import math +import matplotlib.patheffects as PathEffects +from color_darts import gradiant_color +from color_darts import gradiant_multiple_color + +# Give a color to a gpu +def gpu_color(gpu): + r = 0 + g = 0 + b = 0 + if (gpu == 0): + r = 1 + elif (gpu == 1): + g = 1 + elif (gpu == 2): + r = 73/255 + g = 116/255 + b = 1 + elif (gpu == 3): + r = 1 + b = 1 + elif (gpu == 4): + g = 1 + b = 1 + elif (gpu == 5): + r = 1 + b = 1 + elif (gpu == 6): + r = 1 + g = 0.5 + b = 0.5 + elif (gpu == 7): + r = 0.5 + g = 1 + b = 0.5 + else: + r = 1/gpu + g = 1/gpu + b = 1/gpu + + return (r, g, b) + +# Print a line on the side of the matrix depending on wheter or not a data load was necessary +def lignes_sur_le_cote(case, axe, epaisseur, gpu, line_type): + epaisseur = epaisseur/2 + + if (gpu == 0): + decalage = 3 + elif (gpu == 1): + decalage = 4 + elif (gpu == 2): + decalage = 5 + elif (gpu == 3): + decalage = 6 + elif (gpu == 4): + decalage = 7 + elif (gpu == 5): + decalage = 8 + elif (gpu == 6): + decalage = 9 + elif (gpu == 7): + decalage = 10 + else: + decalage = 3 + gpu + + if (axe == "x"): + trans = ax.get_xaxis_transform() + ax.annotate('', xy = (case, -0.1), xycoords=trans, ha = "center", va = "top") + return ax.plot([case - 0.49, case + 0.49],[-.02 * decalage, -.02 * decalage], color = gpu_color(gpu), linewidth = epaisseur, transform = trans, clip_on = False, linestyle = line_type) + elif (axe == "y"): + trans = ax.get_yaxis_transform() + return ax.plot([-.02 * decalage, -.02 * decalage], [case - 0.49, case + 0.49], color = gpu_color(gpu), linewidth = epaisseur, transform = trans, clip_on = False, linestyle = line_type) + +def custom_lignes_sur_le_cote(case, axe, epaisseur, gpu, line_type, decalage): + if (axe == "x"): + trans = ax.get_xaxis_transform() + ax.annotate('', xy = (case, -0.1), xycoords=trans, ha = "center", va = "top") + return ax.plot([case - 0.45, case + 0.45],[-.02 * decalage, -.02 * decalage], color = gpu_color(gpu), linewidth = epaisseur, transform = trans, clip_on = False, linestyle = line_type) + elif (axe == "y"): + trans = ax.get_yaxis_transform() + return ax.plot([-.02 * decalage, -.02 * decalage], [case - 0.45, case + 0.45], color = gpu_color(gpu), linewidth = epaisseur, transform = trans, clip_on = False, linestyle = line_type) + +def data_sur_le_cote_3D(x, y, axe, line_width, gpu, line_type, i, j, z): + if (gpu == 0): + decalage = 0 + elif (gpu == 1): + decalage = 0.2 + elif (gpu == 2): + decalage = 0.4 + elif (gpu == 3): + decalage = 0.6 + elif (gpu == 4): + decalage = 0.8 + elif (gpu == 5): + decalage = 1 + elif (gpu == 6): + decalage = 1.2 + elif (gpu == 7): + decalage = 1.4 + else: + decalage = 1.6 + 0.2*(gpu%8) + + if (axe == "x"): + return ax[i, j].plot([x - 0.49, x + 0.49], [z + decalage, z + decalage], color = gpu_color(gpu), linewidth = line_width, linestyle = line_type) + elif (axe == "y"): + return ax[i, j].plot([z + decalage, z + decalage], [y - 0.49, y + 0.49], color = gpu_color(gpu), linewidth = line_width, linestyle = line_type) + elif (axe == "z"): + return ax[i, j].plot([x - 0.49, x + 0.49 - decalage], [y - 0.49 + decalage, y + 0.49], color = gpu_color(gpu), linewidth = line_width, linestyle = line_type) + else: + sys.exit("Axe must be x, y or z in def data_sur_le_cote_3D(x, y, axe, epaisseur, gpu, 
line_type, i, j)") + +# Printing in a separate white matrix the data loaded with a heat map +def data_sur_le_cote_3D_heat_map(x, y, axe, heat, gpu, i, j, z): + r = 0 + g = 0 + b = 0 + if (heat == 1): + g = 1 + elif (heat == 2): + r = 1 + g = 127/255 + elif (heat == 3): + r = 1 + elif (heat == 4): + r = 0.5 + b = 0.5 + else: + sys.exit("Heat must be 1, 2, 3 or 4 in data_sur_le_cote_3D_heat_map") + + if (axe == "z"): + m[i, j][x, y, :] = (r, g, b) + else: + sys.exit("Axe must be z in data_sur_le_cote_3D_heat_map") + +def tache_load_balanced(x, y, gpu): + ax.plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = gpu_color(gpu), clip_on = False) + ax.plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + ax.plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + ax.plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + return + +def tache_load_balanced_3D(x, y, gpu, i, j): + ax[i, j].plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = gpu_color(gpu), clip_on = False) + ax[i, j].plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + ax[i, j].plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + ax[i, j].plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = gpu_color(gpu), clip_on = False) + return + +def line_to_load(x, y): + return plt.plot([x - 0.44, x + 0.44], [y, y], color='#FFBB96', lw = 4.2, zorder = 5) + +def column_to_load(x, y): + return plt.plot([x, x], [y - 0.44, y + 0.44], '#FFBB96', lw = 4.2, zorder = 5) + +def line_to_load_prefetch(x, y): + # ~ return plt.plot([x - 0.48, x + 0.48], [y, y], 'white', lw = 4.2, zorder = 5, linestyle='dotted') + return plt.plot([x - 0.48, x + 0.48], [y, y], '#FFBB96', lw = 4.2, zorder = 5, linestyle='dotted') + +def column_to_load_prefetch(x, y): + return plt.plot([x, x], [y - 0.48, y + 0.48], '#FFBB96', lw = 4.2, zorder = 5, linestyle='dotted') + +def line_to_load_3D(x, y, i, j): + return ax[i, j].plot([x - 0.44, x + 0.44], [y, y], '#FFBB96', lw = 4.2, zorder = 5) + +def column_to_load_3D(x, y, i, j): + return ax[i, j].plot([x, x], [y - 0.44, y + 0.44], '#FFBB96', lw = 4.2, zorder = 5) + +def Z_to_load_3D(x, y, i, j): + return ax[i, j].plot([x - 0.44, x + 0.44], [y - 0.44, y + 0.44], '#FFBB96', lw = 4.2, zorder = 5) + +def line_to_load_3D_prefetch(x, y, i, j): + return ax[i, j].plot([x - 0.48, x + 0.48], [y, y], '#FFBB96', lw = 4.2, zorder = 5, linestyle='dotted') + +def column_to_load_3D_prefetch(x, y, i, j): + return ax[i, j].plot([x, x], [y - 0.48, y + 0.48], '#FFBB96', lw = 4.2, zorder = 5, linestyle='dotted') + +def Z_to_load_3D_prefetch(x, y, i, j): + return ax[i, j].plot([x - 0.48, x + 0.48], [y - 0.48, y + 0.48], '#FFBB96', lw = 4.2, zorder = 5, linestyle='dotted') + +def sous_paquets(x, y, sous_paquet): + return ax.annotate(sous_paquet, xy = (x, y), ha = "center") + +def separation_sous_paquets(x, y, x_bis, y_bis): + if (x == 0 and x_bis == N - 1): + return ax.plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (x == N - 1 and x_bis == 0): + return ax.plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y == 0 and y_bis == N - 1): + return ax.plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y == N - 1 and y_bis == 0): + return ax.plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = "black", 
linewidth = 4, clip_on = False) + elif (x < x_bis): + return ax.plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (x > x_bis): + return ax.plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y < y_bis): + return ax.plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y > y_bis): + return ax.plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = "black", linewidth = 4, clip_on = False) + +def separation_sous_paquets_3D(x, y, x_bis, y_bis, i, j): + if (x == 0 and x_bis == N - 1): + return ax[i, j].plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (x == N - 1 and x_bis == 0): + return ax[i, j].plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y == 0 and y_bis == N - 1): + return ax[i, j].plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y == N - 1 and y_bis == 0): + return ax[i, j].plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (x < x_bis): + return ax[i, j].plot([x + 0.49, x + 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (x > x_bis): + return ax[i, j].plot([x - 0.49, x - 0.49],[y - 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y < y_bis): + return ax[i, j].plot([x - 0.49, x + 0.49],[y + 0.49, y + 0.49], color = "black", linewidth = 4, clip_on = False) + elif (y > y_bis): + return ax[i, j].plot([x - 0.49, x + 0.49],[y - 0.49, y - 0.49], color = "black", linewidth = 4, clip_on = False) + +def get_i_j(z): + i = 0 + j = 0 + if(z == 1): + j = 1 + elif(z == 2): + i = 1 + elif(z == 3): + i = 1 + j = 1 + elif(z > 3): + print("get_i_j pas défini au dela de Z = 4") + return i, j + +def get_i_j_cholesky(iterationk, figure_par_ligne): + i = int(iterationk) // int(figure_par_ligne) + j = int(iterationk) % int(figure_par_ligne) + return i, j + +N = int(sys.argv[1]) +ORDO = sys.argv[2] +NGPU = int(sys.argv[3]) +APPLI = sys.argv[4] +PATH = sys.argv[8] + +# Opening input files and initializing tabulars +file_coord = open(PATH + "/Data_coordinates_order_last_SCHEDULER.txt", "r") +file_data = open(PATH + "/Data_to_load_SCHEDULER.txt", "r") +file_coord_prefetch = open(PATH + "/Data_to_load_prefetch_SCHEDULER.txt", "r") +nb_tache_par_gpu = [0 for i in range(NGPU)] +order = [0 for i in range(NGPU)] +data_to_load = [[0] * N for i in range(N)] + +# Options you can switch to true or false depending on what you want to plot: + +lignes_dans_les_cases = True # If you don't want the lines in the tiles un-comment false +# ~ lignes_dans_les_cases = False + +# ~ lignes_sur_les_cote = True # If you don't want the lines of the side un-comment false +lignes_sur_les_cote = False + +# ~ numerotation_des_cases = True # If you don't want a numerotation in the tiles un-comment false +numerotation_des_cases = False + +# ~ numerotation_des_cases_partielles = True # If you don't a partial numerotation in the tiles un-comment false +numerotation_des_cases_partielles = False + +numerotation_axes_complete = True # If you don't want a numerotation of the axis un-comment false +# ~ numerotation_axes_complete = False + +# ~ z_dans_les_cases = True # If you don't want to show the line in the tiles for the z dimension in the case of GEMM un-comment false +z_dans_les_cases = False + + +if (ORDO == 
"HFP"): + sous_paquets_and_task_stealing = False # pour afficher ou non les sous paquets et le stealing avec HFP +else: + sous_paquets_and_task_stealing = False + +plt.tick_params(labelsize=50) # Pour la taille des chiffres sur les axes x et y des matrices + +if (APPLI == "Matrice_ligne" or APPLI == "Matrice3D" or APPLI == "MatriceZ4" or APPLI == "MatriceZN"): + + NDIMENSIONS = int(sys.argv[5]) + + epaisseur_lignes_sur_le_cote = [[1] * N for i in range(NGPU)] + epaisseur_colonnes_sur_le_cote = [[1] * N for i in range(NGPU)] + epaisseur_lignes_sur_le_cote_prefetch = [[1] * N for i in range(NGPU)] + epaisseur_colonnes_sur_le_cote_prefetch = [[1] * N for i in range(NGPU)] + + if (NDIMENSIONS == 1): + + size_numero_dans_les_cases = 19 + + ORDRE_GLOBAL = 1 + + # ~ ax = plt.gca(); + fig, ax = plt.subplots( nrows=1, ncols=1 ) # create figure & 1 axis + + # Grid + if (numerotation_axes_complete == True): + ax.set_xticks(np.arange(0, N, 1)) # numérotations des axes X et Y + ax.set_yticks(np.arange(0, N, 1)) + else: + ax.set_xticks(np.arange(0, N, 5)) # numérotations des axes X et Y + ax.set_yticks(np.arange(0, N, 5)) + ax.set_xticks(np.arange(0.5, N, 1), minor=True) + ax.set_yticks(np.arange(0.5, N, 1), minor=True) + ax.grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + + # Filling a matrix with 0 + m = np.zeros((N, N, 3)) + + for line in file_coord: + fields = line.split() + nb_tache_par_gpu[int(fields[2])] = nb_tache_par_gpu[int(fields[2])] + 1 + + file_coord.seek(0) + + if (ORDRE_GLOBAL == 1): + for i in range (0, NGPU): + nb_tache_par_gpu[i] = N*N + + # Coloring tiles in function of their numbering + for line in file_coord: + fields = line.split() + # X Y GPU + if (ORDRE_GLOBAL == 0): + index_ordre = int(fields[2]) + else: + index_ordre = 0 + m[int(fields[1]), int(fields[0]), :] = gradiant_color(int(fields[2]), order[index_ordre], nb_tache_par_gpu[int(fields[2])]) + + if (numerotation_des_cases == True): + ax.text(int(fields[0]), int(fields[1]), order[index_ordre], va="center", weight="bold", ha="center", color = "white", size = size_numero_dans_les_cases) + elif (numerotation_des_cases_partielles == True and order[index_ordre]%10 == 0): + ax.text(int(fields[0]), int(fields[1]), order[index_ordre], weight="bold", va="center", ha="center", color = "white", size = size_numero_dans_les_cases, zorder=10) + order[index_ordre] = order[index_ordre] + 1 + + if (lignes_dans_les_cases == True or lignes_sur_les_cote == True): + for line in file_data: + fields = line.split() + + if (int(fields[2]) != 0): + column_to_load(int(fields[0]), int(fields[1])) + + if (lignes_sur_les_cote == True): + lignes_sur_le_cote(int(fields[1]), "y", epaisseur_colonnes_sur_le_cote[int(fields[4])][int(fields[1])], int(fields[4]), "solid") + epaisseur_colonnes_sur_le_cote[int(fields[4])][int(fields[1])] += 4 + + data_to_load[int(fields[0])][int(fields[1])] = 1 + + if (int(fields[3]) != 0): + line_to_load(int(fields[0]), int(fields[1])) + + if (lignes_sur_les_cote == True): + lignes_sur_le_cote(int(fields[0]), "x", epaisseur_lignes_sur_le_cote[int(fields[4])][int(fields[0])], int(fields[4]), "solid") + epaisseur_lignes_sur_le_cote[int(fields[4])][int(fields[0])] += 4 + + if (data_to_load[int(fields[0])][int(fields[1])] != 0): + data_to_load[int(fields[0])][int(fields[1])] = 3 + else: + data_to_load[int(fields[0])][int(fields[1])] = 2 + + for line in file_coord_prefetch: + fields = line.split() + if (int(fields[2]) != 0 and data_to_load[int(fields[0])][int(fields[1])] != 1 and 
data_to_load[int(fields[0])][int(fields[1])] != 3): + column_to_load_prefetch(int(fields[0]), int(fields[1])) + + if (lignes_sur_les_cote == True): + if (epaisseur_colonnes_sur_le_cote[int(fields[4])][int(fields[1])] == 1): + lignes_sur_le_cote(int(fields[1]), "y", epaisseur_colonnes_sur_le_cote_prefetch[int(fields[4])][int(fields[1])], int(fields[4]), "dashed") + epaisseur_colonnes_sur_le_cote_prefetch[int(fields[4])][int(fields[1])] += 4 + else: + lignes_sur_le_cote(int(fields[1]), "y", epaisseur_colonnes_sur_le_cote[int(fields[4])][int(fields[1])], int(fields[4]), "solid") + epaisseur_colonnes_sur_le_cote[int(fields[4])][int(fields[1])] += 4 + + if (int(fields[3]) != 0 and data_to_load[int(fields[0])][int(fields[1])] != 2 and data_to_load[int(fields[0])][int(fields[1])] != 3): + line_to_load_prefetch(int(fields[0]), int(fields[1])) + + if (lignes_sur_les_cote == True): + if (epaisseur_lignes_sur_le_cote[int(fields[4])][int(fields[0])] == 1): + lignes_sur_le_cote(int(fields[0]), "x", epaisseur_lignes_sur_le_cote_prefetch[int(fields[4])][int(fields[0])], int(fields[4]), "dashed") + epaisseur_lignes_sur_le_cote_prefetch[int(fields[4])][int(fields[0])] += 4 + else: + lignes_sur_le_cote(int(fields[0]), "x", epaisseur_lignes_sur_le_cote[int(fields[4])][int(fields[0])], int(fields[4]), "solid") + epaisseur_lignes_sur_le_cote[int(fields[4])][int(fields[0])] += 4 + + if (sous_paquets_and_task_stealing == True): + # Load balance steal + file_load_balance = open(PATH + "/Data_stolen_load_balance.txt", "r") + + for line in file_load_balance: + fields = line.split() + tache_load_balanced(int(fields[0]), int(fields[1]), int(fields[2])) + + file_load_balance.close() + + file_last_package = open(PATH + "/last_package_split.txt", "r") + + hierarchie_paquets = [[0] * N for i in range(N)] + for line in file_last_package: + fields = line.split() + + hierarchie_paquets[int(fields[0])][int(fields[1])] = int(fields[3]) + + for i in range(N): + for j in range(N): + if (i != 0): + if (hierarchie_paquets[i][j] != hierarchie_paquets[i - 1][j]): + separation_sous_paquets(i, j, i - 1, j) + else: + if (hierarchie_paquets[i][j] != hierarchie_paquets[N - 1][j]): + separation_sous_paquets(i, j, N - 1, j) + if (i != N - 1): + if (hierarchie_paquets[i][j] != hierarchie_paquets[i + 1][j]): + separation_sous_paquets(i, j, i + 1, j) + else: + if (hierarchie_paquets[i][j] != hierarchie_paquets[0][j]): + separation_sous_paquets(i, j, 0, j) + if (j != 0): + if (hierarchie_paquets[i][j] != hierarchie_paquets[i][j - 1]): + separation_sous_paquets(i, j, i, j - 1) + else: + if (hierarchie_paquets[i][j] != hierarchie_paquets[i][N - 1]): + separation_sous_paquets(i, j, i, N - 1) + if (j != N - 1): + if (hierarchie_paquets[i][j] != hierarchie_paquets[i][j + 1]): + separation_sous_paquets(i, j, i, j + 1) + else: + if (hierarchie_paquets[i][j] != hierarchie_paquets[i][0]): + separation_sous_paquets(i, j, i, 0) + + file_last_package.close() + + plt.imshow(m) + # End of 2D matrix + + # Start of 3D matrix, in 3D in the files you have x y z so the GPU is always one fields later + else: + NROW = 2 + NCOL = 2 + + fig, ax = plt.subplots(nrows = NROW, ncols = NCOL) + + size_numero_dans_les_cases = 8 + + for i in range(NROW): + for j in range(NCOL): + if (numerotation_axes_complete == True): + ax[i, j].set_xticks(np.arange(0, N, 1)) # numérotations des axes X et Y + ax[i, j].set_yticks(np.arange(0, N, 1)) + else: + ax[i, j].set_xticks(np.arange(0, N, 5)) # numérotations des axes X et Y + ax[i, j].set_yticks(np.arange(0, N, 5)) + ax[i, 
j].set_xticks(np.arange(0.5, N, 1), minor=True) + ax[i, j].set_yticks(np.arange(0.5, N, 1), minor=True) + ax[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax[i, j].tick_params(labelsize=13) # Smaller for 3D because the figure is enlarged in FGCS + if (NDIMENSIONS == 4): + i_x_on_side = 2 + j_x_on_side = 0 + i_y_on_side = 2 + j_y_on_side = 1 + i_z_on_side = 2 + j_z_on_side = 2 + + # Filling a matrix with 0. m is for colors. + m = {} + already_fetched_x = {} + already_fetched_y = {} + already_fetched_z = {} + hierarchie_paquets = {} + epaisseur_x = {} + epaisseur_y = {} + epaisseur_z = {} + + for i in range(NROW): + for j in range(NCOL): + already_fetched_x[i, j] = np.zeros((N, N, 1)) + already_fetched_y[i, j] = np.zeros((N, N, 1)) + already_fetched_z[i, j] = np.zeros((N, N, 1)) + hierarchie_paquets[i, j] = np.zeros((N, N, 1)) + + for i in range(NROW): + for j in range(NCOL): + m[i, j] = np.ones((N, N, 3)) + + for line in file_coord: + fields = line.split() + nb_tache_par_gpu[int(fields[3])] = nb_tache_par_gpu[int(fields[3])] + 1 + + file_coord.seek(0) + for line in file_coord: + fields = line.split() + i, j = get_i_j(int(fields[2])) + print(i, j, int(fields[1]), int(fields[0])) + m[i, j][int(fields[1]), int(fields[0]), :] = gradiant_color(int(fields[3]), order[int(fields[3])], nb_tache_par_gpu[int(fields[3])]) + + index_ordre = int(fields[3]) + + if (numerotation_des_cases == True): + ax[i,j].text(int(fields[0]), int(fields[1]), order[index_ordre], va="center", weight="bold", ha="center", color = "white", size = size_numero_dans_les_cases) + elif (numerotation_des_cases_partielles == True and order[index_ordre]%10 == 0): + ax[i,j].text(int(fields[0]), int(fields[1]), order[index_ordre], va="center", weight="bold", ha="center", color = "white", size = size_numero_dans_les_cases, zorder=10) + + order[int(fields[3])] = order[int(fields[3])] + 1 + + for line in file_data: + fields = line.split() + i, j = get_i_j(int(fields[2])) + + if (int(fields[3]) != 0): + column_to_load_3D(int(fields[0]), int(fields[1]), i, j) + + if (lignes_sur_les_cote == True): + data_sur_le_cote_3D(int(fields[0]), int(fields[1]), "y", epaisseur_y[i_y_on_side, j_y_on_side][int(fields[2]) + (int(fields[6])*4), int(fields[1])], int(fields[6]), "solid", i_x_on_side, j_x_on_side, int(fields[2])) + epaisseur_y[i_y_on_side, j_y_on_side][int(fields[2]) + (int(fields[6])*4), int(fields[1])] += 4 + + already_fetched_x[i, j][int(fields[0]), int(fields[1])] = 1 + + if (int(fields[4]) != 0): + line_to_load_3D(int(fields[0]), int(fields[1]), i, j) + + if (lignes_sur_les_cote == True): + data_sur_le_cote_3D(int(fields[0]), int(fields[1]), "x", epaisseur_x[i_x_on_side, j_x_on_side][int(fields[0]), int(fields[2]) + (int(fields[6])*4)], int(fields[6]), "solid", i_y_on_side, j_y_on_side, int(fields[2])) + epaisseur_x[i_x_on_side, j_x_on_side][int(fields[0]), int(fields[2]) + (int(fields[6])*4)] += 4 + + already_fetched_y[i, j][int(fields[0]), int(fields[1])] = 1 + + # The "diagonal" (Z) + if (z_dans_les_cases == True): + if (int(fields[5]) != 0): + Z_to_load_3D(int(fields[0]), int(fields[1]), i, j) + + if (NGPU == 1 and NDIMENSIONS == 4): + data_sur_le_cote_3D_heat_map(int(fields[1]), int(fields[0]), "z", epaisseur_z[i_z_on_side, j_z_on_side][int(fields[0]), int(fields[1])], int(fields[6]), i_z_on_side, j_z_on_side, int(fields[2])) + epaisseur_z[i_z_on_side, j_z_on_side][int(fields[0]), int(fields[1])] += 1 + else: + print("The 3D heat map does not handle more than 1 GPU or ZN :/") + break + + already_fetched_z[i, 
j][int(fields[0]), int(fields[1])] = 1 + + for line in file_coord_prefetch: + fields = line.split() + i, j = get_i_j(int(fields[2])) + if (int(fields[3]) != 0 and already_fetched_x[i, j][int(fields[0]), int(fields[1])] == 0): + column_to_load_3D_prefetch(int(fields[0]), int(fields[1]), i, j) + + if (lignes_sur_les_cote == True): + data_sur_le_cote_3D(int(fields[0]), int(fields[1]), "y", 1, int(fields[6]), "dashed", i_x_on_side, j_x_on_side, int(fields[2])) + + if (int(fields[4]) != 0 and already_fetched_y[i, j][int(fields[0]), int(fields[1])] == 0): + line_to_load_3D_prefetch(int(fields[0]), int(fields[1]), i, j) + + if (lignes_sur_les_cote == True): + data_sur_le_cote_3D(int(fields[0]), int(fields[1]), "x", 1, int(fields[6]), "dashed", i_y_on_side, j_y_on_side, int(fields[2])) + + if (z_dans_les_cases == True): + if (int(fields[5]) != 0 and already_fetched_z[i, j][int(fields[0]), int(fields[1])] == 0): + Z_to_load_3D_prefetch(int(fields[0]), int(fields[1]), i, j) + + if (ORDO == "HFP"): + file_load_balance = open(PATH + "/Data_stolen_load_balance.txt", "r") + + for line in file_load_balance: + fields = line.split() + i, j = get_i_j(int(fields[2])) + tache_load_balanced_3D(int(fields[0]), int(fields[1]), int(fields[3]), i, j) + + file_load_balance.close() + + file_last_package = open(PATH + "/last_package_split.txt", "r") + + if (sous_paquets_and_task_stealing == True): + for line in file_last_package: + fields = line.split() + i, j = get_i_j(int(fields[2])) + hierarchie_paquets[i, j][int(fields[0]), int(fields[1])] = int(fields[4]) + if(NDIMENSIONS == 4): + for i_bis in range(2): + for j_bis in range(2): + for i in range(N): + for j in range(N): + if (i != 0): + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i - 1, j]): + separation_sous_paquets_3D(i, j, i - 1, j, i_bis, j_bis) + else: + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][N - 1, j]): + separation_sous_paquets_3D(i, j, N - 1, j, i_bis, j_bis) + if (i != N - 1): + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i + 1, j]): + separation_sous_paquets_3D(i, j, i + 1, j, i_bis, j_bis) + else: + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][0, j]): + separation_sous_paquets_3D(i, j, 0, j, i_bis, j_bis) + if (j != 0): + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i, j - 1]): + separation_sous_paquets_3D(i, j, i, j - 1, i_bis, j_bis) + else: + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i, N - 1]): + separation_sous_paquets_3D(i, j, i, N - 1, i_bis, j_bis) + if (j != N - 1): + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i, j + 1]): + separation_sous_paquets_3D(i, j, i, j + 1, i_bis, j_bis) + else: + if (hierarchie_paquets[i_bis, j_bis][i, j] != hierarchie_paquets[i_bis, j_bis][i, 0]): + separation_sous_paquets_3D(i, j, i, 0, i_bis, j_bis) + else: + print("hierarchie 3D not implemented yet for Z != 4") + file_last_package.close() + + for i in range(NROW): + for j in range(NCOL): + string = "K=" + str(j*NCOL+i) + if NGPU == 1: + ax[j, i].text(-0.29, -0.09, string, fontsize = 13, color="red", transform=ax[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + + + # Printing + for i in range(NROW): + for j in range(NCOL): + ax[i, j].imshow(m[i, j]) + # End of 3D matrix + +elif (APPLI == "Cholesky"): + + # ~ numerotation_des_cases = True + numerotation_des_cases = False + + # ~ 
numerotation_des_cases_partielles = True + numerotation_des_cases_partielles = False + + # ~ text_sous_les_figures = True + text_sous_les_figures = False + + # ~ lignes_dans_les_cases_et_sur_le_cote = True + lignes_dans_les_cases_et_sur_le_cote = False + + # ~ nb_load_dans_les_cases = True + nb_load_dans_les_cases = False + + memory_size_in_tiles = True + # ~ memory_size_in_tiles = False + + if (memory_size_in_tiles == True): + MEMOIRE = int(sys.argv[6]) + TILE_SIZE = int(sys.argv[7]) + + size_numero_dans_les_cases = 1.3 + + NCOL = math.ceil(math.sqrt(N)) + NROW = math.ceil(math.sqrt(N)) + + if NGPU > 8 or NGPU == 3 or NGPU == 5 or NGPU == 6 or NGPU == 7: + print(NGPU, "GPUs are not supported. Please use 1, 2, 4 or 8 GPUs") + sys.exit(1) + + if NGPU >= 1: + fig1, ax1 = plt.subplots(nrows = NROW, ncols = NCOL) + if NGPU >= 2: + fig2, ax2 = plt.subplots(nrows = NROW, ncols = NCOL) + if NGPU >= 4: + fig3, ax3 = plt.subplots(nrows = NROW, ncols = NCOL) + fig4, ax4 = plt.subplots(nrows = NROW, ncols = NCOL) + if NGPU >= 8: + fig5, ax5 = plt.subplots(nrows = NROW, ncols = NCOL) + fig6, ax6 = plt.subplots(nrows = NROW, ncols = NCOL) + fig7, ax7 = plt.subplots(nrows = NROW, ncols = NCOL) + fig8, ax8 = plt.subplots(nrows = NROW, ncols = NCOL) + + for i in range(NROW): + for j in range(NCOL): + if NGPU >= 1: + ax1[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax1[i, j].set_xticks([]) + ax1[i, j].set_yticks([]) + if NGPU >= 2: + ax2[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax2[i, j].set_xticks([]) + ax2[i, j].set_yticks([]) + if NGPU >= 4: + ax3[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax3[i, j].set_xticks([]) + ax3[i, j].set_yticks([]) + ax4[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax4[i, j].set_xticks([]) + ax4[i, j].set_yticks([]) + if NGPU >= 8: + ax5[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax5[i, j].set_xticks([]) + ax5[i, j].set_yticks([]) + ax6[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax6[i, j].set_xticks([]) + ax6[i, j].set_yticks([]) + ax7[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax7[i, j].set_xticks([]) + ax7[i, j].set_yticks([]) + ax8[i, j].grid(which = 'minor', color = 'black', linestyle = '-', linewidth = 1) + ax8[i, j].set_xticks([]) + ax8[i, j].set_yticks([]) + + row_to_suppr = 1 + for i in range(NCOL*NROW - N): + if (i%(NCOL) == 0 and i != 0): + row_to_suppr += 1 + if NGPU >= 1: + fig1.delaxes(ax1[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + if NGPU >= 2: + fig2.delaxes(ax2[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + if NGPU >= 4: + fig3.delaxes(ax3[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + fig4.delaxes(ax4[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + if NGPU >= 8: + fig5.delaxes(ax5[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + fig6.delaxes(ax6[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + fig7.delaxes(ax7[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + fig8.delaxes(ax8[NROW - row_to_suppr, NCOL - 1 - i%(NCOL)]) + + if NGPU >= 1: + m1 = {} + if NGPU >= 2: + m2 = {} + if NGPU >= 4: + m3 = {} + m4 = {} + if NGPU >= 8: + m5 = {} + m6 = {} + m7 = {} + m8 = {} + already_fetched_x = {} + already_fetched_y = {} + already_fetched_z = {} + + for i in range(NROW): + for j in range(NCOL): + if NGPU >= 1: + m1[i, j] = np.ones((N, N, 3)) + if NGPU >= 2: + m2[i, j] = np.ones((N, N, 3)) + if NGPU >= 4: + m3[i, j] = np.ones((N, N, 3)) + m4[i, j] = 
np.ones((N, N, 3)) + if NGPU >= 8: + m5[i, j] = np.ones((N, N, 3)) + m6[i, j] = np.ones((N, N, 3)) + m7[i, j] = np.ones((N, N, 3)) + m8[i, j] = np.ones((N, N, 3)) + + for i in range(NROW): + for j in range(NCOL): + already_fetched_x[i, j] = np.zeros((N, N, 1)) + already_fetched_y[i, j] = np.zeros((N, N, 1)) + already_fetched_z[i, j] = np.zeros((N, N, 1)) + + if (memory_size_in_tiles == True): + taille_1_tuile = TILE_SIZE*TILE_SIZE*4 # Because this is single precision. For LU in double precision it would be *8 + nb_tuile__qui_rentre_en_memoire = int((MEMOIRE*1000000)/taille_1_tuile) + x_to_fill = int(math.sqrt(nb_tuile__qui_rentre_en_memoire)) + y_to_fill = int(math.sqrt(nb_tuile__qui_rentre_en_memoire)) + remaining_tile_to_fill = nb_tuile__qui_rentre_en_memoire - (x_to_fill*y_to_fill) + + if x_to_fill < N: + x = N-1 + y = 0 + + for i in range(0, x_to_fill): + for j in range(0, y_to_fill): + if NGPU >= 1: + m1[0, 0][y+i, x-j, :] = (0, 0, 0) + if NGPU >= 2: + m2[0, 0][y+i, x-j, :] = (0, 0, 0) + if NGPU >= 4: + m3[0, 0][y+i, x-j, :] = (0, 0, 0) + m4[0, 0][y+i, x-j, :] = (0, 0, 0) + if NGPU >= 8: + m5[0, 0][y+i, x-j, :] = (0, 0, 0) + m6[0, 0][y+i, x-j, :] = (0, 0, 0) + m7[0, 0][y+i, x-j, :] = (0, 0, 0) + m8[0, 0][y+i, x-j, :] = (0, 0, 0) + for i in range(0, remaining_tile_to_fill): + if NGPU >= 1: + m1[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + if NGPU >= 2: + m2[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + if NGPU >= 4: + m3[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + m4[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + if NGPU >= 8: + m5[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + m6[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + m7[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + m8[0, 0][y, x-x_to_fill, :] = (0, 0, 0) + if i >= y_to_fill: + x_to_fill-=1 + else: + y+=1 + + next(file_coord) + for line in file_coord: + fields = line.split() + nb_tache_par_gpu[int(fields[3])] = nb_tache_par_gpu[int(fields[3])] + 1 + + file_coord.seek(0) + next(file_coord) + for line in file_coord: + fields = line.split() + i, j = get_i_j_cholesky(fields[4], NCOL) + + if int(fields[3]) == 0: + m1[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 1: + m2[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 2: + m3[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 3: + m4[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 4: + m5[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 5: + m6[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 6: + m7[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + elif int(fields[3]) == 7: + m8[i, j][int(fields[1]), int(fields[2]), :] = gradiant_multiple_color(order[int(fields[3])], nb_tache_par_gpu[int(fields[3])], NGPU, int(fields[3])) + + if (numerotation_des_cases == True): + ax[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], 
va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif (numerotation_des_cases_partielles == True and order[int(fields[3])]%20 == 0): + if int(fields[3]) == 0: + ax1[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 1: + ax2[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 2: + ax3[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 3: + ax4[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 4: + ax5[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 5: + ax6[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 6: + ax7[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif int(fields[3]) == 7: + ax8[i, j].text(int(fields[2]), int(fields[1]), order[int(fields[3])], va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + + + order[int(fields[3])] = order[int(fields[3])] + 1 + + if (lignes_dans_les_cases_et_sur_le_cote == True): + next(file_data) + for line in file_data: + fields = line.split() + + i, j = get_i_j_cholesky(fields[7], NCOL) + + if (int(fields[3]) == 1): + ax1[i, j].plot([int(fields[2]), int(fields[2])], [int(fields[1]) - 0.44, int(fields[1]) + 0.44], '#FFBB96', lw = 1.2, zorder = 5) + already_fetched_x[i, j][int(fields[2]), int(fields[1])] = 1 + if (int(fields[4]) == 1): + ax1[i, j].plot([int(fields[2]) - 0.44, int(fields[2]) + 0.44], [int(fields[1]), int(fields[1])], '#FFBB96', lw = 1.2, zorder = 5) + already_fetched_y[i, j][int(fields[2]), int(fields[1])] = 1 + if (int(fields[5]) == 1): + ax1[i, j].plot([int(fields[2]) - 0.44, int(fields[2]) + 0.44], [int(fields[1]) - 0.44, int(fields[1]) + 0.44], '#FFBB96', lw = 1.2, zorder = 5) + already_fetched_z[i, j][int(fields[2]), int(fields[1])] = 1 + + next(file_coord_prefetch) + for line in file_coord_prefetch: + fields = line.split() + + i, j = get_i_j_cholesky(fields[7], NCOL) + + if (int(fields[3]) == 1 and already_fetched_x[i, j][int(fields[2]), int(fields[1])] == 0): + ax1[i, j].plot([int(fields[2]), int(fields[2])], [int(fields[1]) - 0.44, int(fields[1]) + 0.44], '#FFBB96', lw = 1.2, zorder = 5, linestyle = "dotted") + if (int(fields[4]) == 1 and already_fetched_y[i, j][int(fields[2]), int(fields[1])] == 0): + ax1[i, j].plot([int(fields[2]) - 0.44, int(fields[2]) + 0.44], [int(fields[1]), int(fields[1])], '#FFBB96', lw = 1.2, zorder = 5, linestyle = "dotted") + if (int(fields[5]) == 1 and already_fetched_z[i, j][int(fields[2]), int(fields[1])] == 0): + ax1[i, j].plot([int(fields[2]) - 0.44, int(fields[2]) + 0.44], [int(fields[1]) - 0.44, int(fields[1]) + 0.44], '#FFBB96', lw = 1.2, zorder = 5, linestyle = "dotted") + + + if (nb_load_dans_les_cases == True): + next(file_data) + for line in file_data: + fields = line.split() + + i, j = get_i_j_cholesky(fields[7], NCOL) + + nb_of_fetch = 
int(fields[3]) + int(fields[4]) + int(fields[5]) + if (nb_of_fetch == 1): + ax1[i, j].text(int(fields[2]), int(fields[1]), 1, va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif (nb_of_fetch == 2): + ax1[i, j].text(int(fields[2]), int(fields[1]), 2, va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + elif (nb_of_fetch == 3): + ax1[i, j].text(int(fields[2]), int(fields[1]), 3, va="center", ha="center", color = "white", size = size_numero_dans_les_cases) + + if (int(fields[3]) == 1): + already_fetched_x[i, j][int(fields[2]), int(fields[1])] = 1 + if (int(fields[4]) == 1): + already_fetched_y[i, j][int(fields[2]), int(fields[1])] = 1 + if (int(fields[5]) == 1): + already_fetched_z[i, j][int(fields[2]), int(fields[1])] = 1 + + + next(file_coord_prefetch) + for line in file_coord_prefetch: + fields = line.split() + + i, j = get_i_j_cholesky(fields[7], NCOL) + + nb_of_prefetch = 0 + if (int(fields[3]) == 1 and already_fetched_x[i, j][int(fields[2]), int(fields[1])] == 0): + nb_of_prefetch += 1 + if (int(fields[4]) == 1 and already_fetched_y[i, j][int(fields[2]), int(fields[1])] == 0): + nb_of_prefetch += 1 + if (int(fields[5]) == 1 and already_fetched_z[i, j][int(fields[2]), int(fields[1])] == 0): + nb_of_prefetch += 1 + + # ~ if (nb_of_fetch == 0): + if (nb_of_prefetch == 1): + ax1[i, j].text(int(fields[2]), int(fields[1]), 1, va="center", ha="center", color = "black", size = size_numero_dans_les_cases) + elif (nb_of_prefetch == 2): + ax1[i, j].text(int(fields[2]), int(fields[1]), 2, va="center", ha="center", color = "black", size = size_numero_dans_les_cases) + elif (nb_of_prefetch == 3): + ax1[i, j].text(int(fields[2]), int(fields[1]), 3, va="center", ha="center", color = "black", size = size_numero_dans_les_cases) + + # Adding text under the figures + for i in range(NROW): + for j in range(NCOL): + string = "K=" + str(j*NCOL+i) + if NGPU >= 1: + ax1[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax1[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + if NGPU >= 2: + ax2[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax2[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + if NGPU >= 4: + ax3[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax3[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + ax4[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax4[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + if NGPU == 8: + ax5[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax5[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + ax6[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax6[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + ax7[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax7[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + ax8[j, i].text(-0.39, 0.1, string, fontsize = 7, color="red", transform=ax8[j, i].transAxes, bbox=dict(facecolor='none', edgecolor='red')) + + # Printing + for i in range(NROW): + for j in range(NCOL): + if NGPU >= 1: + ax1[i, j].imshow(m1[i, j]) + if NGPU >= 2: + ax2[i, j].imshow(m2[i, j]) + if NGPU >= 4: + ax3[i, j].imshow(m3[i, j]) + ax4[i, j].imshow(m4[i, j]) + if NGPU >= 8: + ax5[i, j].imshow(m5[i, j]) + ax6[i, j].imshow(m6[i, j]) + ax7[i, j].imshow(m7[i, j]) + ax8[i, j].imshow(m8[i, j]) + +else: + print("Application not supported. Please use gemm or cholesky") + sys.exit(1) + 
+# Closing open files +file_coord.close() +file_data.close() +file_coord_prefetch.close() + +if (APPLI == "Matrice3D" or APPLI == "MatriceZ4"): + image_format = 'svg' + image_name1 = ORDO + '_M3D_N' + str(N) + "." + image_format + fig.savefig(image_name1, format=image_format, dpi=1200) + +if (APPLI == "Matrice_ligne"): + image_format = 'svg' + image_name1 = ORDO + '_M2D_N' + str(N) + "." + image_format + fig.savefig(image_name1, format=image_format, dpi=1200) + + +if (APPLI == "Cholesky"): + image_format = 'svg' + if NGPU >= 1: + image_name1 = ORDO + '_CHO_N' + str(N) + '_GPU_1.' + image_format + fig1.savefig(image_name1, format=image_format, dpi=1200) + if NGPU >= 2: + image_name2 = ORDO + '_CHO_test_GPU_2.' + image_format + fig2.savefig(image_name2, format=image_format, dpi=1200) + if NGPU >= 4: + image_name3 = ORDO + '_CHO_test_GPU_3.' + image_format + fig3.savefig(image_name3, format=image_format, dpi=1200) + image_name4 = ORDO + '_CHO_test_GPU_4.' + image_format + fig4.savefig(image_name4, format=image_format, dpi=1200) + if NGPU >= 8: + image_name5 = ORDO + '_CHO_test_GPU_5.' + image_format + fig5.savefig(image_name5, format=image_format, dpi=1200) + image_name6 = ORDO + '_CHO_test_GPU_6.' + image_format + fig6.savefig(image_name6, format=image_format, dpi=1200) + image_name7 = ORDO + '_CHO_test_GPU_7.' + image_format + fig7.savefig(image_name7, format=image_format, dpi=1200) + image_name8 = ORDO + '_CHO_test_GPU_8.' + image_format + fig8.savefig(image_name8, format=image_format, dpi=1200) +else: + plt.show()