Merge pull request #70 from intel/develop

Develop
intel · Dec 23, 2017 · d412e53 · d412e53
2 parents 27a01a0 + b9e9696
commit d412e53
Show file tree

Hide file tree

Showing 12 changed files with 458 additions and 191 deletions.
diff --git a/bin/gen_loops.pl b/bin/gen_loops.pl
@@ -695,15 +695,15 @@ ($)
             push @loopPrefix,
                 " // Distribute iterations among OpenMP threads.", 
                 "#pragma OMP_PRAGMA_PREFIX $priv OMP_PRAGMA_SUFFIX";
-            warn "info: using OpenMP on following loop.\n";
+            print "info: using OpenMP on following loop.\n";
         }
 
         # generate simd in next loop.
         elsif (lc $tok eq 'simd') {
 
             push @loopPrefix, '_Pragma("simd")';
             $features |= $bSimd;
-            warn "info: generating SIMD in following loop.\n";
+            print "info: generating SIMD in following loop.\n";
         }
 
         # use grouped path in next loop if possible.
@@ -747,7 +747,7 @@ ($)
             }
 
             # print more info.
-            warn "info: generating scan over ".dimStr(@loopDims)."...\n";
+            print "info: generating scan over ".dimStr(@loopDims)."...\n";
 
             # add initial code for index vars, but don't start loop body yet.
             addIndexVars1(\@code, \@loopDims, $features);
@@ -881,7 +881,7 @@ ($)
     if (!defined which($indent)) {
         $indent = 'gindent';
         if (!defined which($indent)) {
-            warn "note: cannot find [g]indent utility--output will be unformatted.\n";
+            print "note: cannot find [g]indent utility--output will be unformatted.\n";
             undef $indent;
         }
     }
@@ -975,7 +975,7 @@ ()
     }
 
     @dims = 0 .. ($OPT{ndims} - 1);
-    warn "info: generating scanning code for ".scalar(@dims)."-D grids...\n";
+    print "info: generating scanning code for ".scalar(@dims)."-D grids...\n";
     $inputVar = $OPT{inVar};
 
     my $codeString = join(' ', @ARGV); # just concat all non-options params together.

diff --git a/docs/api/mainpage.txt b/docs/api/mainpage.txt
@@ -19,7 +19,7 @@ There are two sets of APIs provided by YASK corresponding to these tasks:
 -# The YASK Stencil Compiler API (available in C++ and Python).
 -# The YASK Stencil Kernel API (available in C++ and Python).
 
-For each of the tasks, you can either use use the YASK-provided application
+For each of the tasks, you can either use the YASK-provided application
 or create your own application built with the corresponding API.
 
 These alternatives may be mixed-or-matched in all combinations.
@@ -59,7 +59,7 @@ A new stencil solution may be defined in one of the following ways:
   - See \ref yc for documentation on the compiler API.
 
 In either case, the resulting generated code should be written to the C++ stencil-code file,
-src/kernel/gen/yask_stencil_code.hpp`.
+`src/kernel/gen/yask_stencil_code.hpp`.
 
 @subsection yk_intro Create a Stencil Kernel Library and Stencil-based Application
 
@@ -140,9 +140,10 @@ this distinguishes them from the 'yk_'-prefixed types used in the "YASK kernel"
   - To complete each equation, use yc_node_factory::new_equation_node() to specify an expression
     on the right-hand side (RHS) and the grid point that is defined to be equal
     to it on the left-hand side (LHS).
-- Specify the solution step dimension via yc_solution::set_step_dim().
-  (This is usually "t" for time.)
-- Specify the number of bytes in a floating-point element via yc_solution::set_elem_bytes().
+  - Specify the solution domain dimension via yc_node_factory::new_domain_index().
+  - Specify the solution step dimension via yc_node_factory::new_step_index().
+    (This is usually "t" for time.)
+- Specify the number of bytes in a floating-point element via yc_solution::set_element_bytes().
   This should be 4 or 8.
 - Optionally specify the vector-folding and/or vector-clustering via 
   yc_solution::set_fold_len() and/or yc_solution::set_cluster_mult().
@@ -171,9 +172,9 @@ this distinguishes them from the 'yc_'-prefixed types used in the "YASK compiler
   Be sure to modify any settings before calling yk_solution::prepare_solution().
 - Access the solution grids via yk_solution::get_grid().
   Various properties of each grid may be retrieved or set.
-  In particular, you should initialize the data in each grid via yk_grid::set_all_elements()
+  In particular, you should initialize the data in each grid via yk_grid::set_all_elements_same()
   and yk_grid::set_element().
-- Apply the stencil(s) to the grids via yk_solution::apply_solution().
+- Apply the stencil(s) to the grids via yk_solution::run_solution().
   There are versions for advancing one or more steps.
 - Retrieve the final results via yk_grid::get_element().
 */

diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp
@@ -41,7 +41,7 @@ namespace yask {
     // for numbers above 9 (at least up to 99).
 
     // Format: "major.minor.patch".
-    const string version = "2.01.04";
+    const string version = "2.01.05";
 
     string yask_get_version_string() {
         return version;

diff --git a/src/compiler/lib/Cpp.cpp b/src/compiler/lib/Cpp.cpp
@@ -392,7 +392,10 @@ namespace yask {
 
                 // Output write using base addr.
                 printPointComment(os, gp, "Write aligned");
-                os << _linePrefix << *p << "[" << ofs << "] = " << val << _lineSuffix;
+
+                os << _linePrefix << val << ".storeTo_masked(" << *p << "+" << ofs << ", write_mask)" << _lineSuffix;
+                // without mask: os << _linePrefix << *p << "[" << ofs << "] = " << val << _lineSuffix;
+
                 return "";
             }
         }
@@ -434,7 +437,8 @@ namespace yask {
     string CppVecPrintHelper::printAlignedVecWrite(ostream& os, const GridPoint& gp,
                                                    const string& val) {
         printPointComment(os, gp, "Write aligned");
-        auto vn = printVecPointCall(os, gp, "writeVecNorm", val, "__LINE__", true);
+        auto vn = printVecPointCall(os, gp, "writeVecNorm_masked", val, "write_mask, __LINE__", true);
+        // without mask: auto vn = printVecPointCall(os, gp, "writeVecNorm", val, "__LINE__", true);
 
         // Write temp var to memory.
         os << vn;

diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp
@@ -499,104 +499,114 @@ namespace yask {
                 delete sp;
             }
 
-            // Cluster/Vector code.
-            {
+            // Vector/Cluster code.
+            for (int do_cluster = 0; do_cluster <= 1; do_cluster++) {
+
                 // Cluster eqGroup at same 'ei' index.
                 // This should be the same eq-group because it was copied from the
                 // scalar one.
-                auto& ceq = _clusterEqGroups.at(ei);
-                assert(egDesc == ceq.getDescription());
+                auto& vceq = do_cluster ? _clusterEqGroups.at(ei) : _eqGroups.at(ei);
+                assert(egDesc == vceq.getDescription());
 
                 // Create vector info for this eqGroup.
                 // The visitor is accepted at all nodes in the cluster AST;
                 // for each grid access node in the AST, the vectors
                 // needed are determined and saved in the visitor.
                 VecInfoVisitor vv(*_dims);
-                ceq.visitEqs(&vv);
+                vceq.visitEqs(&vv);
 
                 // Reorder some equations based on vector info.
                 ExprReorderVisitor erv(vv);
-                ceq.visitEqs(&erv);
+                vceq.visitEqs(&erv);
 
                 // Collect stats.
                 CounterVisitor cv;
-                ceq.visitEqs(&cv);
-                int numResults = _dims->_clusterPts.product();
-
+                vceq.visitEqs(&cv);
+                int numResults = do_cluster ? _dims->_clusterPts.product() : _dims->_fold.product();
+
+                // Vector/cluster vars.
+                string idim = _dims->_innerDim;
+                string vcstr = do_cluster ? "cluster" : "vector";
+                string funcstr = "calc_loop_of_" + vcstr + "s";
+                string nvecs = do_cluster ? "CMULT_" + allCaps(idim) : "1";
+                string nelems = (do_cluster ? nvecs + " * ": "") + "VLEN_" + allCaps(idim);
+
                 // Loop-calculation code.
-                {
-
-                    // Function header.
-                    string idim = _dims->_innerDim;
-                    string istart = "start_" + idim;
-                    string istop = "stop_" + idim;
-                    string istep = "step_" + idim;
-                    string iestep = "step_" + idim + "_elem";
-                    os << endl << " // Calculate a series of clusters iterating in +'" << idim <<
-                        "' direction from " << _dims->_stencilDims.makeDimStr() <<
-                        " indices in 'idxs' to '" << istop << "'.\n" <<
-                        " // Each cluster calculates '" << _dims->_clusterPts.makeDimValStr(" * ") <<
-                        "' points containing " << _dims->_clusterMults.product() << " '" <<
-                        _dims->_fold.makeDimValStr(" * ") << "' vector(s).\n"
-                        " // Indices must be rank-relative (not global).\n"
-                        " // Indices must be normalized, i.e., already divided by VLEN_*.\n"
-                        " // SIMD calculations use " << vv.getNumPoints() <<
-                        " vector block(s) created from " << vv.getNumAlignedVecs() <<
-                        " aligned vector-block(s).\n"
-                        " // There are approximately " << (stats.getNumOps() * numResults) <<
-                        " FP operation(s) per loop iteration.\n"
-                        " void calc_loop_of_clusters(const Indices& idxs, idx_t " <<
-                        istop << ") {\n";
-                    printIndices(os);
-                    os << " idx_t " << istart << " = " << idim << ";\n";
-                    os << " idx_t " << istep << " = CMULT_" <<
-                        allCaps(idim) << "; // number of vectors.\n";
-                    os << " idx_t " << iestep << " = " <<
-                        istep << " * VLEN_" << allCaps(idim) << "; // number of elements.\n";
-
-                    // C++ vector print assistant.
-                    CppVecPrintHelper* vp = newCppVecPrintHelper(vv, cv);
-                    vp->printElemIndices(os);
-
-                    // Start forced-inline code.
-                    os << "\n // Force inlining if possible.\n"
-                        "#if !defined(DEBUG) && defined(__INTEL_COMPILER)\n"
-                        "#pragma forceinline recursive\n"
-                        "#endif\n"
-                        " {\n";
+                // Function header.
+                string istart = "start_" + idim;
+                string istop = "stop_" + idim;
+                string istep = "step_" + idim;
+                string iestep = "step_" + idim + "_elem";
+                os << endl << " // Calculate a series of " << vcstr << "s iterating in +'" << idim <<
+                    "' direction from " << _dims->_stencilDims.makeDimStr() <<
+                    " indices in 'idxs' to '" << istop << "'.\n";
+                if (do_cluster)
+                    os << " // Each cluster calculates '" << _dims->_clusterPts.makeDimValStr(" * ") <<
+                        "' point(s) containing " << _dims->_clusterMults.product() << " '" <<
+                        _dims->_fold.makeDimValStr(" * ") << "' vector(s).\n";
+                else
+                    os << " // Each vector calculates '" << _dims->_fold.makeDimValStr(" * ") <<
+                        "' point(s).\n";
+                os << " // Indices must be rank-relative (not global).\n"
+                    " // Indices must be normalized, i.e., already divided by VLEN_*.\n"
+                    " // SIMD calculations use " << vv.getNumPoints() <<
+                    " vector block(s) created from " << vv.getNumAlignedVecs() <<
+                    " aligned vector-block(s).\n"
+                    " // There are approximately " << (stats.getNumOps() * numResults) <<
+                    " FP operation(s) per iteration.\n" <<
+                    " void " << funcstr << "(const Indices& idxs, idx_t " << istop;
+                if (!do_cluster)
+                    os << ", idx_t write_mask";
+                os << ") {\n";
+                printIndices(os);
+                os << " idx_t " << istart << " = " << idim << ";\n";
+                os << " idx_t " << istep << " = " << nvecs << "; // number of vectors per iter.\n";
+                os << " idx_t " << iestep << " = " << nelems << "; // number of elements per iter.\n";
+                if (do_cluster)
+                    os << " idx_t write_mask = idx_t(-1); // no masking for clusters.\n";
+
+                // C++ vector print assistant.
+                CppVecPrintHelper* vp = newCppVecPrintHelper(vv, cv);
+                vp->printElemIndices(os);
+
+                // Start forced-inline code.
+                os << "\n // Force inlining if possible.\n"
+                    "#if !defined(DEBUG) && defined(__INTEL_COMPILER)\n"
+                    "#pragma forceinline recursive\n"
+                    "#endif\n"
+                    " {\n";
 
-                    // Print loop-invariants.
-                    CppLoopVarPrintVisitor lvv(os, *vp, _settings);
-                    ceq.visitEqs(&lvv);
+                // Print loop-invariants.
+                CppLoopVarPrintVisitor lvv(os, *vp, _settings);
+                vceq.visitEqs(&lvv);
 
-                    // Print pointers and prefetches.
-                    vp->printBasePtrs(os);
+                // Print pointers and prefetches.
+                vp->printBasePtrs(os);
 
-                    // Actual Loop.
-                    os << "\n // Inner loop.\n"
-                        " for (idx_t " << idim << " = " << istart << "; " <<
-                        idim << " < " << istop << "; " <<
-                        idim << " += " << istep << ", " <<
-                        vp->getElemIndex(idim) << " += " << iestep << ") {\n";
+                // Actual Loop.
+                os << "\n // Inner loop.\n"
+                    " for (idx_t " << idim << " = " << istart << "; " <<
+                    idim << " < " << istop << "; " <<
+                    idim << " += " << istep << ", " <<
+                    vp->getElemIndex(idim) << " += " << iestep << ") {\n";
 
-                    // Generate loop body using vars stored in print helper.
-                    // Visit all expressions to cover the whole cluster.
-                    PrintVisitorBottomUp pcv(os, *vp, _settings);
-                    ceq.visitEqs(&pcv);
+                // Generate loop body using vars stored in print helper.
+                // Visit all expressions to cover the whole vector/cluster.
+                PrintVisitorBottomUp pcv(os, *vp, _settings);
+                vceq.visitEqs(&pcv);
 
-                    // Insert prefetches using vars stored in print helper for next iteration.
-                    vp->printPrefetches(os, true);
+                // Insert prefetches using vars stored in print helper for next iteration.
+                vp->printPrefetches(os, true);
 
-                    // End of loop.
-                    os << " } // '" << idim << "' loop.\n";
+                // End of loop.
+                os << " } // '" << idim << "' loop.\n";
 
-                    // End forced-inline code.
-                    os << " } // Forced-inline block.\n";
+                // End forced-inline code.
+                os << " } // Forced-inline block.\n";
 
-                    // End of function.
-                    os << "} // calc_loop_of_clusters.\n";
-                    delete vp;
-                }
+                // End of function.
+                os << "} // " << funcstr << ".\n";
+                delete vp;
             }
 
             os << "}; // " << egsName << ".\n"; // end of class.

diff --git a/src/kernel/Makefile b/src/kernel/Makefile
@@ -292,7 +292,7 @@ BLOCK_LOOP_ORDER	?=	1 .. N-1
 BLOCK_LOOP_CODE		?=	$(BLOCK_LOOP_OUTER_MODS) loop($(BLOCK_LOOP_ORDER)) { \
 				$(BLOCK_LOOP_INNER_MODS) call(calc_sub_block); }
 
-# Sub-block loops break up a sub-block into vector clusters.  These loops
+# Sub-block loops break up a sub-block into clusters or vectors.  These loops
 # are run by a single OMP thread.  The N-1 (inner) loop is generated by the
 # stencil compiler.  There is no time loop because threaded temporal
 # blocking is not yet supported.  The indexes in this loop are 'normalized',
@@ -301,7 +301,7 @@ SUB_BLOCK_LOOP_OPTS		?=     	$(NDIMS_OPT) -inVar norm_sub_block_idxs
 SUB_BLOCK_LOOP_OUTER_MODS	?=	square_wave serpentine
 SUB_BLOCK_LOOP_ORDER		?=	1 .. N-2
 SUB_BLOCK_LOOP_CODE		?=	$(SUB_BLOCK_LOOP_OUTER_MODS) loop($(SUB_BLOCK_LOOP_ORDER)) { \
-					$(SUB_BLOCK_LOOP_INNER_MODS) call(calc_loop_of_clusters); }
+					$(SUB_BLOCK_LOOP_INNER_MODS) call(calc_inner_loop); }
 
 # General-purpose parallel loop.
 # Nested OpenMP is not used here because there is no sharing between threads.
@@ -507,6 +507,7 @@ endif # compiler.
 
 # Compile with model_cache=1 or 2 to check prefetching.
 # Turn off OpenMP when modeling a cache.
+# This is currently not operative! TODO: re-enable cache model.
 ifeq ($(model_cache),1)
  MACROS       	+=      MODEL_CACHE=1
  OMPFLAGS	:=	-qopenmp-stubs
@@ -819,7 +820,7 @@ help:
 	@echo " "
 	@echo "Example debug builds of kernel cmd-line tool:"
 	@echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG'"
-	@echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG' model_cache=2"
+	@echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG'"
 	@echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' EXTRA_MACROS='DEBUG TRACE TRACE_MEM TRACE_INTRINSICS'"
 	@echo " "
 	@echo "Example builds with test runs:"

diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp
@@ -111,7 +111,7 @@ namespace yask {
 
     ///// Top-level methods for evaluating reference and optimized stencils.
 
-    // Eval stencil group(s) over grid(s) using scalar code.
+    // Eval stencil group(s) over grid(s) using reference scalar code.
     void StencilContext::calc_rank_ref()
     {
         run_time.start();
@@ -1836,9 +1836,9 @@ namespace yask {
             auto& dname = dim.getName();
             if (bb_len[dname] % dims->_cluster_pts[dname] != 0) {
                 if (bb_is_full && bb_is_aligned)
-                    os << "Warning: '" << name << "' domain"
+                    os << "Note: '" << name << "' domain"
                         " has one or more sizes that are not vector-cluster multiples;"
-                        " slower scalar calculations will be used in remainder sub-blocks.\n";
+                        " masked calculations will be used in remainder sub-blocks.\n";
                 bb_is_cluster_mult = false;
                 break;
             }