diff --git a/include/pando-lib-galois/utility/gptr_monad.hpp b/include/pando-lib-galois/utility/gptr_monad.hpp index 0d0c7872..3dd74717 100644 --- a/include/pando-lib-galois/utility/gptr_monad.hpp +++ b/include/pando-lib-galois/utility/gptr_monad.hpp @@ -4,6 +4,8 @@ #ifndef PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ #define PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ +#include + /** * @brief lifts a function with no arguments to work on references */ @@ -70,4 +72,45 @@ auto applyFunc(pando::GlobalRef ref, F func) { return func(obj); } +#if 1 +#define apply(ref, func, ...) \ + __extension__({ \ + auto ptrComputed##__LINE__ = &(ref); \ + typename std::pointer_traits::element_type tmp = \ + *ptrComputed##__LINE__; \ + auto ret = tmp.func(__VA_ARGS__); \ + ret; \ + }) + +#elif 0 + +/* F is a method pointer */ +template +auto apply(pando::GlobalRef ref, F func, Args... args) { + T obj = ref; + return (obj.*func)(args...); +} + +/* F is a method pointer */ +template +auto apply(T& ref, F func, Args... args) { + T obj = ref; + return (obj.*func)(args...); +} +#else + +template +R apply(pando::GlobalRef ref, R (T::*func)(As...), As... args) { + T obj = ref; + return (obj.*func)(args...); +} + +template +R apply(T& ref, R (T::*func)(As...), As... args) { + T obj = ref; + return (obj.*func)(args...); +} + +#endif + #endif // PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ diff --git a/microbench/triangle-counting/include/tc_algos.hpp b/microbench/triangle-counting/include/tc_algos.hpp index 53cc4fc6..470414a6 100644 --- a/microbench/triangle-counting/include/tc_algos.hpp +++ b/microbench/triangle-counting/include/tc_algos.hpp @@ -5,7 +5,7 @@ #include "utils.hpp" -template +template void tc_no_chunk(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count); @@ -15,11 +15,12 @@ void tc_chunk_edges(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count); */ -template +template void tc_chunk_vertices(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count); void HBMainTC(pando::Array filename, int64_t num_vertices, bool load_balanced_graph, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count); + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count); #endif // TRIANGLE_COUNTING_INCLUDE_TC_ALGOS_HPP_ diff --git a/microbench/triangle-counting/include/utils.hpp b/microbench/triangle-counting/include/utils.hpp index 71aaac71..c5ba63fe 100644 --- a/microbench/triangle-counting/include/utils.hpp +++ b/microbench/triangle-counting/include/utils.hpp @@ -42,6 +42,7 @@ struct CommandLineOptions { int64_t num_vertices = 0; bool load_balanced_graph = false; TC_CHUNK tc_chunk = TC_CHUNK::NO_CHUNK; + bool binary_search = false; void print() { printf("******** CommandLineOptions ******** \n"); @@ -49,11 +50,16 @@ struct CommandLineOptions { std::cout << "num_vertices = " << num_vertices << '\n'; std::cout << "load_balanced_graph = " << load_balanced_graph << '\n'; std::cout << "tc_chunk = " << tc_chunk << '\n'; + std::cout << "binary_search = " << binary_search << '\n'; printf("******** END CommandLineOptions ******** \n"); } CommandLineOptions() - : elFile(""), num_vertices(0), load_balanced_graph(false), tc_chunk(TC_CHUNK::NO_CHUNK) {} + : elFile(""), + num_vertices(0), + load_balanced_graph(false), + tc_chunk(TC_CHUNK::NO_CHUNK), + binary_search(false) {} }; std::unique_ptr read_cmd_line_args(int argc, char** argv); @@ -63,7 +69,7 @@ void printUsage(char* argv0); // CONNECTION KERNELS // ##################################################################### template -void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr graph_ptr, +void intersect_dag_merge(pando::GlobalPtr graph_ptr, typename GraphType::VertexTopologyID v0, typename GraphType::VertexTopologyID v1, galois::DAccumulator final_tri_count) { @@ -90,12 +96,10 @@ void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr -void intersect_dag_merge_double_binary(galois::WaitGroup::HandleType wgh, - pando::GlobalPtr graph_ptr, +void intersect_dag_merge_double_binary(pando::GlobalPtr graph_ptr, typename GraphType::VertexTopologyID v0, typename GraphType::VertexTopologyID v1, galois::DAccumulator final_tri_count) { @@ -144,7 +148,6 @@ void intersect_dag_merge_double_binary(galois::WaitGroup::HandleType wgh, } } final_tri_count.add(count); - wgh.done(); } // ##################################################################### @@ -212,7 +215,7 @@ void vertexset_intersection(pando::GlobalPtr graph_ptr, auto [graph_ptr, v1, final_tri_count, connection_kernel, v1_token] = state; GraphType g = *graph_ptr; (void)eh; // Required to prevent -Werror=unused-parameter - return fmap(g, getLocalityVertex, v1); + return apply(g, getLocalityVertex, v1); }; galois::doAll( @@ -220,9 +223,9 @@ void vertexset_intersection(pando::GlobalPtr graph_ptr, +[](decltype(state) state, typename GraphType::EdgeHandle eh) { auto [graph_ptr, v1, final_tri_count, connection_kernel, v1_token] = state; GraphType g = *graph_ptr; - typename GraphType::VertexTopologyID neighbor_of_v0 = fmap(g, getEdgeDst, eh); + typename GraphType::VertexTopologyID neighbor_of_v0 = apply(g, getEdgeDst, eh); typename GraphType::VertexTokenID neighbor_of_v0_token = - fmap(g, getTokenID, neighbor_of_v0); + apply(g, getTokenID, neighbor_of_v0); // Because of DAG optimization if (neighbor_of_v0_token <= v1_token) diff --git a/microbench/triangle-counting/src/tc.cpp b/microbench/triangle-counting/src/tc.cpp index da67d602..4cd01205 100644 --- a/microbench/triangle-counting/src/tc.cpp +++ b/microbench/triangle-counting/src/tc.cpp @@ -23,7 +23,7 @@ int pandoMain(int argc, char** argv) { PANDO_CHECK(final_tri_count.initialize()); HBMainTC(filename, opts->num_vertices, opts->load_balanced_graph, opts->tc_chunk, - final_tri_count); + opts->binary_search, final_tri_count); std::cout << "*** FINAL TRI COUNT = " << final_tri_count.reduce() << "\n"; #if BENCHMARK diff --git a/microbench/triangle-counting/src/tc_algos.cpp b/microbench/triangle-counting/src/tc_algos.cpp index 6c1d813c..7041b5a4 100644 --- a/microbench/triangle-counting/src/tc_algos.cpp +++ b/microbench/triangle-counting/src/tc_algos.cpp @@ -2,6 +2,7 @@ // Copyright (c) 2023. University of Texas at Austin. All rights reserved. #include +#include // ##################################################################### // TC UTILS @@ -13,13 +14,10 @@ * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ -template -void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexTopologyID v0, - typename Graph::EdgeRange edge_range, +template +void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr graph_ptr, + typename Graph::VertexTopologyID v0, typename Graph::EdgeRange edge_range, galois::DAccumulator final_tri_count) { - galois::WaitGroup wg; - PANDO_CHECK(wg.initialize(0)); - auto wgh = wg.getHandle(); auto innerState = galois::make_tpl(graph_ptr, v0, wgh, final_tri_count); Graph graph = *graph_ptr; galois::doAll( @@ -27,19 +25,20 @@ void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexT +[](decltype(innerState) innerState, typename Graph::EdgeHandle eh) { auto [graph_ptr, v0, wgh, final_tri_count] = innerState; Graph g = *graph_ptr; - typename Graph::VertexTopologyID v1 = fmap(g, getEdgeDst, eh); - wgh.addOne(); - intersect_dag_merge(wgh, graph_ptr, v0, v1, final_tri_count); + typename Graph::VertexTopologyID v1 = apply(g, getEdgeDst, eh); + if (binary_search) + intersect_dag_merge_double_binary(graph_ptr, v0, v1, final_tri_count); + else + intersect_dag_merge(graph_ptr, v0, v1, final_tri_count); }, [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place { auto v0 = std::get<1>(innerState); - typename Graph::VertexTopologyID v1 = fmap(graph, getEdgeDst, eh); - bool v0_higher_degree = fmap(graph, getNumEdges, v0) >= fmap(graph, getNumEdges, v1); - pando::Place locality = v0_higher_degree ? fmap(graph, getLocalityVertex, v0) - : fmap(graph, getLocalityVertex, v1); + typename Graph::VertexTopologyID v1 = apply(graph, getEdgeDst, eh); + bool v0_higher_degree = apply(graph, getNumEdges, v0) >= apply(graph, getNumEdges, v1); + pando::Place locality = v0_higher_degree ? apply(graph, getLocalityVertex, v0) + : apply(graph, getLocalityVertex, v1); return locality; }); - PANDO_CHECK(wg.wait()); } // ##################################################################### @@ -51,14 +50,20 @@ void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexT * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ -template +template void tc_no_chunk(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { GraphType graph = *graph_ptr; - auto state = galois::make_tpl(graph_ptr, final_tri_count); + + galois::WaitGroup wg; + PANDO_CHECK(wg.initialize(0)); + auto wgh = wg.getHandle(); + auto state = galois::make_tpl(graph_ptr, final_tri_count, wgh); + galois::doAll( - state, graph.vertices(), +[](decltype(state) state, typename GraphType::VertexTopologyID v0) { - auto [graph_ptr, final_tri_count] = state; + wgh, state, graph.vertices(), + +[](decltype(state) state, typename GraphType::VertexTopologyID v0) { + auto [graph_ptr, final_tri_count, wgh] = state; GraphType graph = *graph_ptr; // Degree Filtering Optimization @@ -66,8 +71,11 @@ void tc_no_chunk(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), + final_tri_count); }); + PANDO_CHECK(wg.wait()); + wg.deinitialize(); } /** @@ -131,8 +139,11 @@ void tc_chunk_edges(pando::GlobalPtr graph_ptr, * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ +template void tc_chunk_vertices(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { + using LCSR = galois::LCSR; + GraphDL graph = *graph_ptr; uint64_t query_sz = 1; uint64_t iters = 0; @@ -159,11 +170,14 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, auto lcsr = graph.getLocalCSR(); uint64_t host_vertex_iter_offset = host_vertex_iter_offset_ref; - auto inner_state = galois::make_tpl(graph_ptr, final_tri_count); + galois::WaitGroup wg; + PANDO_CHECK(wg.initialize(0)); + auto wgh = wg.getHandle(); + auto inner_state = galois::make_tpl(graph_ptr, final_tri_count, wgh); galois::doAll( - inner_state, fmap(lcsr, vertices, host_vertex_iter_offset, query_sz), + inner_state, apply(lcsr, vertices, host_vertex_iter_offset, query_sz), +[](decltype(inner_state) inner_state, typename GraphDL::VertexTopologyID v0) { - auto [graph_ptr, final_tri_count] = inner_state; + auto [graph_ptr, final_tri_count, wgh] = inner_state; GraphDL graph = *graph_ptr; // Degree Filtering Optimization @@ -171,15 +185,18 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), + final_tri_count); }); + PANDO_CHECK(wg.wait()); // Move iter offset - uint64_t lcsr_num_vertices = fmap(lcsr, size); + uint64_t lcsr_num_vertices = apply(lcsr, size); host_vertex_iter_offset += query_sz; if (host_vertex_iter_offset < lcsr_num_vertices) work_remaining.increment(); host_vertex_iter_offset_ref = host_vertex_iter_offset; + wg.deinitialize(); }); uint64_t current_count = final_tri_count.reduce(); @@ -198,7 +215,8 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, // TC GRAPH HBMAINS // ##################################################################### void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_vertices, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count) { + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count) { #if BENCHMARK auto time_graph_import_st = std::chrono::high_resolution_clock().now(); #endif @@ -227,7 +245,10 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ switch (tc_chunk) { case TC_CHUNK::CHUNK_VERTICES: - tc_chunk_vertices(graph_ptr, final_tri_count); + if (binary_search) + tc_chunk_vertices(graph_ptr, final_tri_count); + else + tc_chunk_vertices(graph_ptr, final_tri_count); break; /** case TC_CHUNK::CHUNK_EDGES: @@ -235,7 +256,10 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ break; */ default: - tc_no_chunk(graph_ptr, final_tri_count); + if (binary_search) + tc_no_chunk(graph_ptr, final_tri_count); + else + tc_no_chunk(graph_ptr, final_tri_count); break; } @@ -253,7 +277,7 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ } void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_vertices, - galois::DAccumulator final_tri_count) { + bool binary_search, galois::DAccumulator final_tri_count) { #if BENCHMARK auto time_graph_import_st = std::chrono::high_resolution_clock().now(); #endif @@ -280,7 +304,10 @@ void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_ auto time_tc_algo_st = std::chrono::high_resolution_clock().now(); #endif PANDO_MEM_STAT_NEW_KERNEL("TC_DFS_Algo Start"); - tc_no_chunk(graph_ptr, final_tri_count); + if (binary_search) + tc_no_chunk(graph_ptr, final_tri_count); + else + tc_no_chunk(graph_ptr, final_tri_count); #if BENCHMARK auto time_tc_algo_end = std::chrono::high_resolution_clock().now(); if (thisPlace.node.id == COORDINATOR_ID) @@ -295,11 +322,12 @@ void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_ } void HBMainTC(pando::Array filename, int64_t num_vertices, bool load_balanced_graph, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count) { + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count) { auto thisPlace = pando::getCurrentPlace(); if (load_balanced_graph) - HBGraphDL(thisPlace, filename, num_vertices, tc_chunk, final_tri_count); + HBGraphDL(thisPlace, filename, num_vertices, tc_chunk, binary_search, final_tri_count); else - HBGraphDA(thisPlace, filename, num_vertices, final_tri_count); + HBGraphDA(thisPlace, filename, num_vertices, binary_search, final_tri_count); } diff --git a/microbench/triangle-counting/src/utils.cpp b/microbench/triangle-counting/src/utils.cpp index 8dbb0a3e..a0bea367 100644 --- a/microbench/triangle-counting/src/utils.cpp +++ b/microbench/triangle-counting/src/utils.cpp @@ -13,7 +13,7 @@ std::unique_ptr read_cmd_line_args(int argc, char** argv) { int32_t flag = 0; int32_t num_vertices = 0; int32_t tc_chunk = NO_CHUNK; - while ((flag = getopt(argc, argv, "v:i:c:l")) != -1) { + while ((flag = getopt(argc, argv, "v:i:c:lb")) != -1) { switch (flag) { case 'v': sscanf(optarg, "%d", &num_vertices); @@ -25,6 +25,9 @@ std::unique_ptr read_cmd_line_args(int argc, char** argv) { case 'l': opts_ptr->load_balanced_graph = true; break; + case 'b': + opts_ptr->binary_search = true; + break; case 'c': sscanf(optarg, "%d", &tc_chunk); switch (tc_chunk) { @@ -65,6 +68,7 @@ void printUsage(char* argv0) { std::cerr << "Usage: " << argv0 << " -i filepath -v numVertices" << std::endl; std::cerr << "\n Can specify runtime algorithm with -c. Valid options: [0 (NO_CHUNK), 1 " "(CHUNK_EDGES), 2 (CHUNK_VERTICES)]\n"; + std::cerr << "Can use double binary search counting with -b. Defaults to linear search\n"; } void printUsageExit(char* argv0) { diff --git a/microbench/triangle-counting/test/test_tc.cpp b/microbench/triangle-counting/test/test_tc.cpp index 2b588538..9a7b23e8 100644 --- a/microbench/triangle-counting/test/test_tc.cpp +++ b/microbench/triangle-counting/test/test_tc.cpp @@ -13,10 +13,10 @@ uint64_t get_expected_TC(const std::string okFile) { } void e2e_tc_test(uint64_t expected_tc, pando::Array filename, uint64_t num_vertices, - bool load_balanced_graph, TC_CHUNK tc_chunk) { + bool load_balanced_graph, bool binary_search, TC_CHUNK tc_chunk) { galois::DAccumulator final_tri_count; EXPECT_EQ(final_tri_count.initialize(), pando::Status::Success); - HBMainTC(filename, num_vertices, load_balanced_graph, tc_chunk, final_tri_count); + HBMainTC(filename, num_vertices, load_balanced_graph, tc_chunk, binary_search, final_tri_count); EXPECT_EQ(final_tri_count.reduce(), expected_tc); final_tri_count.deinitialize(); } @@ -34,7 +34,7 @@ TEST_P(TriangleCountChunking, BasicDL) { for (uint64_t i = 0; i < elFile.size(); i++) filename[i] = elFile[i]; - e2e_tc_test(expected_tc, filename, num_vertices, true, tc_chunk); + e2e_tc_test(expected_tc, filename, num_vertices, true, false, tc_chunk); filename.deinitialize(); } @@ -61,7 +61,7 @@ TEST_P(TriangleCountDACSR, BasicDA) { for (uint64_t i = 0; i < elFile.size(); i++) filename[i] = elFile[i]; - e2e_tc_test(expected_tc, filename, num_vertices, false, TC_CHUNK::NO_CHUNK); + e2e_tc_test(expected_tc, filename, num_vertices, false, false, TC_CHUNK::NO_CHUNK); filename.deinitialize(); } diff --git a/pando-drv/tests/drv_pandohammer.py b/pando-drv/tests/drv_pandohammer.py index d2f54c80..27a5c205 100644 --- a/pando-drv/tests/drv_pandohammer.py +++ b/pando-drv/tests/drv_pandohammer.py @@ -39,8 +39,8 @@ def MakePANDOHammer(make_tile): "num_ports" : PODS * (CORES + POD_L2_BANKS) + (1 if arguments.with_command_processor else 0) + PXN_MAINMEM_BANKS + PXNS - 1, # If number of PXNS is equal to 1 we do not need additional port. Hence -1 "topology" : "merlin.singlerouter", # performance models - "xbar_bw" : "256GB/s", - "link_bw" : "256GB/s", + "xbar_bw" : f"{PODS *(CORES + POD_L2_BANKS)}GB/s", + "link_bw" : f"{PODS *(CORES + POD_L2_BANKS)}GB/s", "flit_size" : "8B", "input_buf_size" : arguments.network_onchip_buffer_size, "output_buf_size" : arguments.network_onchip_buffer_size, diff --git a/pando-rt/src/init.cpp b/pando-rt/src/init.cpp index f151fe28..ecfd1001 100644 --- a/pando-rt/src/init.cpp +++ b/pando-rt/src/init.cpp @@ -198,29 +198,35 @@ int main(int argc, char* argv[]) { rc = getrusage(RUSAGE_SELF, &end); if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");} auto thisPlace = pando::getCurrentPlace(); - SPDLOG_WARN("Total time on node: {}, was {}ns", - thisPlace.node.id, - end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - - (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + - end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - - (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); - for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) { - SPDLOG_WARN("Idle time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - idleCount.get(i)); - SPDLOG_WARN("Pointer time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - pointerCount.get(i)); - SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - schedulerCount.get(i)); - SPDLOG_WARN("DoAll time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - doAllCount.get(i)); + for(std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) { + if (j == thisPlace.node.id) { + SPDLOG_WARN("Total time on node: {}, was {}ns", + thisPlace.node.id, + end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - + (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + + end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - + (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); + for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) { + SPDLOG_WARN("Idle time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + idleCount.get(i)); + SPDLOG_WARN("Pointer time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + pointerCount.get(i)); + SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + schedulerCount.get(i)); + SPDLOG_WARN("DoAll time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + doAllCount.get(i)); + } + } + + pando::Nodes::barrier(); } @@ -258,17 +264,27 @@ extern "C" __attribute__((visibility("default"))) int __drv_api_main(int argc, c if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");} auto thisPlace = pando::getCurrentPlace(); auto dims = pando::getPlaceDims(); - SPDLOG_WARN("Total time on node: {}, was {}ns", - thisPlace.node.id, - end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - - (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + - end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - - (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); - for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 2); i++) { - SPDLOG_WARN("Idle time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x + 1)) ? -1 : i), - idleCount.get(i)); + + + if (pando::isOnCP()) { + for (std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) { + if (j == thisPlace.node.id) { + SPDLOG_WARN("Total time on node: {}, was {}ns", + thisPlace.node.id, + end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - + (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + + end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - + (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); + for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) { + SPDLOG_WARN("Idle time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + idleCount.get(i)); + } + } + + pando::CommandProcessor::barrier(); + } } return ret;