From 2f82b1feabad1d95678997802dbd1289c05c34aa Mon Sep 17 00:00:00 2001 From: AdityaAtulTewari Date: Mon, 29 Jul 2024 18:18:21 +0000 Subject: [PATCH 1/6] Update chiprtr so throughput scales correctly --- pando-drv/tests/drv_pandohammer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pando-drv/tests/drv_pandohammer.py b/pando-drv/tests/drv_pandohammer.py index d2f54c80..27a5c205 100644 --- a/pando-drv/tests/drv_pandohammer.py +++ b/pando-drv/tests/drv_pandohammer.py @@ -39,8 +39,8 @@ def MakePANDOHammer(make_tile): "num_ports" : PODS * (CORES + POD_L2_BANKS) + (1 if arguments.with_command_processor else 0) + PXN_MAINMEM_BANKS + PXNS - 1, # If number of PXNS is equal to 1 we do not need additional port. Hence -1 "topology" : "merlin.singlerouter", # performance models - "xbar_bw" : "256GB/s", - "link_bw" : "256GB/s", + "xbar_bw" : f"{PODS *(CORES + POD_L2_BANKS)}GB/s", + "link_bw" : f"{PODS *(CORES + POD_L2_BANKS)}GB/s", "flit_size" : "8B", "input_buf_size" : arguments.network_onchip_buffer_size, "output_buf_size" : arguments.network_onchip_buffer_size, From b8ac9f192015142729fa134ec640570410b6a2a1 Mon Sep 17 00:00:00 2001 From: Brenden Elgarten Date: Tue, 30 Jul 2024 15:27:53 +0000 Subject: [PATCH 2/6] Fix hang in TC microbenchmark Added a shared WaitGroup between vertex doAll and the nested, per-vertex, edge doAll. Original code would hang because of the separate wait groups: after enqueuing a doAll in edge_tc_counting, harts wait for it to complete (tc_algos.cpp:42). However, this occurs on every hart because of the outer doAll in tc_no_chunk; therefore every hart is waiting and none is available to complete the work being waited on. When using one combined wait group, the outer doAll tasks are able to complete after enqueuing, but before completion of, the inner doAll tasks. Thus, harts are freed to complete the inner doAll and therefore make forward progress. 
--- .../triangle-counting/include/utils.hpp | 3 +- microbench/triangle-counting/src/tc_algos.cpp | 38 +++++++++++-------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/microbench/triangle-counting/include/utils.hpp b/microbench/triangle-counting/include/utils.hpp index 71aaac71..d2410d0c 100644 --- a/microbench/triangle-counting/include/utils.hpp +++ b/microbench/triangle-counting/include/utils.hpp @@ -63,7 +63,7 @@ void printUsage(char* argv0); // CONNECTION KERNELS // ##################################################################### template -void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr graph_ptr, +void intersect_dag_merge(pando::GlobalPtr graph_ptr, typename GraphType::VertexTopologyID v0, typename GraphType::VertexTopologyID v1, galois::DAccumulator final_tri_count) { @@ -90,7 +90,6 @@ void intersect_dag_merge(galois::WaitGroup::HandleType wgh, pando::GlobalPtr diff --git a/microbench/triangle-counting/src/tc_algos.cpp b/microbench/triangle-counting/src/tc_algos.cpp index 6c1d813c..b9fcbeaf 100644 --- a/microbench/triangle-counting/src/tc_algos.cpp +++ b/microbench/triangle-counting/src/tc_algos.cpp @@ -14,12 +14,9 @@ * @param[in] final_tri_count Thread-safe counter */ template -void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexTopologyID v0, - typename Graph::EdgeRange edge_range, +void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr graph_ptr, + typename Graph::VertexTopologyID v0, typename Graph::EdgeRange edge_range, galois::DAccumulator final_tri_count) { - galois::WaitGroup wg; - PANDO_CHECK(wg.initialize(0)); - auto wgh = wg.getHandle(); auto innerState = galois::make_tpl(graph_ptr, v0, wgh, final_tri_count); Graph graph = *graph_ptr; galois::doAll( @@ -28,8 +25,7 @@ void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexT auto [graph_ptr, v0, wgh, final_tri_count] = innerState; Graph g = *graph_ptr; typename Graph::VertexTopologyID v1 = 
fmap(g, getEdgeDst, eh); - wgh.addOne(); - intersect_dag_merge(wgh, graph_ptr, v0, v1, final_tri_count); + intersect_dag_merge(graph_ptr, v0, v1, final_tri_count); }, [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place { auto v0 = std::get<1>(innerState); @@ -39,7 +35,6 @@ void edge_tc_counting(pando::GlobalPtr graph_ptr, typename Graph::VertexT : fmap(graph, getLocalityVertex, v1); return locality; }); - PANDO_CHECK(wg.wait()); } // ##################################################################### @@ -55,10 +50,16 @@ template void tc_no_chunk(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { GraphType graph = *graph_ptr; - auto state = galois::make_tpl(graph_ptr, final_tri_count); + + galois::WaitGroup wg; + PANDO_CHECK(wg.initialize(0)); + auto wgh = wg.getHandle(); + auto state = galois::make_tpl(graph_ptr, final_tri_count, wgh); + galois::doAll( - state, graph.vertices(), +[](decltype(state) state, typename GraphType::VertexTopologyID v0) { - auto [graph_ptr, final_tri_count] = state; + wgh, state, graph.vertices(), + +[](decltype(state) state, typename GraphType::VertexTopologyID v0) { + auto [graph_ptr, final_tri_count, wgh] = state; GraphType graph = *graph_ptr; // Degree Filtering Optimization @@ -66,8 +67,10 @@ void tc_no_chunk(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count); }); + PANDO_CHECK(wg.wait()); + wg.deinitialize(); } /** @@ -159,11 +162,14 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, auto lcsr = graph.getLocalCSR(); uint64_t host_vertex_iter_offset = host_vertex_iter_offset_ref; - auto inner_state = galois::make_tpl(graph_ptr, final_tri_count); + galois::WaitGroup wg; + PANDO_CHECK(wg.initialize(0)); + auto wgh = wg.getHandle(); + auto inner_state = galois::make_tpl(graph_ptr, final_tri_count, wgh); 
galois::doAll( inner_state, fmap(lcsr, vertices, host_vertex_iter_offset, query_sz), +[](decltype(inner_state) inner_state, typename GraphDL::VertexTopologyID v0) { - auto [graph_ptr, final_tri_count] = inner_state; + auto [graph_ptr, final_tri_count, wgh] = inner_state; GraphDL graph = *graph_ptr; // Degree Filtering Optimization @@ -171,8 +177,9 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count); }); + PANDO_CHECK(wg.wait()); // Move iter offset uint64_t lcsr_num_vertices = fmap(lcsr, size); @@ -180,6 +187,7 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, if (host_vertex_iter_offset < lcsr_num_vertices) work_remaining.increment(); host_vertex_iter_offset_ref = host_vertex_iter_offset; + wg.deinitialize(); }); uint64_t current_count = final_tri_count.reduce(); From 43702d69a56f198d24954cb939693ab57bb787e1 Mon Sep 17 00:00:00 2001 From: Brenden Elgarten Date: Tue, 30 Jul 2024 15:38:20 +0000 Subject: [PATCH 3/6] fix overlapping prep timers --- pando-rt/src/init.cpp | 52 ++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/pando-rt/src/init.cpp b/pando-rt/src/init.cpp index f151fe28..209b965c 100644 --- a/pando-rt/src/init.cpp +++ b/pando-rt/src/init.cpp @@ -198,29 +198,35 @@ int main(int argc, char* argv[]) { rc = getrusage(RUSAGE_SELF, &end); if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");} auto thisPlace = pando::getCurrentPlace(); - SPDLOG_WARN("Total time on node: {}, was {}ns", - thisPlace.node.id, - end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - - (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + - end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - - (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); - for(std::uint64_t i = 0; i < 
std::uint64_t(dims.core.x + 1); i++) { - SPDLOG_WARN("Idle time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - idleCount.get(i)); - SPDLOG_WARN("Pointer time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - pointerCount.get(i)); - SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - schedulerCount.get(i)); - SPDLOG_WARN("DoAll time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), - doAllCount.get(i)); + for(std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) { + if (j == thisPlace.node.id) { + SPDLOG_WARN("Total time on node: {}, was {}ns", + thisPlace.node.id, + end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - + (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + + end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - + (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); + for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) { + SPDLOG_WARN("Idle time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + idleCount.get(i)); + SPDLOG_WARN("Pointer time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + pointerCount.get(i)); + SPDLOG_WARN("Scheduler time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + schedulerCount.get(i)); + SPDLOG_WARN("DoAll time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? 
-1 : i), + doAllCount.get(i)); + } + } + + pando::Nodes::barrier(); } From 536c9a370dff459511f3dc929d80750d30f1bd11 Mon Sep 17 00:00:00 2001 From: Brenden Elgarten Date: Tue, 6 Aug 2024 19:07:48 +0000 Subject: [PATCH 4/6] add synchronization to drv timers --- pando-rt/src/init.cpp | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/pando-rt/src/init.cpp b/pando-rt/src/init.cpp index 209b965c..ecfd1001 100644 --- a/pando-rt/src/init.cpp +++ b/pando-rt/src/init.cpp @@ -264,17 +264,27 @@ extern "C" __attribute__((visibility("default"))) int __drv_api_main(int argc, c if(rc != 0) {PANDO_ABORT("GETRUSAGE FAILED");} auto thisPlace = pando::getCurrentPlace(); auto dims = pando::getPlaceDims(); - SPDLOG_WARN("Total time on node: {}, was {}ns", - thisPlace.node.id, - end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - - (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + - end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - - (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); - for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 2); i++) { - SPDLOG_WARN("Idle time on node: {}, core: {} was {}", - thisPlace.node.id, - std::int8_t((i == std::uint64_t(dims.core.x + 1)) ? 
-1 : i), - idleCount.get(i)); + + + if (pando::isOnCP()) { + for (std::int64_t j = 0; j < std::int64_t(dims.node.id); j++) { + if (j == thisPlace.node.id) { + SPDLOG_WARN("Total time on node: {}, was {}ns", + thisPlace.node.id, + end.ru_utime.tv_sec * 1000000000 + end.ru_utime.tv_usec * 1000 - + (start.ru_utime.tv_sec * 1000000000 + start.ru_utime.tv_usec * 1000) + + end.ru_stime.tv_sec * 1000000000 + end.ru_stime.tv_usec * 1000 - + (start.ru_stime.tv_sec * 1000000000 + start.ru_stime.tv_usec * 1000)); + for(std::uint64_t i = 0; i < std::uint64_t(dims.core.x + 1); i++) { + SPDLOG_WARN("Idle time on node: {}, core: {} was {}", + thisPlace.node.id, + std::int8_t((i == std::uint64_t(dims.core.x)) ? -1 : i), + idleCount.get(i)); + } + } + + pando::CommandProcessor::barrier(); + } } return ret; From 5aa41b5e696855fbea2e9e7f18d15968d3a3ef70 Mon Sep 17 00:00:00 2001 From: Brenden Elgarten Date: Fri, 16 Aug 2024 18:43:38 +0000 Subject: [PATCH 5/6] Add flag to tc microbench to enable binary search --- .../triangle-counting/include/tc_algos.hpp | 7 +-- .../triangle-counting/include/utils.hpp | 12 ++++-- microbench/triangle-counting/src/tc.cpp | 2 +- microbench/triangle-counting/src/tc_algos.cpp | 43 +++++++++++++------ microbench/triangle-counting/src/utils.cpp | 6 ++- microbench/triangle-counting/test/test_tc.cpp | 8 ++-- 6 files changed, 52 insertions(+), 26 deletions(-) diff --git a/microbench/triangle-counting/include/tc_algos.hpp b/microbench/triangle-counting/include/tc_algos.hpp index 53cc4fc6..470414a6 100644 --- a/microbench/triangle-counting/include/tc_algos.hpp +++ b/microbench/triangle-counting/include/tc_algos.hpp @@ -5,7 +5,7 @@ #include "utils.hpp" -template +template void tc_no_chunk(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count); @@ -15,11 +15,12 @@ void tc_chunk_edges(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count); */ -template +template void tc_chunk_vertices(pando::GlobalPtr graph_ptr, galois::DAccumulator 
final_tri_count); void HBMainTC(pando::Array filename, int64_t num_vertices, bool load_balanced_graph, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count); + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count); #endif // TRIANGLE_COUNTING_INCLUDE_TC_ALGOS_HPP_ diff --git a/microbench/triangle-counting/include/utils.hpp b/microbench/triangle-counting/include/utils.hpp index d2410d0c..5b39da67 100644 --- a/microbench/triangle-counting/include/utils.hpp +++ b/microbench/triangle-counting/include/utils.hpp @@ -42,6 +42,7 @@ struct CommandLineOptions { int64_t num_vertices = 0; bool load_balanced_graph = false; TC_CHUNK tc_chunk = TC_CHUNK::NO_CHUNK; + bool binary_search = false; void print() { printf("******** CommandLineOptions ******** \n"); @@ -49,11 +50,16 @@ struct CommandLineOptions { std::cout << "num_vertices = " << num_vertices << '\n'; std::cout << "load_balanced_graph = " << load_balanced_graph << '\n'; std::cout << "tc_chunk = " << tc_chunk << '\n'; + std::cout << "binary_search = " << binary_search << '\n'; printf("******** END CommandLineOptions ******** \n"); } CommandLineOptions() - : elFile(""), num_vertices(0), load_balanced_graph(false), tc_chunk(TC_CHUNK::NO_CHUNK) {} + : elFile(""), + num_vertices(0), + load_balanced_graph(false), + tc_chunk(TC_CHUNK::NO_CHUNK), + binary_search(false) {} }; std::unique_ptr read_cmd_line_args(int argc, char** argv); @@ -93,8 +99,7 @@ void intersect_dag_merge(pando::GlobalPtr graph_ptr, } template -void intersect_dag_merge_double_binary(galois::WaitGroup::HandleType wgh, - pando::GlobalPtr graph_ptr, +void intersect_dag_merge_double_binary(pando::GlobalPtr graph_ptr, typename GraphType::VertexTopologyID v0, typename GraphType::VertexTopologyID v1, galois::DAccumulator final_tri_count) { @@ -143,7 +148,6 @@ void intersect_dag_merge_double_binary(galois::WaitGroup::HandleType wgh, } } final_tri_count.add(count); - wgh.done(); } // 
##################################################################### diff --git a/microbench/triangle-counting/src/tc.cpp b/microbench/triangle-counting/src/tc.cpp index da67d602..4cd01205 100644 --- a/microbench/triangle-counting/src/tc.cpp +++ b/microbench/triangle-counting/src/tc.cpp @@ -23,7 +23,7 @@ int pandoMain(int argc, char** argv) { PANDO_CHECK(final_tri_count.initialize()); HBMainTC(filename, opts->num_vertices, opts->load_balanced_graph, opts->tc_chunk, - final_tri_count); + opts->binary_search, final_tri_count); std::cout << "*** FINAL TRI COUNT = " << final_tri_count.reduce() << "\n"; #if BENCHMARK diff --git a/microbench/triangle-counting/src/tc_algos.cpp b/microbench/triangle-counting/src/tc_algos.cpp index b9fcbeaf..d6662a1d 100644 --- a/microbench/triangle-counting/src/tc_algos.cpp +++ b/microbench/triangle-counting/src/tc_algos.cpp @@ -13,7 +13,7 @@ * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ -template +template void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr graph_ptr, typename Graph::VertexTopologyID v0, typename Graph::EdgeRange edge_range, galois::DAccumulator final_tri_count) { @@ -25,7 +25,10 @@ void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr auto [graph_ptr, v0, wgh, final_tri_count] = innerState; Graph g = *graph_ptr; typename Graph::VertexTopologyID v1 = fmap(g, getEdgeDst, eh); - intersect_dag_merge(graph_ptr, v0, v1, final_tri_count); + if (binary_search) + intersect_dag_merge_double_binary(graph_ptr, v0, v1, final_tri_count); + else + intersect_dag_merge(graph_ptr, v0, v1, final_tri_count); }, [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place { auto v0 = std::get<1>(innerState); @@ -46,7 +49,7 @@ void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ -template 
+template void tc_no_chunk(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { GraphType graph = *graph_ptr; @@ -67,7 +70,8 @@ void tc_no_chunk(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), + final_tri_count); }); PANDO_CHECK(wg.wait()); wg.deinitialize(); @@ -134,6 +138,7 @@ void tc_chunk_edges(pando::GlobalPtr graph_ptr, * @param[in] graph_ptr Pointer to the in-memory graph * @param[in] final_tri_count Thread-safe counter */ +template void tc_chunk_vertices(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { GraphDL graph = *graph_ptr; @@ -177,7 +182,8 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, if (v0_degree < (TC_EMBEDDING_SZ - 1)) return; - edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), final_tri_count); + edge_tc_counting(wgh, graph_ptr, v0, graph.edges(v0), + final_tri_count); }); PANDO_CHECK(wg.wait()); @@ -206,7 +212,8 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, // TC GRAPH HBMAINS // ##################################################################### void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_vertices, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count) { + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count) { #if BENCHMARK auto time_graph_import_st = std::chrono::high_resolution_clock().now(); #endif @@ -235,7 +242,10 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ switch (tc_chunk) { case TC_CHUNK::CHUNK_VERTICES: - tc_chunk_vertices(graph_ptr, final_tri_count); + if (binary_search) + tc_chunk_vertices(graph_ptr, final_tri_count); + else + tc_chunk_vertices(graph_ptr, final_tri_count); break; /** case TC_CHUNK::CHUNK_EDGES: @@ -243,7 +253,10 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ break; */ default: - 
tc_no_chunk(graph_ptr, final_tri_count); + if (binary_search) + tc_no_chunk(graph_ptr, final_tri_count); + else + tc_no_chunk(graph_ptr, final_tri_count); break; } @@ -261,7 +274,7 @@ void HBGraphDL(pando::Place thisPlace, pando::Array filename, int64_t num_ } void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_vertices, - galois::DAccumulator final_tri_count) { + bool binary_search, galois::DAccumulator final_tri_count) { #if BENCHMARK auto time_graph_import_st = std::chrono::high_resolution_clock().now(); #endif @@ -288,7 +301,10 @@ void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_ auto time_tc_algo_st = std::chrono::high_resolution_clock().now(); #endif PANDO_MEM_STAT_NEW_KERNEL("TC_DFS_Algo Start"); - tc_no_chunk(graph_ptr, final_tri_count); + if (binary_search) + tc_no_chunk(graph_ptr, final_tri_count); + else + tc_no_chunk(graph_ptr, final_tri_count); #if BENCHMARK auto time_tc_algo_end = std::chrono::high_resolution_clock().now(); if (thisPlace.node.id == COORDINATOR_ID) @@ -303,11 +319,12 @@ void HBGraphDA(pando::Place thisPlace, pando::Array filename, int64_t num_ } void HBMainTC(pando::Array filename, int64_t num_vertices, bool load_balanced_graph, - TC_CHUNK tc_chunk, galois::DAccumulator final_tri_count) { + TC_CHUNK tc_chunk, bool binary_search, + galois::DAccumulator final_tri_count) { auto thisPlace = pando::getCurrentPlace(); if (load_balanced_graph) - HBGraphDL(thisPlace, filename, num_vertices, tc_chunk, final_tri_count); + HBGraphDL(thisPlace, filename, num_vertices, tc_chunk, binary_search, final_tri_count); else - HBGraphDA(thisPlace, filename, num_vertices, final_tri_count); + HBGraphDA(thisPlace, filename, num_vertices, binary_search, final_tri_count); } diff --git a/microbench/triangle-counting/src/utils.cpp b/microbench/triangle-counting/src/utils.cpp index 8dbb0a3e..a0bea367 100644 --- a/microbench/triangle-counting/src/utils.cpp +++ b/microbench/triangle-counting/src/utils.cpp @@ -13,7 +13,7 
@@ std::unique_ptr read_cmd_line_args(int argc, char** argv) { int32_t flag = 0; int32_t num_vertices = 0; int32_t tc_chunk = NO_CHUNK; - while ((flag = getopt(argc, argv, "v:i:c:l")) != -1) { + while ((flag = getopt(argc, argv, "v:i:c:lb")) != -1) { switch (flag) { case 'v': sscanf(optarg, "%d", &num_vertices); @@ -25,6 +25,9 @@ std::unique_ptr read_cmd_line_args(int argc, char** argv) { case 'l': opts_ptr->load_balanced_graph = true; break; + case 'b': + opts_ptr->binary_search = true; + break; case 'c': sscanf(optarg, "%d", &tc_chunk); switch (tc_chunk) { @@ -65,6 +68,7 @@ void printUsage(char* argv0) { std::cerr << "Usage: " << argv0 << " -i filepath -v numVertices" << std::endl; std::cerr << "\n Can specify runtime algorithm with -c. Valid options: [0 (NO_CHUNK), 1 " "(CHUNK_EDGES), 2 (CHUNK_VERTICES)]\n"; + std::cerr << "Can use double binary search counting with -b. Defaults to linear search\n"; } void printUsageExit(char* argv0) { diff --git a/microbench/triangle-counting/test/test_tc.cpp b/microbench/triangle-counting/test/test_tc.cpp index 2b588538..9a7b23e8 100644 --- a/microbench/triangle-counting/test/test_tc.cpp +++ b/microbench/triangle-counting/test/test_tc.cpp @@ -13,10 +13,10 @@ uint64_t get_expected_TC(const std::string okFile) { } void e2e_tc_test(uint64_t expected_tc, pando::Array filename, uint64_t num_vertices, - bool load_balanced_graph, TC_CHUNK tc_chunk) { + bool load_balanced_graph, bool binary_search, TC_CHUNK tc_chunk) { galois::DAccumulator final_tri_count; EXPECT_EQ(final_tri_count.initialize(), pando::Status::Success); - HBMainTC(filename, num_vertices, load_balanced_graph, tc_chunk, final_tri_count); + HBMainTC(filename, num_vertices, load_balanced_graph, tc_chunk, binary_search, final_tri_count); EXPECT_EQ(final_tri_count.reduce(), expected_tc); final_tri_count.deinitialize(); } @@ -34,7 +34,7 @@ TEST_P(TriangleCountChunking, BasicDL) { for (uint64_t i = 0; i < elFile.size(); i++) filename[i] = elFile[i]; - e2e_tc_test(expected_tc, 
filename, num_vertices, true, tc_chunk); + e2e_tc_test(expected_tc, filename, num_vertices, true, false, tc_chunk); filename.deinitialize(); } @@ -61,7 +61,7 @@ TEST_P(TriangleCountDACSR, BasicDA) { for (uint64_t i = 0; i < elFile.size(); i++) filename[i] = elFile[i]; - e2e_tc_test(expected_tc, filename, num_vertices, false, TC_CHUNK::NO_CHUNK); + e2e_tc_test(expected_tc, filename, num_vertices, false, false, TC_CHUNK::NO_CHUNK); filename.deinitialize(); } From 4ef6e070fba54b093a7837ca82eef4f0f84f9795 Mon Sep 17 00:00:00 2001 From: Brenden Elgarten Date: Thu, 22 Aug 2024 18:07:09 +0000 Subject: [PATCH 6/6] basic apply implementation + usage in tc microbench --- .../pando-lib-galois/utility/gptr_monad.hpp | 43 +++++++++++++++++++ .../triangle-counting/include/utils.hpp | 6 +-- microbench/triangle-counting/src/tc_algos.cpp | 17 +++++--- 3 files changed, 56 insertions(+), 10 deletions(-) diff --git a/include/pando-lib-galois/utility/gptr_monad.hpp b/include/pando-lib-galois/utility/gptr_monad.hpp index 0d0c7872..3dd74717 100644 --- a/include/pando-lib-galois/utility/gptr_monad.hpp +++ b/include/pando-lib-galois/utility/gptr_monad.hpp @@ -4,6 +4,8 @@ #ifndef PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ #define PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ +#include + /** * @brief lifts a function with no arguments to work on references */ @@ -70,4 +72,45 @@ auto applyFunc(pando::GlobalRef ref, F func) { return func(obj); } +#if 1 +#define apply(ref, func, ...) \ + __extension__({ \ + auto ptrComputed##__LINE__ = &(ref); \ + typename std::pointer_traits::element_type tmp = \ + *ptrComputed##__LINE__; \ + auto ret = tmp.func(__VA_ARGS__); \ + ret; \ + }) + +#elif 0 + +/* F is a method pointer */ +template +auto apply(pando::GlobalRef ref, F func, Args... args) { + T obj = ref; + return (obj.*func)(args...); +} + +/* F is a method pointer */ +template +auto apply(T& ref, F func, Args... 
args) { + T obj = ref; + return (obj.*func)(args...); +} +#else + +template +R apply(pando::GlobalRef ref, R (T::*func)(As...), As... args) { + T obj = ref; + return (obj.*func)(args...); +} + +template +R apply(T& ref, R (T::*func)(As...), As... args) { + T obj = ref; + return (obj.*func)(args...); +} + +#endif + #endif // PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_ diff --git a/microbench/triangle-counting/include/utils.hpp b/microbench/triangle-counting/include/utils.hpp index 5b39da67..c5ba63fe 100644 --- a/microbench/triangle-counting/include/utils.hpp +++ b/microbench/triangle-counting/include/utils.hpp @@ -215,7 +215,7 @@ void vertexset_intersection(pando::GlobalPtr graph_ptr, auto [graph_ptr, v1, final_tri_count, connection_kernel, v1_token] = state; GraphType g = *graph_ptr; (void)eh; // Required to prevent -Werror=unused-parameter - return fmap(g, getLocalityVertex, v1); + return apply(g, getLocalityVertex, v1); }; galois::doAll( @@ -223,9 +223,9 @@ void vertexset_intersection(pando::GlobalPtr graph_ptr, +[](decltype(state) state, typename GraphType::EdgeHandle eh) { auto [graph_ptr, v1, final_tri_count, connection_kernel, v1_token] = state; GraphType g = *graph_ptr; - typename GraphType::VertexTopologyID neighbor_of_v0 = fmap(g, getEdgeDst, eh); + typename GraphType::VertexTopologyID neighbor_of_v0 = apply(g, getEdgeDst, eh); typename GraphType::VertexTokenID neighbor_of_v0_token = - fmap(g, getTokenID, neighbor_of_v0); + apply(g, getTokenID, neighbor_of_v0); // Because of DAG optimization if (neighbor_of_v0_token <= v1_token) diff --git a/microbench/triangle-counting/src/tc_algos.cpp b/microbench/triangle-counting/src/tc_algos.cpp index d6662a1d..7041b5a4 100644 --- a/microbench/triangle-counting/src/tc_algos.cpp +++ b/microbench/triangle-counting/src/tc_algos.cpp @@ -2,6 +2,7 @@ // Copyright (c) 2023. University of Texas at Austin. All rights reserved. 
#include +#include // ##################################################################### // TC UTILS @@ -24,7 +25,7 @@ void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr +[](decltype(innerState) innerState, typename Graph::EdgeHandle eh) { auto [graph_ptr, v0, wgh, final_tri_count] = innerState; Graph g = *graph_ptr; - typename Graph::VertexTopologyID v1 = fmap(g, getEdgeDst, eh); + typename Graph::VertexTopologyID v1 = apply(g, getEdgeDst, eh); if (binary_search) intersect_dag_merge_double_binary(graph_ptr, v0, v1, final_tri_count); else @@ -32,10 +33,10 @@ void edge_tc_counting(galois::WaitGroup::HandleType wgh, pando::GlobalPtr }, [&graph](decltype(innerState) innerState, typename Graph::EdgeHandle eh) -> pando::Place { auto v0 = std::get<1>(innerState); - typename Graph::VertexTopologyID v1 = fmap(graph, getEdgeDst, eh); - bool v0_higher_degree = fmap(graph, getNumEdges, v0) >= fmap(graph, getNumEdges, v1); - pando::Place locality = v0_higher_degree ? fmap(graph, getLocalityVertex, v0) - : fmap(graph, getLocalityVertex, v1); + typename Graph::VertexTopologyID v1 = apply(graph, getEdgeDst, eh); + bool v0_higher_degree = apply(graph, getNumEdges, v0) >= apply(graph, getNumEdges, v1); + pando::Place locality = v0_higher_degree ? 
apply(graph, getLocalityVertex, v0) + : apply(graph, getLocalityVertex, v1); return locality; }); } @@ -141,6 +142,8 @@ void tc_chunk_edges(pando::GlobalPtr graph_ptr, template void tc_chunk_vertices(pando::GlobalPtr graph_ptr, galois::DAccumulator final_tri_count) { + using LCSR = galois::LCSR; + GraphDL graph = *graph_ptr; uint64_t query_sz = 1; uint64_t iters = 0; @@ -172,7 +175,7 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, auto wgh = wg.getHandle(); auto inner_state = galois::make_tpl(graph_ptr, final_tri_count, wgh); galois::doAll( - inner_state, fmap(lcsr, vertices, host_vertex_iter_offset, query_sz), + inner_state, apply(lcsr, vertices, host_vertex_iter_offset, query_sz), +[](decltype(inner_state) inner_state, typename GraphDL::VertexTopologyID v0) { auto [graph_ptr, final_tri_count, wgh] = inner_state; GraphDL graph = *graph_ptr; @@ -188,7 +191,7 @@ void tc_chunk_vertices(pando::GlobalPtr graph_ptr, PANDO_CHECK(wg.wait()); // Move iter offset - uint64_t lcsr_num_vertices = fmap(lcsr, size); + uint64_t lcsr_num_vertices = apply(lcsr, size); host_vertex_iter_offset += query_sz; if (host_vertex_iter_offset < lcsr_num_vertices) work_remaining.increment();