From ac4754dbd0bda5feff03a7a97143045507973807 Mon Sep 17 00:00:00 2001 From: Christine Yuan Date: Wed, 3 Apr 2024 18:11:51 -0500 Subject: [PATCH] random cancel policy implementation --- src/linnos/data_gather/Makefile | 21 +++ src/linnos/data_gather/parse-info.py | 48 +++++++ src/linnos/data_gather/run_benchmark.sh | 56 ++++++++ src/linnos/io_replayer/src/op_replayers.cpp | 152 +++++++++++++++----- src/linnos/io_replayer/src/op_replayers.hpp | 12 +- src/linnos/io_replayer/src/replayer.cpp | 22 ++- src/linnos/io_replayer/stats.py | 2 +- 7 files changed, 269 insertions(+), 44 deletions(-) create mode 100644 src/linnos/data_gather/Makefile create mode 100755 src/linnos/data_gather/parse-info.py create mode 100755 src/linnos/data_gather/run_benchmark.sh diff --git a/src/linnos/data_gather/Makefile b/src/linnos/data_gather/Makefile new file mode 100644 index 00000000..d9d83a45 --- /dev/null +++ b/src/linnos/data_gather/Makefile @@ -0,0 +1,21 @@ +# Makefile for running insert_mod.sh 10 times + +# Number of times to run the script +NUM_RUNS := 1 + +# Default target +.PHONY: all +all: run_script + +# Rule to run the script multiple times +.PHONY: run_script +run_script: + @for i in $$(seq 1 $(NUM_RUNS)); do \ + ./run_benchmark.sh; \ + done + +# Clean rule +.PHONY: clean +clean: + # Add clean commands if needed + diff --git a/src/linnos/data_gather/parse-info.py b/src/linnos/data_gather/parse-info.py new file mode 100755 index 00000000..91ba48e8 --- /dev/null +++ b/src/linnos/data_gather/parse-info.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python + +import os + +# Initialize variables to store cumulative sums +total_avg_read_latency = 0 +total_read_latency_p95 = 0 +total_read_latency_p99 = 0 +total_avg_write_latency = 0 +total_io_inter_arrival_time = 0 + +# Count the number of entries +count = 0 + +# Change to user's directory +os.chdir("/home/cyuan/LAKE/src/linnos/io_replayer") + +# Change to the specific policy's output file +with open("baseline_random_cancel_mix.out", "r") as file: + lines = file.readlines() + for i in range(0, len(lines), 6): + count += 1 + avg_read_latency = float(lines[i+1].split(":")[1].strip().split()[0]) + read_latency_p95 = float(lines[i+2].split(":")[1].strip().split()[0]) + read_latency_p99 = float(lines[i+3].split(":")[1].strip().split()[0]) + avg_write_latency = float(lines[i+4].split(":")[1].strip().split()[0]) + io_inter_arrival_time = float(lines[i+5].split(":")[1].strip().split()[0]) + + # Add to cumulative sums + total_avg_read_latency += avg_read_latency + total_read_latency_p95 += read_latency_p95 + total_read_latency_p99 += read_latency_p99 + total_avg_write_latency += avg_write_latency + total_io_inter_arrival_time += io_inter_arrival_time + +# Calculate averages +average_avg_read_latency = total_avg_read_latency / count +average_read_latency_p95 = total_read_latency_p95 / count +average_read_latency_p99 = total_read_latency_p99 / count +average_avg_write_latency = total_avg_write_latency / count +average_io_inter_arrival_time = total_io_inter_arrival_time / count + +# Print results +print("Average read latency:", average_avg_read_latency, "us") +print("Average read latency p95:", average_read_latency_p95, "us") +print("Average read latency p99:", average_read_latency_p99, "us") +print("Average write latency:", average_avg_write_latency, "us") +print("Average IO inter arrival time:", average_io_inter_arrival_time, "us") diff --git a/src/linnos/data_gather/run_benchmark.sh b/src/linnos/data_gather/run_benchmark.sh new file mode 100755 index 00000000..53532cef --- /dev/null +++ b/src/linnos/data_gather/run_benchmark.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# enable, disable, make_then_disable, make_then_enable +prep="disable" +count=1 +benchmark="mix" +# baseline, baseline_random_cancel, failover, failover_random_cancel +policy="baseline_random_cancel" +outfile="${policy}_${benchmark}.out" + +if [ "$prep" = "disable" ]; then + sudo ./disable_linnos.sh +elif [ "$prep" = "make_then_disable" ]; then + cd .. + sudo make + cd kernel_hook + echo "building kernel_hook main" + sudo make clean all + sudo ./disable_linnos.sh +elif [ "$prep" = "enable" ]; then + sudo ./disable_linnos.sh + sudo ./enable_linnos_gpu.sh +elif [ "$prep" = "make_then_enable" ]; then + cd .. + sudo make + cd kernel_hook + echo "building kernel_hook main" + sudo make clean all + sudo ./disable_linnos.sh + sudo ./enable_linnos_gpu.sh +fi + +# remake io_replayer +cd ../io_replayer +sudo make + +if [ "$benchmark" != "mix" ]; then + echo "first" + # for all benchmarks that are not mix + for i in $(seq $count); do + echo "Run $i" + ./run_3ssds.sh ${policy} ../trace_tools/${benchmark}/${benchmark}1.trace \ ../trace_tools/${benchmark}/${benchmark}1.trace \ ../trace_tools/${benchmark}/${benchmark}.trace + cd ../io_replayer + python3 stats.py 3ssds_${policy}.data>>${outfile} + sudo rm 3ssds_${policy}.data + done +else + echo "second" + for i in $(seq $count); do + echo "Run $i" + ./run_3ssds.sh ${policy} ../trace_tools/azure/azure1.trace \ ../trace_tools/bing_i/bing_i1.trace \ ../trace_tools/cosmos/cosmos1.trace + cd ../io_replayer + python3 stats.py 3ssds_${policy}.data>>${outfile} + sudo rm 3ssds_${policy}.data + done +fi \ No newline at end of file diff --git a/src/linnos/io_replayer/src/op_replayers.cpp b/src/linnos/io_replayer/src/op_replayers.cpp index ec395f2c..588ce619 100644 --- a/src/linnos/io_replayer/src/op_replayers.cpp +++ b/src/linnos/io_replayer/src/op_replayers.cpp @@ -18,17 +18,22 @@ */ -#include +#include +#include +#include #include "op_replayers.hpp" - -// static int64_t get_next_multiple(uint64_t A, uint64_t B) { -// if (A % B) -// A = A + (B - A % B); -// return A; -// } +#include +#include +#include #define MAX_FAIL 2 +std::random_device rd; +std::mt19937 gen(rd()); +std::mt19937 gen2(rd()); +std::uniform_int_distribution<> distribution2(1, 2); +std::uniform_int_distribution<> distribution(1, 100); // Example range from 1 to 100 + static int sleep_until(uint64_t next) { uint64_t now = get_ns_ts(); int64_t diff = next - now; @@ -51,7 +56,49 @@ void baseline_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* if(trace_op.op == 0) { trace->add_io_count(device); ret = pread(fds[device], buf, trace_op.size, trace_op.offset); - } else if(trace_op.op == 1) { + } + else if(trace_op.op == 1) { + trace->add_io_count(device); + ret = pwrite(fds[device], buf, trace_op.size, trace_op.offset); + } else { + printf("Wrong OP code! %d\n", trace_op.op); + } + + if (ret < 0){ + printf("FAILURE ret: %d\n", ret); + printf("IO error during pread/pwrite: %s\n", strerror(errno)); + printf("err %d\n", errno); + printf("offset in B : %lu\n", trace_op.offset ); + printf("size in B : %lu\n", trace_op.size); + } +} + +/* + This method randomly cancels the request for LAKE, as specified by the rand_int generated. After selecting + whether or not to cancel, the device that the request will be assigned to will also be randomly generated/selected. + This gives us data on the accuracy of the policy when collecting different percentages of cancellation. +*/ +void baseline_random_cancel_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf) { + int ret; + int *fds = trace->get_fds(); + //read + if(trace_op.op == 0) { + baseline_total_count++; + std::srand(static_cast(std::time(nullptr))); + int rand_int = distribution(gen); // Generate a random number + if (1 <= rand_int && rand_int <= 5) { // Decides whether ot not to cancel, change the 5 to 10 or 50 to edit percentages + baseline_random_cancel_count++; + int random = distribution2(gen2); // Randomly picks a device to send request to + if (random == 1) { + device = (--device + 3) % 3; + } else { + device = ++device % 3; + } + } + trace->add_io_count(device); + ret = pread(fds[device], buf, trace_op.size, trace_op.offset); + } + else if(trace_op.op == 1) { trace->add_io_count(device); ret = pwrite(fds[device], buf, trace_op.size, trace_op.offset); } else { @@ -86,7 +133,6 @@ void failover_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* trace->add_fail(device); } if (!success) { - //printf("IO never finished..\n"); trace->add_unique_fail(device); pread(fds[device], buf, trace_op.size, 0); //this is what linnos does } @@ -97,33 +143,64 @@ void failover_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* } else { printf("Wrong OP code! %d\n", trace_op.op); } +} + +/* + This method randomly cancels the request for LAKE, as specified by the rand_int generated. After selecting + whether or not to cancel, the device that the request will be assigned to will also be randomly generated/selected. + This gives us data on the accuracy of the policy when collecting different percentages of cancellation. +*/ +void failover_random_cancel_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf) { + int ret, i; + int *fds = trace->get_fds(); + bool success = false; + bool random_cancel; - // int ret, i; - // int *fds = trace->get_fds(); - // bool success = false; - // //read - // if(trace_op.op == 0) { - // for (i = 0 ; i < MAX_FAIL ; i++) { - // trace->add_io_count((device+i)%2); - // ret = pread(fds[(device+i)%2], buf, trace_op.size, trace_op.offset); - // if (ret > 0) { - // success = true; - // break; - // } - // trace->add_fail(device); - // } - // //max fail.. it looped around, linnos never handled this case - // if (!success) { - // //printf("IO never finished..\n"); - // trace->add_unique_fail(device); - // pread(fds[device], buf, trace_op.size, 0); //this is what linnos does - // } - // } else if(trace_op.op == 1) { - // trace->add_io_count(device); - // ret = pwrite(fds[device], buf, trace_op.size, trace_op.offset); - // } else { - // printf("Wrong OP code! %d\n", trace_op.op); - // } + //read + if(trace_op.op == 0) { + total_count++; + trace->add_io_count(device); + ret = pread(fds[device], buf, trace_op.size, trace_op.offset); + if (ret < 0) { + trace->add_fail(device); + for (i = 0 ; i < MAX_FAIL ; i++) { + std::srand(static_cast(std::time(nullptr))); + int rand_int = distribution(gen); // Generate a random number + if (1 <= rand_int && rand_int <= 5) { // Decides whether ot not to cancel, change the 5 to 10 or 50 to edit percentages + random_cancel_count++; + random_cancel = true; + device = (--device + 3) % 3; + } else { + random_cancel = false; + device = ++device % 3; + } + trace->add_io_count(device); + ret = pread(fds[device], buf, trace_op.size, trace_op.offset); + if (random_cancel == true) { + if (ret < 0) { + success = false; + } else { + success = true; + } + break; + } + if (ret > 0) { + success = true; + break; + } + trace->add_fail(device); + } + if (!success) { + trace->add_unique_fail(device); + pread(fds[device], buf, trace_op.size, 0); //this is what linnos does + } + } + } else if(trace_op.op == 1) { + trace->add_io_count(device); + ret = pwrite(fds[device], buf, trace_op.size, trace_op.offset); + } else { + printf("Wrong OP code! %d\n", trace_op.op); + } } void strawman_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf) { @@ -194,6 +271,7 @@ void* replayer_fn(void* arg) { //start together pthread_barrier_wait(targ->sync_barrier); int is_late; + //add variable to see if ml_enabled has changed while (1) { trace_op = trace->get_line(device); if (trace_op.timestamp == -1) { @@ -212,6 +290,8 @@ void* replayer_fn(void* arg) { uint32_t elaps = std::chrono::duration_cast(end - begin).count(); uint64_t end_ts = get_ns_ts(); + //print out whether it's changed or not within the loop + //store results trace->write_output_line(end_ts/1000, elaps, trace_op.op, trace_op.size, trace_op.offset, submission/1000, @@ -220,3 +300,5 @@ void* replayer_fn(void* arg) { free(buf); return 0; } + + diff --git a/src/linnos/io_replayer/src/op_replayers.hpp b/src/linnos/io_replayer/src/op_replayers.hpp index b317dca3..86c2ee4a 100644 --- a/src/linnos/io_replayer/src/op_replayers.hpp +++ b/src/linnos/io_replayer/src/op_replayers.hpp @@ -26,6 +26,9 @@ #include #include "replayer.hpp" +// added +#include + struct Thread_arg { Trace *trace; uint32_t device; @@ -33,15 +36,20 @@ struct Thread_arg { uint64_t start_ts; void (*executor)(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); + //int percent }; - void baseline_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); +void baseline_random_cancel_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); void strawman_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); void failover_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); +void failover_random_cancel_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); void strawman_2ssds_execute_op(TraceOp &trace_op, Trace *trace, uint32_t device, char* buf); void* replayer_fn(void* arg); - +static uint64_t random_cancel_count = 0; +static uint64_t total_count = 0; +static uint64_t baseline_random_cancel_count = 0; +static uint64_t baseline_total_count = 0; #endif \ No newline at end of file diff --git a/src/linnos/io_replayer/src/replayer.cpp b/src/linnos/io_replayer/src/replayer.cpp index da1e8b07..ba71c9b0 100644 --- a/src/linnos/io_replayer/src/replayer.cpp +++ b/src/linnos/io_replayer/src/replayer.cpp @@ -34,6 +34,10 @@ #include #include +// added +#include +#include + #include "replayer.hpp" #include "op_replayers.hpp" @@ -52,7 +56,7 @@ int main (int argc, char **argv) std::string devices_to_trace(argv[3]); int n_devices_to_trace = std::stoi(devices_to_trace); Trace trace(argv[4]); - + printf("Number devices %d\n", n_devices_to_trace); for (int i=0; i < n_devices_to_trace ; i++) { printf("parsing trace %d\n", i); trace.parse_file(i, argv[5+i]); @@ -72,13 +76,16 @@ int main (int argc, char **argv) targs[dev][j].trace = &trace; targs[dev][j].device = dev; targs[dev][j].sync_barrier = &sync_barrier; - - if(type == "baseline") + if(type == "baseline") { targs[dev][j].executor = baseline_execute_op; - else if (type == "strawman") { + } else if (type == "baseline_random_cancel") { + targs[dev][j].executor = baseline_random_cancel_execute_op; + } else if (type == "strawman") { targs[dev][j].executor = strawman_execute_op; } else if (type == "failover") { targs[dev][j].executor = failover_execute_op; + } else if (type == "failover_random_cancel") { + targs[dev][j].executor = failover_random_cancel_execute_op; } else if (type == "strawman2") { targs[dev][j].executor = strawman_2ssds_execute_op; } else { @@ -108,9 +115,12 @@ int main (int argc, char **argv) auto end = std::chrono::steady_clock::now(); uint64_t elaps = std::chrono::duration_cast(end - begin).count(); - printf("Trace took %lu seconds to finish.\n", elaps); + printf("Trace took %ld seconds to finish.\n", elaps); + // printf("Total number of read operations in failover: %" PRIu64 "\n", total_count); + // printf("Total number of randomly cancelled operations in failover: %" PRIu64 "\n", random_cancel_count); trace.print_stats(); + // printf("Trace has %ld requests\n", trace->get_io_count(device)); return 0; -} +} \ No newline at end of file diff --git a/src/linnos/io_replayer/stats.py b/src/linnos/io_replayer/stats.py index 5cc5a587..22aad688 100755 --- a/src/linnos/io_replayer/stats.py +++ b/src/linnos/io_replayer/stats.py @@ -90,7 +90,7 @@ #for i in range(3): # print(f"Avg read for drive [{i}]: {statistics.mean(rd_lat_dev[i]):.2f}") - print (f"IO inter arrival time average {statistics.mean(inter_arrivals):.2f}us") + print (f"IO inter arrival time average: {statistics.mean(inter_arrivals):.2f} us") # count, x = np.histogram(inters, bins=500)