From 76d77f75c46d7f4cf75c364499d0aaa32140818b Mon Sep 17 00:00:00 2001 From: kaoudis Date: Mon, 2 Dec 2024 21:56:37 +0000 Subject: [PATCH 001/112] add ignoring vector types to gep tainting - otherwise I think we overtaint AND things break --- polytracker/src/passes/tainted_control_flow.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index d8142794..8498d681 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -95,6 +95,11 @@ void TaintedControlFlowPass::visitGetElementPtrInst( continue; } + // we do not handle VectorTypes yet + if ((*(idx->getType())).isVectorTy()) { + continue; + } + auto callret = ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), get_function_id_const(gep)}); From f39a88b22e25f2525b7b0826b42e29e19f282559 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Tue, 3 Dec 2024 15:11:54 +0000 Subject: [PATCH 002/112] merge the relevant bits of the separate function tracing pass into tcf, so that we can reuse or get rid of some of the duplicate tdag sections, and get rid of the separate json file for functions --- polytracker/build.py | 20 +------ polytracker/include/taintdag/fntrace.h | 35 ------------ polytracker/include/taintdag/polytracker.h | 13 +---- polytracker/src/CMakeLists.txt | 1 - polytracker/src/passes/CMakeLists.txt | 2 +- polytracker/src/passes/function_tracing.cpp | 57 ------------------- polytracker/src/passes/pass_plugin.cpp | 5 -- .../src/passes/tainted_control_flow.cpp | 17 +++++- polytracker/src/polytracker/polytracker.cpp | 16 ------ polytracker/src/taintdag/fntrace.cpp | 24 -------- polytracker/src/taintdag/polytracker.cpp | 17 ------ unittests/src/taintdag/fntrace.cpp | 42 +++++++------- 12 files changed, 41 insertions(+), 208 deletions(-) delete mode 100644 polytracker/include/taintdag/fntrace.h delete mode 100644 
polytracker/src/passes/function_tracing.cpp delete mode 100644 polytracker/src/taintdag/fntrace.cpp diff --git a/polytracker/build.py b/polytracker/build.py index eeded59c..3a81f6d8 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -198,7 +198,7 @@ def _instrument_bitcode( pass_pipeline.append("pt-taint") if add_function_tracing: - pass_pipeline.append("pt-ftrace") + pass_pipeline.append("pt-tcf") if add_taint_tracking: pass_pipeline += ["pt-dfsan", "pt-rm-fn-attr"] @@ -218,7 +218,7 @@ def _instrument_bitcode( cmd.append(f"-pt-dfsan-abilist={ABI_PATH}/{item}") if add_function_tracing: - # ignore lists for `pt-ftrace` + # ignore lists for `pt-tcf` (function tracing for control flow logging) cmd.append(f"-pt-ftrace-ignore-list={POLY_ABI_LIST_PATH}") for item in ignore_lists: cmd.append(f"-pt-ftrace-ignore-list={ABI_PATH}/{item}") @@ -322,12 +322,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="instrument with taint tracking", ) - parser.add_argument( - "--ftrace", - action="store_true", - help="instrument with function tracing", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -341,7 +335,6 @@ def run(self, args: argparse.Namespace): args.output, args.ignore_lists, args.taint, - args.ftrace, ) @@ -405,12 +398,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="instrument with taint tracking", ) - parser.add_argument( - "--ftrace", - action="store_true", - help="instrument with function tracing", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -421,7 +408,7 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): parser.add_argument( "--cflog", action="store_true", - help="instrument with control affecting dataflow logging", + help="instrument with function tracing and control affecting dataflow logging", ) def run(self, args: argparse.Namespace): @@ -442,6 +429,5 @@ def run(self, args: argparse.Namespace): inst_bc_path, args.ignore_lists, args.taint, - args.ftrace, ) 
_lower_bitcode(inst_bc_path, Path(inst_bc_path.stem), target_cmd) diff --git a/polytracker/include/taintdag/fntrace.h b/polytracker/include/taintdag/fntrace.h deleted file mode 100644 index c150ab44..00000000 --- a/polytracker/include/taintdag/fntrace.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. - */ - -#pragma once - -#include "taintdag/fnmapping.h" - -#include - -namespace taintdag { - -struct Event { -public: - enum class kind_t : uint8_t { entry, exit }; - kind_t kind; - Functions::index_t function; -}; - -class Events : public FixedSizeAlloc { -public: - static constexpr uint8_t tag{7}; - static constexpr size_t allocation_size{std::numeric_limits::max() * - sizeof(Event)}; - - template Events(SectionArg of) : FixedSizeAlloc{of.range} {} - - void log_fn_event(Event::kind_t kind, Functions::index_t idx); -}; - -} // namespace taintdag \ No newline at end of file diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 751f6d17..6c27b69e 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -14,7 +14,6 @@ #include "taintdag/bitmap_section.h" #include "taintdag/control_flow_log.h" #include "taintdag/fnmapping.h" -#include "taintdag/fntrace.h" #include "taintdag/labels.h" #include "taintdag/sink.h" #include "taintdag/stream_offset.h" @@ -60,14 +59,9 @@ class PolyTracker { void log_tainted_control_flow(label_t taint_label, uint32_t function_id); // Instrumentation callback for when execution enters a function - // NOTE: There is a overlap in functionality between this and `function_entry` - // they will co-exist for now as they operate slightly different. 
The - // underlying reason is that this was developed separately to support the - // Tainted Control Flow logging mechanism. void enter_function(uint32_t function_id); // Instrumentation callback for when execution leaves a function - // NOTE: Se `enter_function` comment about overlap. void leave_function(uint32_t function_id); // Log tainted data flowed into the sink @@ -75,11 +69,6 @@ class PolyTracker { // Same as before, but use same label for all data void taint_sink(int fd, util::Offset offset, label_t label, size_t length); - // Log function entry - Functions::index_t function_entry(std::string_view name); - // Log function exit - void function_exit(Functions::index_t index); - private: taint_range_t create_source_taint(source_index_t src, std::span dst, @@ -95,7 +84,7 @@ class PolyTracker { // sections and in which order they appear. using ConcreteOutputFile = OutputFile; + SourceLabelIndexSection, Functions, ControlFlowLog>; ConcreteOutputFile output_file_; // Tracking source offsets for streams (where offsets can be determined by diff --git a/polytracker/src/CMakeLists.txt b/polytracker/src/CMakeLists.txt index 66070107..2950ea1f 100644 --- a/polytracker/src/CMakeLists.txt +++ b/polytracker/src/CMakeLists.txt @@ -42,7 +42,6 @@ set(TAINTDAG_SOURCES ${TAINTDAG_DIR}/polytracker.cpp ${TAINTDAG_DIR}/print.cpp ${TAINTDAG_DIR}/fnmapping.cpp - ${TAINTDAG_DIR}/fntrace.cpp ${TAINTDAG_DIR}/util.cpp) add_library(Polytracker STATIC ${POLYTRACKER_SOURCES} ${TAINT_SOURCES} diff --git a/polytracker/src/passes/CMakeLists.txt b/polytracker/src/passes/CMakeLists.txt index af6aaa9d..bd68ca52 100644 --- a/polytracker/src/passes/CMakeLists.txt +++ b/polytracker/src/passes/CMakeLists.txt @@ -6,7 +6,7 @@ endif(APPLE) add_library( PolytrackerPass SHARED - taint_tracking.cpp remove_fn_attr.cpp function_tracing.cpp tainted_control_flow.cpp + taint_tracking.cpp remove_fn_attr.cpp tainted_control_flow.cpp DataFlowSanitizer.cpp utils.cpp pass_plugin.cpp) target_link_libraries( diff 
--git a/polytracker/src/passes/function_tracing.cpp b/polytracker/src/passes/function_tracing.cpp deleted file mode 100644 index 9fb228e1..00000000 --- a/polytracker/src/passes/function_tracing.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. - */ - -#include "polytracker/passes/function_tracing.h" - -#include -#include - -#include "polytracker/passes/utils.h" -#include "taintdag/fnmapping.h" - -static llvm::cl::list ignore_lists( - "pt-ftrace-ignore-list", - llvm::cl::desc( - "File that specifies functions that pt-ftrace should ignore")); - -namespace polytracker { - -void FunctionTracingPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::IRBuilder<> ir(mod.getContext()); - auto fn_index_t{ir.getIntNTy(sizeof(taintdag::Functions::index_t) * 8)}; - func_entry_log_fn = - mod.getOrInsertFunction("__polytracker_log_func_entry", fn_index_t, - ir.getInt8PtrTy(), ir.getInt16Ty()); - func_exit_log_fn = mod.getOrInsertFunction("__polytracker_log_func_exit", - ir.getVoidTy(), fn_index_t); -} - -void FunctionTracingPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(func_exit_log_fn, log_entry_calls[ri.getFunction()]); -} - -llvm::PreservedAnalyses -FunctionTracingPass::run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam) { - declareLoggingFunctions(mod); - auto ignore{readIgnoreLists(ignore_lists)}; - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fn.isDeclaration() || ignore.count(fname.str())) { - continue; - } - llvm::IRBuilder<> ir(&*fn.getEntryBlock().begin()); - auto fname_ptr{ir.CreateGlobalStringPtr(fname)}; - log_entry_calls[&fn] = ir.CreateCall( - func_entry_log_fn, {fname_ptr, ir.getInt16(fname.size())}); - visit(fn); - } - return llvm::PreservedAnalyses::none(); -} - -} // namespace 
polytracker \ No newline at end of file diff --git a/polytracker/src/passes/pass_plugin.cpp b/polytracker/src/passes/pass_plugin.cpp index e8ad4a1e..5de824cf 100644 --- a/polytracker/src/passes/pass_plugin.cpp +++ b/polytracker/src/passes/pass_plugin.cpp @@ -10,7 +10,6 @@ #include #include "polytracker/passes/DataFlowSanitizer.h" -#include "polytracker/passes/function_tracing.h" #include "polytracker/passes/remove_fn_attr.h" #include "polytracker/passes/taint_tracking.h" #include "polytracker/passes/tainted_control_flow.h" @@ -33,10 +32,6 @@ llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { mpm.addPass(polytracker::RemoveFnAttrsPass()); return true; } - if (name == "pt-ftrace") { - mpm.addPass(polytracker::FunctionTracingPass()); - return true; - } if (name == "pt-tcf") { mpm.addPass(polytracker::TaintedControlFlowPass()); return true; diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 8498d681..8b82d113 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -20,6 +20,11 @@ #include +static llvm::cl::list ignore_lists( + "pt-ftrace-ignore-list", + llvm::cl::desc( + "File that specifies functions that pt-tcf should ignore")); + namespace polytracker { namespace detail { @@ -185,10 +190,18 @@ TaintedControlFlowPass::run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam) { label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + for (auto &fn : mod) { - instrumentFunctionEnter(fn); - visit(fn); + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); + } } + return llvm::PreservedAnalyses::none(); } diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 56064e3b..587f9d6e 100644 --- 
a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -21,22 +21,6 @@ static void polytracker_initialize() { polytracker_init_flag.test_and_set(std::memory_order_relaxed); } -extern "C" taintdag::Functions::index_t -__polytracker_log_func_entry(char *name, uint16_t len) { - if (!polytracker_is_initialized()) { - return 0; - } - return get_polytracker_tdag().function_entry({name, len}); -} - -extern "C" void -__polytracker_log_func_exit(taintdag::Functions::index_t func_index) { - if (!polytracker_is_initialized()) { - return; - } - get_polytracker_tdag().function_exit(func_index); -} - extern "C" dfsan_label __polytracker_union_table(const dfsan_label &l1, const dfsan_label &l2) { if (!polytracker_is_initialized()) { diff --git a/polytracker/src/taintdag/fntrace.cpp b/polytracker/src/taintdag/fntrace.cpp deleted file mode 100644 index 745fda53..00000000 --- a/polytracker/src/taintdag/fntrace.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. 
- */ - -#include "taintdag/fntrace.h" - -#include - -#include "taintdag/error.h" - -namespace taintdag { - -void Events::log_fn_event(Event::kind_t kind, Functions::index_t idx) { - // Write an `Event` via `construct` - if (!construct(kind, idx)) { - error_exit("Failed to log event with id: ", count()); - } -} - -} // namespace taintdag \ No newline at end of file diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 65b683a8..6f48a34e 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -15,7 +15,6 @@ #include "taintdag/error.h" #include "taintdag/fnmapping.h" -#include "taintdag/fntrace.h" namespace taintdag { @@ -187,20 +186,4 @@ void PolyTracker::leave_function(uint32_t function_id) { output_file_.section().leave_function(function_id); } -Functions::index_t PolyTracker::function_entry(std::string_view name) { - auto &functions{output_file_.section()}; - auto maybe_index{functions.add_mapping(name)}; - if (!maybe_index) { - error_exit("Failed to add function mapping for: ", name); - } - auto &events{output_file_.section()}; - events.log_fn_event(Event::kind_t::entry, *maybe_index); - return *maybe_index; -} - -void PolyTracker::function_exit(Functions::index_t index) { - auto &events{output_file_.section()}; - events.log_fn_event(Event::kind_t::exit, index); -} - } // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/fntrace.cpp b/unittests/src/taintdag/fntrace.cpp index a1600e38..332b6810 100644 --- a/unittests/src/taintdag/fntrace.cpp +++ b/unittests/src/taintdag/fntrace.cpp @@ -6,26 +6,26 @@ * the LICENSE file found in the root directory of this source tree. 
*/ -#include "taintdag/fntrace.h" +// #include "taintdag/tainted_control_flow.h" -#include +// #include -TEST_CASE("Test fntrace operations") { - namespace td = taintdag; - SECTION("Log unique events") { - td::OutputFile of{std::tmpnam(nullptr)}; - auto &events{of.section()}; - td::Functions::index_t fnidx{0}; - events.log_fn_event(td::Event::kind_t::entry, fnidx); - events.log_fn_event(td::Event::kind_t::exit, fnidx); - SECTION("Events are successfully written") { - REQUIRE(events.count() == 2); - td::Event entry{*events.begin()}; - REQUIRE(entry.kind == td::Event::kind_t::entry); - REQUIRE(entry.function == fnidx); - td::Event exit{*(events.begin() + 1)}; - REQUIRE(exit.kind == td::Event::kind_t::exit); - REQUIRE(exit.function == fnidx); - } - } -} \ No newline at end of file +// TEST_CASE("Test fntrace operations") { +// namespace td = taintdag; +// SECTION("Log unique events") { +// td::OutputFile of{std::mkstemp(nullptr)}; +// auto &events{of.section()}; +// td::Functions::index_t fnidx{0}; +// events.log_fn_event(td::Event::kind_t::entry, fnidx); +// events.log_fn_event(td::Event::kind_t::exit, fnidx); +// SECTION("Events are successfully written") { +// REQUIRE(events.count() == 2); +// td::Event entry{*events.begin()}; +// REQUIRE(entry.kind == td::Event::kind_t::entry); +// REQUIRE(entry.function == fnidx); +// td::Event exit{*(events.begin() + 1)}; +// REQUIRE(exit.kind == td::Event::kind_t::exit); +// REQUIRE(exit.function == fnidx); +// } +// } +// } \ No newline at end of file From 2eee49bebd1d2dff7f3306126b3212799f0c1f13 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Tue, 3 Dec 2024 15:26:10 +0000 Subject: [PATCH 003/112] remove ftrace argument uses --- examples/Dockerfile-acropalypse.demo | 2 +- examples/Dockerfile-daedalus-nitf.demo | 2 +- examples/Dockerfile-daedalus-pdf.demo | 4 ++-- examples/Dockerfile-ffmpeg.demo | 2 +- examples/Dockerfile-file.demo | 2 +- examples/Dockerfile-jq.demo | 2 +- examples/Dockerfile-libjpeg.demo | 2 +- 
examples/Dockerfile-mupdf.demo | 4 ++-- examples/Dockerfile-nitro-nitf.demo | 5 +---- examples/Dockerfile-openjpeg.demo | 2 +- examples/Dockerfile-poppler.demo | 4 ++-- examples/Dockerfile-qpdf.demo | 4 ++-- examples/Dockerfile-xpdf.demo | 6 +++--- examples/analysis/ubet/Dockerfile.nitro | 12 ++---------- examples/analysis/ubet/build_nitro.sh | 4 ++-- examples/http/httpd/Dockerfile | 12 ++++++------ examples/http/picohttpparser/Dockerfile | 2 +- tests/conftest.py | 2 +- 18 files changed, 31 insertions(+), 42 deletions(-) diff --git a/examples/Dockerfile-acropalypse.demo b/examples/Dockerfile-acropalypse.demo index fedbd39f..95f88910 100644 --- a/examples/Dockerfile-acropalypse.demo +++ b/examples/Dockerfile-acropalypse.demo @@ -27,5 +27,5 @@ RUN CPPFLAGS="-I$(pwd)/zlib-1.2.13/include" LDFLAGS="-L$(pwd)/zlib-1.2.13/lib" p RUN polytracker extract-bc -o pngtest.bc pngtest RUN llvm-link -o pngtest-linked.bc pngtest.bc libz.bc -RUN polytracker instrument-bc --taint --ftrace pngtest-linked.bc -o instrumented.bc +RUN polytracker instrument-bc --taint --cflog pngtest-linked.bc -o instrumented.bc RUN polytracker lower-bc instrumented.bc -t pngtest -o pngtest.instrumented diff --git a/examples/Dockerfile-daedalus-nitf.demo b/examples/Dockerfile-daedalus-nitf.demo index 3e86d07f..ebdfabeb 100644 --- a/examples/Dockerfile-daedalus-nitf.demo +++ b/examples/Dockerfile-daedalus-nitf.demo @@ -30,5 +30,5 @@ RUN cabal run ../../:daedalus -- compile-c++ nitf_main.ddl --out-dir=cpp_parser WORKDIR /polytracker/the_klondike/daedalus/formats/nitf/cpp_parser RUN polytracker build make parser && \ - polytracker instrument-targets --taint --ftrace parser --ignore-lists gmp ssl libz && \ + polytracker instrument-targets --taint --cflog parser --ignore-lists gmp ssl libz && \ mv parser.instrumented parser-track diff --git a/examples/Dockerfile-daedalus-pdf.demo b/examples/Dockerfile-daedalus-pdf.demo index a3ebca69..f8d9edd5 100644 --- a/examples/Dockerfile-daedalus-pdf.demo +++ 
b/examples/Dockerfile-daedalus-pdf.demo @@ -4,7 +4,7 @@ RUN mkdir -p /polytracker/the_klondike WORKDIR /polytracker/the_klondike -ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y upgrade && apt-get install -y git pkg-config RUN git clone --recursive https://github.com/GaloisInc/daedalus.git @@ -30,5 +30,5 @@ WORKDIR /polytracker/the_klondike/daedalus/formats/pdf/new/c++ RUN polytracker build cmake -S . -B build RUN polytracker build cmake --build build --target parser-test -j$(nproc) -RUN polytracker instrument-targets --taint --ftrace parser-test --ignore-lists gmp ssl libz +RUN polytracker instrument-targets --taint --cflog parser-test --ignore-lists gmp ssl libz RUN mv parser-test.instrumented parser-test-track \ No newline at end of file diff --git a/examples/Dockerfile-ffmpeg.demo b/examples/Dockerfile-ffmpeg.demo index 5e19b135..91bcb336 100644 --- a/examples/Dockerfile-ffmpeg.demo +++ b/examples/Dockerfile-ffmpeg.demo @@ -27,7 +27,7 @@ RUN ../configure --disable-everything \ --disable-asm RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace ffmpeg_g --ignore-lists libz +RUN polytracker instrument-targets --taint --cflog ffmpeg_g --ignore-lists libz RUN mv ffmpeg_g.instrumented ffmpeg_track # Use the following command to transcode a `.mov` h264/aac file to an `.avi` raw/aac file diff --git a/examples/Dockerfile-file.demo b/examples/Dockerfile-file.demo index 46c48492..ceb94a1a 100644 --- a/examples/Dockerfile-file.demo +++ b/examples/Dockerfile-file.demo @@ -16,5 +16,5 @@ RUN git fetch --tags && \ RUN autoreconf -fiv RUN ./configure --prefix=/polytracker/the_klondike/bin/ --disable-shared RUN polytracker build make -j$((`nproc`+1)) install -RUN polytracker instrument-targets --taint --ftrace file --ignore-lists libz +RUN polytracker instrument-targets --taint --cflog file --ignore-lists libz RUN mv file.instrumented file_track diff --git 
a/examples/Dockerfile-jq.demo b/examples/Dockerfile-jq.demo index 54b68464..27dabc9a 100644 --- a/examples/Dockerfile-jq.demo +++ b/examples/Dockerfile-jq.demo @@ -11,4 +11,4 @@ WORKDIR /polytracker/the_klondike/jq RUN autoreconf -fi RUN ./configure --with-oniguruma=builtin CC=clang RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace jq \ No newline at end of file +RUN polytracker instrument-targets --taint --cflog jq \ No newline at end of file diff --git a/examples/Dockerfile-libjpeg.demo b/examples/Dockerfile-libjpeg.demo index 39fcc192..80809ce2 100644 --- a/examples/Dockerfile-libjpeg.demo +++ b/examples/Dockerfile-libjpeg.demo @@ -18,6 +18,6 @@ WORKDIR /polytracker/the_klondike/jpeg-9e/build RUN ../configure LDFLAGS="-static" # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace djpeg +RUN polytracker instrument-targets --taint --cflog djpeg # Create `djpeg_track` RUN mv djpeg.instrumented djpeg_track \ No newline at end of file diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index e3c2da1c..12945516 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -4,7 +4,7 @@ RUN mkdir -p /polytracker/the_klondike WORKDIR /polytracker/the_klondike -ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y upgrade && apt-get install -y git pkg-config RUN git clone --recursive --branch 1.20.0 git://git.ghostscript.com/mupdf.git @@ -22,7 +22,7 @@ WORKDIR /polytracker/the_klondike/mupdf RUN git checkout d00de0e96a4a5ec90ffc30837d40cd624a6a89e0 # Instrument mutool RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr/local build=release install -RUN polytracker instrument-targets --taint --ftrace mutool +RUN polytracker instrument-targets --taint --cflog mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is 
intended to be mounted at runtime VOLUME ["/workdir"] diff --git a/examples/Dockerfile-nitro-nitf.demo b/examples/Dockerfile-nitro-nitf.demo index 3a84b0d0..e63c4201 100644 --- a/examples/Dockerfile-nitro-nitf.demo +++ b/examples/Dockerfile-nitro-nitf.demo @@ -18,9 +18,6 @@ RUN polytracker build cmake .. \ RUN polytracker build cmake --build . -j$((`nproc`+1)) --target show_nitf++ -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - show_nitf++ +RUN polytracker instrument-targets --taint --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_track diff --git a/examples/Dockerfile-openjpeg.demo b/examples/Dockerfile-openjpeg.demo index daa7be32..7d758d7b 100644 --- a/examples/Dockerfile-openjpeg.demo +++ b/examples/Dockerfile-openjpeg.demo @@ -20,5 +20,5 @@ RUN polytracker extract-bc bin/opj_decompress -o opj_decompress.bc RUN polytracker extract-bc bin/libopenjp2.a -o libopenjp2.a.bc RUN llvm-link -only-needed opj_decompress.bc libopenjp2.a.bc -o exec.bc RUN polytracker opt-bc exec.bc -o exec.bc -RUN polytracker instrument-bc --taint --ftrace exec.bc -o exec.bc -o exec.instrumented.bc +RUN polytracker instrument-bc --taint --cflog exec.bc -o exec.bc -o exec.instrumented.bc RUN polytracker lower-bc exec.instrumented.bc -t opj_decompress -o opj_decompress_track diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index fd58f962..b67d50ff 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -44,7 +44,7 @@ RUN polytracker build cmake -S . 
-B build \ RUN polytracker build cmake --build build -j$(nproc) # pdftotext (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --ftrace pdftotext --ignore-lists freetype fontconfig +RUN polytracker instrument-targets --taint --cflog pdftotext --ignore-lists freetype fontconfig # pdftops (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --ftrace pdftops --ignore-lists freetype fontconfig \ No newline at end of file +RUN polytracker instrument-targets --taint --cflog pdftops --ignore-lists freetype fontconfig \ No newline at end of file diff --git a/examples/Dockerfile-qpdf.demo b/examples/Dockerfile-qpdf.demo index 7a98a612..9360d7f2 100644 --- a/examples/Dockerfile-qpdf.demo +++ b/examples/Dockerfile-qpdf.demo @@ -2,7 +2,7 @@ FROM ubuntu:focal AS qpdf-sources WORKDIR /polytracker/the_klondike ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y git wget +RUN apt-get update && apt-get install -y git wget RUN git clone --depth=1 --branch 11.5 https://github.com/qpdf/qpdf.git RUN wget https://www.ijg.org/files/jpegsrc.v9e.tar.gz && tar xf jpegsrc.v9e.tar.gz @@ -23,5 +23,5 @@ WORKDIR /polytracker/the_klondike/qpdf RUN polytracker build cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBUILD_STATIC_LIBS=ON #Instrument and build track target RUN polytracker build cmake --build build -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace qpdf --ignore-lists libz +RUN polytracker instrument-targets --taint --cflog qpdf --ignore-lists libz RUN mv qpdf.instrumented qpdf_track diff --git a/examples/Dockerfile-xpdf.demo b/examples/Dockerfile-xpdf.demo index b6b440a4..95ab1972 100644 --- a/examples/Dockerfile-xpdf.demo +++ b/examples/Dockerfile-xpdf.demo @@ -91,8 +91,8 @@ RUN polytracker build make -j$(nproc) install # pdftops.instrumented, pdftotext.instrumented, and pdfinfo.instrumented # These commands are split up for timing / debugging purposes but you could # run them all as one big instrument-targets as well. -RUN polytracker instrument-targets --taint --ftrace pdftotext --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --taint --cflog pdftotext --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --ftrace pdfinfo --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --taint --cflog pdfinfo --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --ftrace pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file +RUN polytracker instrument-targets --taint --cflog pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file diff --git a/examples/analysis/ubet/Dockerfile.nitro b/examples/analysis/ubet/Dockerfile.nitro index 3549b341..93ef1436 100644 --- a/examples/analysis/ubet/Dockerfile.nitro +++ b/examples/analysis/ubet/Dockerfile.nitro @@ -30,11 +30,7 @@ RUN polytracker build cmake --build . 
-j$((`nproc`+1)) --target show_nitf++ --co RUN cp modules/c++/nitf/show_nitf++ nitro_Release -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - --cflog \ - show_nitf++ +RUN polytracker instrument-targets --taint --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackRelease @@ -49,11 +45,7 @@ RUN polytracker build cmake ../.. \ RUN polytracker build cmake --build . -j$((`nproc`+1)) --clean-first --target show_nitf++ --config Debug RUN cp modules/c++/nitf/show_nitf++ nitro_Debug -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - --cflog \ - show_nitf++ +RUN polytracker instrument-targets --taint --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackDebug diff --git a/examples/analysis/ubet/build_nitro.sh b/examples/analysis/ubet/build_nitro.sh index d9a13a2a..2b53039e 100755 --- a/examples/analysis/ubet/build_nitro.sh +++ b/examples/analysis/ubet/build_nitro.sh @@ -14,7 +14,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP echo "Optmize bitcode" polytracker opt-bc --output O3.bc after_preoptO3.bc echo "Instrument optimized bitcode" -polytracker instrument-bc --ftrace --taint --output instrumentedO3.bc O3.bc +polytracker instrument-bc --cflog --taint --output instrumentedO3.bc O3.bc echo "Lower optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackRelease instrumentedO3.bc @@ -36,7 +36,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP cp after_preoptO0.bc O0.bc echo "Instrument non-optimized bitcode" -polytracker instrument-bc --ftrace --taint --output instrumentedO0.bc O0.bc +polytracker instrument-bc --cflog --taint --output instrumentedO0.bc O0.bc echo "Lower non-optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackDebug instrumentedO0.bc diff --git a/examples/http/httpd/Dockerfile b/examples/http/httpd/Dockerfile index 8c250356..cdf34a9c 100644 --- a/examples/http/httpd/Dockerfile +++ b/examples/http/httpd/Dockerfile 
@@ -11,7 +11,7 @@ WORKDIR /polytracker/examples/http/httpd/httpd RUN mkdir -p srclib/apr srclib/apr-util srclib/pcre srclib/expat RUN curl https://archive.apache.org/dist/apr/apr-1.7.0.tar.gz -o apr-1.7.0.tar.gz \ && tar xfz apr-1.7.0.tar.gz -C srclib/apr --strip-components 1 \ - && rm apr-1.7.0.tar.gz + && rm apr-1.7.0.tar.gz RUN curl https://archive.apache.org/dist/apr/apr-util-1.6.1.tar.gz -o apr-util-1.6.1.tar.gz \ && tar xfz apr-util-1.6.1.tar.gz -C srclib/apr-util --strip-components 1 \ && rm apr-util-1.6.1.tar.gz @@ -33,19 +33,19 @@ RUN polytracker build make # apr, apr-util are configured via httpd's configure script WORKDIR /polytracker/examples/http/httpd/httpd -RUN polytracker build ./buildconf +RUN polytracker build ./buildconf RUN CFLAGS="-I$(pwd)/srclib/pcre -I$(pwd)/srclib/expat/lib" \ LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ polytracker build ./configure --disable-shared --with-mpm=prefork --with-pcre=srclib/pcre/pcre-config --with-included-apr \ --enable-mods-static='authz_core unixd' RUN CFLAGS="-I$(pwd)/srclib/pcre -I$(pwd)/srclib/expat/lib" \ - LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ + LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace httpd +RUN polytracker instrument-targets --taint --cflog httpd RUN mv httpd.instrumented httpd_track -# overwrite binary to be installed with our polytracker-instrumented version -RUN cp httpd_track httpd +# overwrite binary to be installed with our polytracker-instrumented version +RUN cp httpd_track httpd RUN polytracker build make install COPY harness_httpd.sh /polytracker/examples/http/httpd/ diff --git a/examples/http/picohttpparser/Dockerfile b/examples/http/picohttpparser/Dockerfile index 89cb0fe0..f1d96b99 100644 --- a/examples/http/picohttpparser/Dockerfile +++ b/examples/http/picohttpparser/Dockerfile @@ -10,7 +10,7 @@ COPY Makefile 
example_picohttpparser.c /polytracker/examples/http/picohttpparser # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace example_picohttpparser +RUN polytracker instrument-targets --taint --cflog example_picohttpparser RUN mv example_picohttpparser.instrumented example_picohttpparser_track # Note, the /workdir and /testcase directories are intended to be mounted at runtime diff --git a/tests/conftest.py b/tests/conftest.py index 8114bafc..8c1d09a2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,7 +35,7 @@ def build(target: Path, binary: Path) -> None: def instrument(target: str) -> None: - cmd = ["instrument-targets", "--taint", "--ftrace", "--cflog", target] + cmd = ["instrument-targets", "--taint", "--cflog", target] run_polytracker(cmd) From d48a4f352f7a0454b08ea17088bbf42fc71ace15 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Tue, 3 Dec 2024 15:56:28 +0000 Subject: [PATCH 004/112] raise the ValueError without entering the target iteration loop for the controller --- polytracker/build.py | 88 +++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/polytracker/build.py b/polytracker/build.py index 3a81f6d8..0b4b0fb7 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -147,10 +147,17 @@ def _optimize_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: subprocess.check_call(cmd) -def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: +def _preopt_instrument_bitcode( + input_bitcode: Path, + output_bitcode: Path, + ignore_lists: List[str]) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" ) + POLY_ABI_LIST_PATH: Path = _ensure_path_exists( + _compiler_dir_path() / "abi_lists" / "polytracker_abilist.txt" + ) + ABI_PATH: Path = _ensure_path_exists(_compiler_dir_path() / "abi_lists") cmd = [ "opt", @@ -163,6 +170,13 @@ def 
_preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> Non "-o", str(output_bitcode), ] + + if ignore_lists and len(ignore_lists) > 0: + # ignore lists for `pt-tcf` (function tracing for control flow logging) + cmd.append(f"-pt-ftrace-ignore-list={POLY_ABI_LIST_PATH}") + for item in ignore_lists: + cmd.append(f"-pt-ftrace-ignore-list={ABI_PATH}/{item}") + # execute `cmd` subprocess.check_call(cmd) @@ -170,9 +184,7 @@ def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> Non def _instrument_bitcode( input_bitcode: Path, output_bitcode: Path, - ignore_lists: List[str], - add_taint_tracking: bool, - add_function_tracing: bool, + ignore_lists: List[str] ) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" @@ -194,34 +206,22 @@ def _instrument_bitcode( ] pass_pipeline: List[str] = [] - if add_taint_tracking: - pass_pipeline.append("pt-taint") + pass_pipeline.append("pt-taint") - if add_function_tracing: - pass_pipeline.append("pt-tcf") - - if add_taint_tracking: - pass_pipeline += ["pt-dfsan", "pt-rm-fn-attr"] + pass_pipeline += ["pt-dfsan", "pt-rm-fn-attr"] cmd.append(f"-passes={','.join(pass_pipeline)}") - if add_taint_tracking: - # ignore lists for `pt-taint` - cmd.append( - f"-pt-taint-ignore-list={POLY_ABI_LIST_PATH}", - ) - for item in ignore_lists: - cmd.append(f"-pt-taint-ignore-list={ABI_PATH}/{item}") - # abi lists for `dfsan` - cmd.append(f"-pt-dfsan-abilist={DFSAN_ABI_LIST_PATH}") - for item in ignore_lists: - cmd.append(f"-pt-dfsan-abilist={ABI_PATH}/{item}") - - if add_function_tracing: - # ignore lists for `pt-tcf` (function tracing for control flow logging) - cmd.append(f"-pt-ftrace-ignore-list={POLY_ABI_LIST_PATH}") - for item in ignore_lists: - cmd.append(f"-pt-ftrace-ignore-list={ABI_PATH}/{item}") + # ignore lists for `pt-taint` + cmd.append( + f"-pt-taint-ignore-list={POLY_ABI_LIST_PATH}", + ) + for item in ignore_lists: + 
cmd.append(f"-pt-taint-ignore-list={ABI_PATH}/{item}") + # abi lists for `dfsan` + cmd.append(f"-pt-dfsan-abilist={DFSAN_ABI_LIST_PATH}") + for item in ignore_lists: + cmd.append(f"-pt-dfsan-abilist={ABI_PATH}/{item}") # input and output files cmd += [str(input_bitcode), "-o", str(output_bitcode)] @@ -330,12 +330,13 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): ) def run(self, args: argparse.Namespace): - _instrument_bitcode( - args.input, - args.output, - args.ignore_lists, - args.taint, - ) + if args.taint: + _instrument_bitcode( + args.input, + args.output, + args.ignore_lists) + else: + raise ValueError("No action was specified. Try using the argument --taint?") class LowerBitcode(Command): @@ -412,6 +413,9 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): ) def run(self, args: argparse.Namespace): + if not args.taint and not args.cflog: + raise ValueError("Did you specify an action? Try --taint or --cflog") + for target in args.targets: blight_cmds = _read_blight_journal(args.journal_path) target_cmd, target_path = _find_target(target, blight_cmds) @@ -420,14 +424,16 @@ def run(self, args: argparse.Namespace): _extract_bitcode(target_path, bc_path) if args.cflog: # Control affecting data flow logging happens before optimization - _preopt_instrument_bitcode(bc_path, bc_path) + _preopt_instrument_bitcode( + input_bitcode=bc_path, + output_bitcode=bc_path, + ignore_lists=args.ignore_lists) _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") - _instrument_bitcode( - opt_bc, - inst_bc_path, - args.ignore_lists, - args.taint, - ) + if args.taint: + _instrument_bitcode( + input_bitcode=opt_bc, + output_bitcode=inst_bc_path, + ignore_lists=args.ignore_lists) _lower_bitcode(inst_bc_path, Path(inst_bc_path.stem), target_cmd) From 7fa6086f9c7fe6574b4c4e20ff82d108ee02703c Mon Sep 17 00:00:00 2001 From: kaoudis Date: Tue, 3 Dec 2024 17:37:19 +0000 Subject: [PATCH 005/112] start setting up tests for 
how I would like the fn section to work --- docs/tdag.md | 1 - polytracker/taint_dag.py | 55 ++++++++++------------------------------ tests/test_cf_log.py | 28 ++++++++++++++++---- tests/test_fntrace.py | 25 ------------------ 4 files changed, 36 insertions(+), 73 deletions(-) delete mode 100644 tests/test_fntrace.py diff --git a/docs/tdag.md b/docs/tdag.md index 993f89fa..6ed22ab7 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -39,7 +39,6 @@ Some specifics: - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) - [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table is used in conjunction with the fnmapping to put together an earlier version of the control flow log used for grammar extraction - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this contains an early version of the function list part of the control flow log used for grammar extraction -- [Events](../polytracker/include/taintdag/fntrace.h) todo(kaoudis) this contains an early version of the entry and exit events used to structure the control flow log - [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. ## TDAG Contents diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 744e5761..18f318c4 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -129,17 +129,20 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) +class Event: + callstack: List = None + label: int = None -class TDEnterFunctionEvent: + def __init__(self, callstack): + """Callstack at the point the event occurred""" + self.callstack = callstack + +class TDEnterFunctionEvent(Event): """Emitted whenever execution enters a function. 
The callstack member is the callstack right before entering the function, having the function just entered as the last member of the callstack. """ - def __init__(self, callstack): - """Callstack after entering function""" - self.callstack = callstack - def __repr__(self) -> str: return f"Enter: {self.callstack}" @@ -149,16 +152,12 @@ def __eq__(self, __o: object) -> bool: return False -class TDLeaveFunctionEvent: +class TDLeaveFunctionEvent(Event): """Emitted whenever execution leaves a function. The callstack member is the callstack right before leaving the function, having the function about to leave as the last member of the callstack. """ - def __init__(self, callstack): - """Callstack before leaving function""" - self.callstack = callstack - def __repr__(self) -> str: return f"Leave: {self.callstack}" @@ -168,7 +167,7 @@ def __eq__(self, __o: object) -> bool: return False -class TDTaintedControlFlowEvent: +class TDTaintedControlFlowEvent(Event): """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. 
Current callstack (including the function the control flow happened in) is available @@ -326,15 +325,6 @@ def __iter__(self): yield TDFnHeader.from_buffer_copy(self.section, offset) -class TDEventsSection: - def __init__(self, mem, hdr): - self.section = mem[hdr.offset : hdr.offset + hdr.size] - - def __iter__(self): - for offset in range(0, len(self.section), sizeof(TDEvent)): - yield TDEvent.from_buffer_copy(self.section, offset) - - class TDFDHeader(Structure): """Python representation of the SourceEntry from taint_source.h""" @@ -413,16 +403,6 @@ def __repr__(self) -> str: return f"TDSink fdidx: {self.fdidx} offset: {self.offset} label: {self.label}" -class TDEvent(Structure): - _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] - - class Kind(Enum): - ENTRY = 0 - EXIT = 1 - - def __repr__(self) -> str: - return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" - TDSection = Union[ TDLabelSection, @@ -431,7 +411,6 @@ def __repr__(self) -> str: TDSinkSection, TDSourceIndexSection, TDFunctionsSection, - TDEventsSection, TDControlFlowLogSection, ] @@ -475,8 +454,9 @@ def __init__(self, file: BinaryIO) -> None: self.sections.append(TDFunctionsSection(self.buffer, hdr)) self.sections_by_type[TDFunctionsSection] = self.sections[-1] elif hdr.tag == 7: - self.sections.append(TDEventsSection(self.buffer, hdr)) - self.sections_by_type[TDEventsSection] = self.sections[-1] + continue + # self.sections.append(TDEventsSection(self.buffer, hdr)) + # self.sections_by_type[TDEventsSection] = self.sections[-1] elif hdr.tag == 8: self.sections.append(TDControlFlowLogSection(self.buffer, hdr)) self.sections_by_type[TDControlFlowLogSection] = self.sections[-1] @@ -569,15 +549,6 @@ def sinks(self) -> Iterator[TDSink]: assert isinstance(sink_section, TDSinkSection) yield from sink_section.enumerate() - def read_event(self, offset: int) -> TDEvent: - return TDEvent.from_buffer_copy(self.buffer, offset) - - @property - def events(self) -> Iterator[TDEvent]: - events_section = 
self.sections_by_type[TDEventsSection] - assert isinstance(events_section, TDEventsSection) - yield from events_section - class TDTaintOutput(TaintOutput): def __init__(self, source: Input, output_offset: int, label: int): diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 5316fac7..5ecc4aba 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -1,5 +1,4 @@ import cxxfilt -import json import pytest import subprocess @@ -7,11 +6,26 @@ from pathlib import Path from polytracker.taint_dag import ( + Event, TDEnterFunctionEvent, TDLeaveFunctionEvent, TDTaintedControlFlowEvent, + TDProgramTrace ) +from polytracker import ProgramTrace +@pytest.mark.program_trace("test_fntrace.cpp") +def test_cf_log_fn_trace(program_trace: ProgramTrace): + assert isinstance(program_trace, TDProgramTrace) + + functions = list(program_trace.tdfile.fn_headers) + names = set(map(lambda f: f[0], functions)) + # we store the names in llvm mangled fashion but + assert names == set(["main", "_Z9factoriali"]) + + # you can easily unmangle them for readability! 
+ functionid_mapping = list(map(cxxfilt.demangle, functions)) + assert functionid_mapping == set(["main", "factorial(int)"]) @pytest.mark.program_trace("test_cf_log.cpp") def test_cf_log(instrumented_binary: Path, trace_file: Path): @@ -34,11 +48,11 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): polytracker.taint_dag.TDControlFlowLogSection ) - # The functionid mapping is available next to the built binary - with open(instrumented_binary.parent / "functionid.json", "rb") as f: - functionid_mapping = list(map(cxxfilt.demangle, json.load(f))) + functions = program_trace.tdfile.fn_headers + + functionid_mapping = list(map(cxxfilt.demangle, functions)) - # Apply the id to function mappign + # Apply the id to function mapping cflog.function_id_mapping(functionid_mapping) expected_seq = [ @@ -66,4 +80,8 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): # NOTE(hbrodin): Could have done assert list(cflog) == expected_seq, but this provides the failed element for got, expected in zip(cflog, expected_seq): + assert type(got) == Event assert got == expected + if type(got) == TDTaintedControlFlowEvent: + # inheritance should make this work? 
+ assert got.label is not None diff --git a/tests/test_fntrace.py b/tests/test_fntrace.py deleted file mode 100644 index 71f7c47d..00000000 --- a/tests/test_fntrace.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from collections import defaultdict -from typing import Dict - -from polytracker import taint_dag, ProgramTrace - - -@pytest.mark.program_trace("test_fntrace.cpp") -def test_fn_headers(program_trace: ProgramTrace): - assert isinstance(program_trace, taint_dag.TDProgramTrace) - functions = list(program_trace.tdfile.fn_headers) - names = set(map(lambda f: f[0], functions)) - assert names == set(["main", "_Z9factoriali"]) - - -@pytest.mark.program_trace("test_fntrace.cpp") -def test_fntrace(program_trace: ProgramTrace): - assert isinstance(program_trace, taint_dag.TDProgramTrace) - events = list(program_trace.tdfile.events) - assert len(events) == 10 - kinds: Dict[taint_dag.TDEvent.Kind, int] = defaultdict(int) - for e in events: - kinds[e.kind] += 1 - assert kinds[taint_dag.TDEvent.Kind.ENTRY] == kinds[taint_dag.TDEvent.Kind.EXIT] From b75f4b194033a970224c2a13824e9821fa002703 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 22:50:30 +0000 Subject: [PATCH 006/112] clean up an extraneous test comment; amend doc --- docs/tdag.md | 2 +- tests/test_cf_log.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tdag.md b/docs/tdag.md index 6ed22ab7..8fbbb4ca 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -37,7 +37,7 @@ Some specifics: - [Sources](../polytracker/src/taint_sources/taint_sources.cpp) contains source labels (byte offsets into the input) - The Source Label Index is a bitmap that defines how to index the sources section. 
- [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) -- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table is used in conjunction with the fnmapping to put together an earlier version of the control flow log used for grammar extraction +- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table is used in conjunction with the fnmapping to put together an earlier version of the control flow log used for grammar extraction. Note that the string table also contains other things! - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this contains an early version of the function list part of the control flow log used for grammar extraction - [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 5ecc4aba..9ded510b 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -78,7 +78,8 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): TDLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit ] - # NOTE(hbrodin): Could have done assert list(cflog) == expected_seq, but this provides the failed element + assert len(got) > 0 + for got, expected in zip(cflog, expected_seq): assert type(got) == Event assert got == expected From 7fd92f3cb45e7ad853632df684889c81d108e285 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 22:52:20 +0000 Subject: [PATCH 007/112] alphabetic order is a bit easier to read unless there's something going on here with dep ordering --- polytracker/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/src/CMakeLists.txt b/polytracker/src/CMakeLists.txt index 2950ea1f..29d01c50 100644 --- a/polytracker/src/CMakeLists.txt +++ 
b/polytracker/src/CMakeLists.txt @@ -39,9 +39,9 @@ set(TAINTDAG_DIR taintdag) set(TAINTDAG_SOURCES ${TAINTDAG_DIR}/encoding.cpp ${TAINTDAG_DIR}/error.cpp + ${TAINTDAG_DIR}/fnmapping.cpp ${TAINTDAG_DIR}/polytracker.cpp ${TAINTDAG_DIR}/print.cpp - ${TAINTDAG_DIR}/fnmapping.cpp ${TAINTDAG_DIR}/util.cpp) add_library(Polytracker STATIC ${POLYTRACKER_SOURCES} ${TAINT_SOURCES} From 6cdb85474a745daa01f9b441885bc20981d0aabd Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 22:54:34 +0000 Subject: [PATCH 008/112] remove unit test whose functionality was partly deleted / partly migrated into elsewhere --- unittests/src/taintdag/fntrace.cpp | 31 ------------------------------ 1 file changed, 31 deletions(-) delete mode 100644 unittests/src/taintdag/fntrace.cpp diff --git a/unittests/src/taintdag/fntrace.cpp b/unittests/src/taintdag/fntrace.cpp deleted file mode 100644 index 332b6810..00000000 --- a/unittests/src/taintdag/fntrace.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. 
- */ - -// #include "taintdag/tainted_control_flow.h" - -// #include - -// TEST_CASE("Test fntrace operations") { -// namespace td = taintdag; -// SECTION("Log unique events") { -// td::OutputFile of{std::mkstemp(nullptr)}; -// auto &events{of.section()}; -// td::Functions::index_t fnidx{0}; -// events.log_fn_event(td::Event::kind_t::entry, fnidx); -// events.log_fn_event(td::Event::kind_t::exit, fnidx); -// SECTION("Events are successfully written") { -// REQUIRE(events.count() == 2); -// td::Event entry{*events.begin()}; -// REQUIRE(entry.kind == td::Event::kind_t::entry); -// REQUIRE(entry.function == fnidx); -// td::Event exit{*(events.begin() + 1)}; -// REQUIRE(exit.kind == td::Event::kind_t::exit); -// REQUIRE(exit.function == fnidx); -// } -// } -// } \ No newline at end of file From 83db76ec72b1b3bc6c4e9d83663399129e724969 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 22:55:31 +0000 Subject: [PATCH 009/112] remove deleted unit tests from CMake --- unittests/src/taintdag/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/unittests/src/taintdag/CMakeLists.txt b/unittests/src/taintdag/CMakeLists.txt index b620b84a..ec91e1c8 100644 --- a/unittests/src/taintdag/CMakeLists.txt +++ b/unittests/src/taintdag/CMakeLists.txt @@ -8,7 +8,6 @@ add_executable( bitmap_section.cpp encoding.cpp fnmapping.cpp - fntrace.cpp union.cpp labeldeq.cpp stream_offset.cpp From 285126df1a7eb15b7997491733c2b1140396fc9c Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 23:25:57 +0000 Subject: [PATCH 010/112] modify fnmapping so that the mapping is: cflog(function_id), fnmapping(function_id, offset), strings[offset] = function_name --- polytracker/include/taintdag/fnmapping.h | 4 ++-- polytracker/src/taintdag/fnmapping.cpp | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 1141be71..50f14b46 100644 --- 
a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -39,12 +39,12 @@ class Functions : public FixedSizeAlloc { : FixedSizeAlloc{of.range}, string_table{of.output_file.template section()} {} - std::optional add_mapping(std::string_view name); + std::optional add_mapping(uint32_t function_id, std::string_view function_name); private: StringTable &string_table; std::mutex mappings_mutex; - std::unordered_map mappings; + std::unordered_map mappings; }; } // namespace taintdag diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 8ce76a29..1deacaa6 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -19,16 +19,21 @@ using index_t = Functions::index_t; } // namespace -std::optional Functions::add_mapping(std::string_view name) { +// The goal here is to get to the following state: +// - the cflog section contains function ids +// - the functions section maps those function ids to the offsets of names in the strings table +// - the strings table contains names +// In this way, the functions section is a lookup layer for getting names (in their original, mangled format - you can demangle them later with cxxfilt in python) out of the strings table. 
+std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { // Lock `mappings` // std::cout << "BREAK 1" << std::endl; std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping of `name` - if (auto it{mappings.find(name)}; it != mappings.end()) { + // See if we already have a mapping of the function id + if (auto it{mappings.find(function_id)}; it != mappings.end()) { return it->second; } - // Write `name` into the string table section - auto maybe_name_offset{string_table.add_string(name)}; + // Write the function's mangled name into the string table section + auto maybe_name_offset{string_table.add_string(function_name)}; if (!maybe_name_offset) { return {}; } @@ -39,7 +44,7 @@ std::optional Functions::add_mapping(std::string_view name) { return {}; } // Return index of `Function` in `Functions` - return mappings[name] = index(maybe_ctx->t); + return mappings[function_id] = index(maybe_ctx->t); } } // namespace taintdag \ No newline at end of file From 9e74b5cf8f466404c49802f3dc2f16cedb48280d Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 23:36:18 +0000 Subject: [PATCH 011/112] update fnmapping unit test to involve function id as well --- unittests/src/taintdag/fnmapping.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/unittests/src/taintdag/fnmapping.cpp b/unittests/src/taintdag/fnmapping.cpp index 5352da2b..2fabac9f 100644 --- a/unittests/src/taintdag/fnmapping.cpp +++ b/unittests/src/taintdag/fnmapping.cpp @@ -15,26 +15,26 @@ TEST_CASE("Test fnmapping operations") { SECTION("Add unique functions, functions are successfully inserted") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - REQUIRE(functions.add_mapping("foo")); - REQUIRE(functions.add_mapping("bar")); - REQUIRE(functions.add_mapping("baz")); + REQUIRE(functions.add_mapping(4, "foo")); + REQUIRE(functions.add_mapping(55, "bar")); + 
REQUIRE(functions.add_mapping(1, "baz")); } SECTION("Add unique functions, functions have successive indices") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - REQUIRE(functions.add_mapping("foo").value_or(3) == 0); - REQUIRE(functions.add_mapping("bar").value_or(3) == 1); - REQUIRE(functions.add_mapping("baz").value_or(3) == 2); + REQUIRE(functions.add_mapping(4, "foo").value_or(3) == 0); + REQUIRE(functions.add_mapping(55, "bar").value_or(3) == 1); + REQUIRE(functions.add_mapping(1, "baz").value_or(3) == 2); } SECTION("Add duplicate functions, duplicate functions have the same index") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - auto foo_1{functions.add_mapping("foo").value_or(3)}; - functions.add_mapping("bar"); - functions.add_mapping("baz"); - auto foo_2{functions.add_mapping("foo").value_or(4)}; + auto foo_1{functions.add_mapping(4, "foo").value_or(3)}; + functions.add_mapping(55, "bar"); + functions.add_mapping(1, "baz"); + auto foo_2{functions.add_mapping(4, "foo").value_or(4)}; REQUIRE(foo_1 == foo_2); } } \ No newline at end of file From eff25e02d4a79d362dbcc73d827ef549192f241f Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 4 Dec 2024 23:54:37 +0000 Subject: [PATCH 012/112] first whack at recording function names to the cflog -> fnmapping -> string table combo --- polytracker/include/taintdag/control_flow_log.h | 1 + polytracker/include/taintdag/polytracker.h | 3 +++ polytracker/src/passes/tainted_control_flow.cpp | 4 ++-- polytracker/src/polytracker/polytracker.cpp | 3 ++- polytracker/src/taintdag/polytracker.cpp | 4 ++++ 5 files changed, 12 insertions(+), 3 deletions(-) diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h index 1bcff380..283a5910 100644 --- a/polytracker/include/taintdag/control_flow_log.h +++ b/polytracker/include/taintdag/control_flow_log.h @@ -59,6 +59,7 @@ struct ControlFlowLog : public SectionBase { " bytes of 
output to the ControlFlowLog Section."); } } + void enter_function(uint32_t function_id) { function_event(EnterFunction, function_id); } diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 6c27b69e..ccd784d3 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -61,6 +61,9 @@ class PolyTracker { // Instrumentation callback for when execution enters a function void enter_function(uint32_t function_id); + // Log function name mapping when execution enters a function + void record_function_name(uint32_t function_id, std::string_view function_name); + // Instrumentation callback for when execution leaves a function void leave_function(uint32_t function_id); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 8b82d113..b6ef4e0d 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -166,7 +166,7 @@ void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", - ir.getVoidTy(), ir.getInt32Ty()); + ir.getVoidTy(), ir.getInt32Ty(), ir.getStringTy()); fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); @@ -177,7 +177,7 @@ void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { return; } llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - ir.CreateCall(fn_enter_log_fn, get_function_id_const(func)); + ir.CreateCall(fn_enter_log_fn, get_function_id_const(func), func.getName()); } void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 587f9d6e..189015b3 100644 --- a/polytracker/src/polytracker/polytracker.cpp 
+++ b/polytracker/src/polytracker/polytracker.cpp @@ -71,11 +71,12 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id) { +extern "C" void __polytracker_enter_function(uint32_t function_id, std::string_view function_name) { if (!polytracker_is_initialized()) { return; } get_polytracker_tdag().enter_function(function_id); + get_polytracker_tdag().record_function_name(function_id, function_name); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 6f48a34e..0a082d2c 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -182,6 +182,10 @@ void PolyTracker::enter_function(uint32_t function_id) { output_file_.section().enter_function(function_id); } +void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { + output_file_.section().add_mapping(function_id, function_name); +} + void PolyTracker::leave_function(uint32_t function_id) { output_file_.section().leave_function(function_id); } From fd0b4c329418db0437306089742363a740a0a9e5 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 5 Dec 2024 16:06:18 +0000 Subject: [PATCH 013/112] Revert "first whack at recording function names to the cflog -> fnmapping -> string table combo" This reverts commit 15c596cb61e332cc8495e731e897942fa080906d. 
--- polytracker/include/taintdag/control_flow_log.h | 1 - polytracker/include/taintdag/polytracker.h | 3 --- polytracker/src/passes/tainted_control_flow.cpp | 4 ++-- polytracker/src/polytracker/polytracker.cpp | 3 +-- polytracker/src/taintdag/polytracker.cpp | 4 ---- 5 files changed, 3 insertions(+), 12 deletions(-) diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h index 283a5910..1bcff380 100644 --- a/polytracker/include/taintdag/control_flow_log.h +++ b/polytracker/include/taintdag/control_flow_log.h @@ -59,7 +59,6 @@ struct ControlFlowLog : public SectionBase { " bytes of output to the ControlFlowLog Section."); } } - void enter_function(uint32_t function_id) { function_event(EnterFunction, function_id); } diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index ccd784d3..6c27b69e 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -61,9 +61,6 @@ class PolyTracker { // Instrumentation callback for when execution enters a function void enter_function(uint32_t function_id); - // Log function name mapping when execution enters a function - void record_function_name(uint32_t function_id, std::string_view function_name); - // Instrumentation callback for when execution leaves a function void leave_function(uint32_t function_id); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index b6ef4e0d..8b82d113 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -166,7 +166,7 @@ void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", - ir.getVoidTy(), ir.getInt32Ty(), ir.getStringTy()); + ir.getVoidTy(), ir.getInt32Ty()); fn_leave_log_fn = 
mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); @@ -177,7 +177,7 @@ void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { return; } llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - ir.CreateCall(fn_enter_log_fn, get_function_id_const(func), func.getName()); + ir.CreateCall(fn_enter_log_fn, get_function_id_const(func)); } void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 189015b3..587f9d6e 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -71,12 +71,11 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id, std::string_view function_name) { +extern "C" void __polytracker_enter_function(uint32_t function_id) { if (!polytracker_is_initialized()) { return; } get_polytracker_tdag().enter_function(function_id); - get_polytracker_tdag().record_function_name(function_id, function_name); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 0a082d2c..6f48a34e 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -182,10 +182,6 @@ void PolyTracker::enter_function(uint32_t function_id) { output_file_.section().enter_function(function_id); } -void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { - output_file_.section().add_mapping(function_id, function_name); -} - void PolyTracker::leave_function(uint32_t function_id) { output_file_.section().leave_function(function_id); } From 9420980890607d0071ab44a4171ec84959376469 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 6 Dec 2024 20:17:57 +0000 Subject: [PATCH 
014/112] record function names from the cflog pass through the instrumentation (leaving the JSON code in place in parallel for now since this is a breaking change) --- .../polytracker/passes/tainted_control_flow.h | 1 + polytracker/include/taintdag/polytracker.h | 3 +++ .../src/passes/tainted_control_flow.cpp | 20 +++++++++++++------ polytracker/src/polytracker/polytracker.cpp | 3 ++- polytracker/src/taintdag/polytracker.cpp | 4 ++++ 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index b9d22f6a..e4d3d033 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -28,6 +28,7 @@ class TaintedControlFlowPass llvm::FunctionCallee cond_br_log_fn; // Log enter/leave functions llvm::FunctionCallee fn_enter_log_fn; + llvm::FunctionType *enter_log_fn_type; llvm::FunctionCallee fn_leave_log_fn; // Helpers diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 6c27b69e..a1afd679 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -64,6 +64,9 @@ class PolyTracker { // Instrumentation callback for when execution leaves a function void leave_function(uint32_t function_id); + // Log function name + void record_function_name(uint32_t function_id, std::string_view function_name); + // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); // Same as before, but use same label for all data diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 8b82d113..2f20ea9f 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -155,7 +155,9 @@ void 
TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::IRBuilder<> ir(mod.getContext()); + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); + cond_br_log_fn = mod.getOrInsertFunction( "__polytracker_log_tainted_control_flow", llvm::AttributeList::get( @@ -165,11 +167,11 @@ void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { llvm::Attribute::ReadNone)}}), ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); - fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", - ir.getVoidTy(), ir.getInt32Ty()); + enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); + + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); - fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", - ir.getVoidTy(), ir.getInt32Ty()); + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); } void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { @@ -177,7 +179,13 @@ void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { return; } llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - ir.CreateCall(fn_enter_log_fn, get_function_id_const(func)); + + ir.CreateCall(fn_enter_log_fn, + { + get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName()) + } + ); } void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 587f9d6e..bf1912b9 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -71,11 +71,12 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void 
__polytracker_enter_function(uint32_t function_id) { +extern "C" void __polytracker_enter_function(uint32_t function_id, const char* function_name) { if (!polytracker_is_initialized()) { return; } get_polytracker_tdag().enter_function(function_id); + get_polytracker_tdag().record_function_name(function_id, std::string_view(function_name)); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 6f48a34e..ff540f9e 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -178,6 +178,10 @@ void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { output_file_.section().tainted_control_flow(lbl, function_id); } +void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { + output_file_.section().add_mapping(function_id, function_name); +} + void PolyTracker::enter_function(uint32_t function_id) { output_file_.section().enter_function(function_id); } From 5ccf93393310df0cc0ddc7ede88461441c88e796 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 6 Dec 2024 20:18:32 +0000 Subject: [PATCH 015/112] write the function ID as well as the string table offset of the mangled function symbol into the functions section --- polytracker/src/taintdag/fnmapping.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 1deacaa6..026e12fb 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -16,6 +16,7 @@ namespace taintdag { namespace { using index_t = Functions::index_t; +using offset_t = Function::offset_t; } // namespace @@ -26,7 +27,6 @@ using index_t = Functions::index_t; // In this way, the functions section is a lookup layer for getting names (in their original, mangled format - you can demangle them later 
with cxxfilt in python) out of the strings table. std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { // Lock `mappings` - // std::cout << "BREAK 1" << std::endl; std::unique_lock mappings_lock(mappings_mutex); // See if we already have a mapping of the function id if (auto it{mappings.find(function_id)}; it != mappings.end()) { @@ -37,7 +37,13 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ if (!maybe_name_offset) { return {}; } - // Write a `Function` via `construct` + // Now write the function ID into the functions section + auto maybe_fn_id_ctx{construct((offset_t) function_id)}; + if (!maybe_fn_id_ctx) { + return {}; + } + + // Finally, write the offset in the string table of the function name into the functions section auto name_offset{*maybe_name_offset}; auto maybe_ctx{construct(name_offset)}; if (!maybe_ctx) { From 6ffa345b0dbeb13dd5d787f31ec2bf884df2d860 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 6 Dec 2024 20:34:45 +0000 Subject: [PATCH 016/112] temporarily test just that the functions were written correctly and could be read --- tests/test_cf_log.py | 76 +++++--------------------------------------- 1 file changed, 8 insertions(+), 68 deletions(-) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 9ded510b..17a9f2fa 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -6,83 +6,23 @@ from pathlib import Path from polytracker.taint_dag import ( - Event, + TDEvent, + TDControlFlowLogSection, TDEnterFunctionEvent, TDLeaveFunctionEvent, TDTaintedControlFlowEvent, TDProgramTrace ) from polytracker import ProgramTrace +from typing import List @pytest.mark.program_trace("test_fntrace.cpp") def test_cf_log_fn_trace(program_trace: ProgramTrace): assert isinstance(program_trace, TDProgramTrace) - functions = list(program_trace.tdfile.fn_headers) - names = set(map(lambda f: f[0], functions)) - # we store the names in llvm mangled fashion but - assert names == 
set(["main", "_Z9factoriali"]) + # we store the names in llvm mangled fashion but... + assert program_trace.tdfile.fn_headers == ["main", "_Z9factoriali"] - # you can easily unmangle them for readability! - functionid_mapping = list(map(cxxfilt.demangle, functions)) - assert functionid_mapping == set(["main", "factorial(int)"]) - -@pytest.mark.program_trace("test_cf_log.cpp") -def test_cf_log(instrumented_binary: Path, trace_file: Path): - # Data to write to stdin, one byte at a time - stdin_data = "abcdefgh" - - subprocess.run( - [str(instrumented_binary)], - input=stdin_data.encode("utf-8"), - env={ - "POLYDB": str(trace_file), - "POLYTRACKER_STDIN_SOURCE": "1", - "POLYTRACKER_LOG_CONTROL_FLOW": "1", - }, - ) - - program_trace = polytracker.PolyTrackerTrace.load(trace_file) - - cflog = program_trace.tdfile._get_section( - polytracker.taint_dag.TDControlFlowLogSection - ) - - functions = program_trace.tdfile.fn_headers - - functionid_mapping = list(map(cxxfilt.demangle, functions)) - - # Apply the id to function mapping - cflog.function_id_mapping(functionid_mapping) - - expected_seq = [ - TDEnterFunctionEvent(["main"]), - TDTaintedControlFlowEvent(["main"], 1), - TDTaintedControlFlowEvent(["main"], 2), - TDTaintedControlFlowEvent(["main"], 3), - TDTaintedControlFlowEvent(["main"], 4), - TDTaintedControlFlowEvent(["main"], 5), - TDTaintedControlFlowEvent(["main"], 6), - TDTaintedControlFlowEvent(["main"], 7), - TDTaintedControlFlowEvent(["main"], 8), - TDTaintedControlFlowEvent(["main"], 15), - TDTaintedControlFlowEvent(["main"], 3), - TDEnterFunctionEvent(["main", "f1(unsigned char)"]), - TDTaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), - TDEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TDTaintedControlFlowEvent( - ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 - ), - TDLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TDLeaveFunctionEvent(["main", "f1(unsigned char)"]), - 
TDLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit - ] - - assert len(got) > 0 - - for got, expected in zip(cflog, expected_seq): - assert type(got) == Event - assert got == expected - if type(got) == TDTaintedControlFlowEvent: - # inheritance should make this work? - assert got.label is not None + # you can easily unmangle them for human readable stack traces! + functionid_mapping: List[str] = list(map(cxxfilt.demangle, program_trace.tdfile.fn_headers)) + assert functionid_mapping == ["main", "factorial(int)"] From 3c569ecaa7cad680b462a3520b8fbaa23c8977af Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 6 Dec 2024 22:12:27 +0000 Subject: [PATCH 017/112] test_polytracker seems to only cover PolyTrackerTrace, so rename it --- tests/{test_polytracker.py => test_program_trace.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/{test_polytracker.py => test_program_trace.py} (99%) diff --git a/tests/test_polytracker.py b/tests/test_program_trace.py similarity index 99% rename from tests/test_polytracker.py rename to tests/test_program_trace.py index 13bf6e29..b6d765da 100644 --- a/tests/test_polytracker.py +++ b/tests/test_program_trace.py @@ -182,7 +182,7 @@ def test_cxx_global_object(program_trace: ProgramTrace): assert taints[0].offset == 1 assert taints[0].length == 1 - +@pytest.mark.skip(reason="the Taint Forest is currently not implemented") @pytest.mark.program_trace("test_simple_union.cpp", input="ABCDEFGH\n11235878\n") def test_taint_forest(program_trace: ProgramTrace): had_taint_union = False From 52917c8880cfc8d65714d3266dd0273dba053c14 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 6 Dec 2024 22:39:10 +0000 Subject: [PATCH 018/112] test_stdin is hanging for some reason, maybe due to how I'm writing the strings table? 
--- polytracker/src/taintdag/fnmapping.cpp | 22 +++++++--------------- polytracker/taint_dag.py | 17 ++++++++++------- tests/test_cf_log.py | 2 +- tests/test_stdin.py | 4 +++- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 026e12fb..c924beea 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -16,34 +16,26 @@ namespace taintdag { namespace { using index_t = Functions::index_t; -using offset_t = Function::offset_t; } // namespace -// The goal here is to get to the following state: -// - the cflog section contains function ids -// - the functions section maps those function ids to the offsets of names in the strings table -// - the strings table contains names -// In this way, the functions section is a lookup layer for getting names (in their original, mangled format - you can demangle them later with cxxfilt in python) out of the strings table. +/* Maps to the function names recorded in the strings section, from +* the function IDs recorded in cflog entry callstacks. 
+*/ std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { // Lock `mappings` + // std::cout << "BREAK 1" << std::endl; std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping of the function id + // See if we already have a mapping of `name` if (auto it{mappings.find(function_id)}; it != mappings.end()) { return it->second; } - // Write the function's mangled name into the string table section + // Write `name` into the string table section auto maybe_name_offset{string_table.add_string(function_name)}; if (!maybe_name_offset) { return {}; } - // Now write the function ID into the functions section - auto maybe_fn_id_ctx{construct((offset_t) function_id)}; - if (!maybe_fn_id_ctx) { - return {}; - } - - // Finally, write the offset in the string table of the function name into the functions section + // Write a `Function` via `construct` auto name_offset{*maybe_name_offset}; auto maybe_ctx{construct(name_offset)}; if (!maybe_ctx) { diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 18f318c4..79f72075 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -433,7 +433,7 @@ def __init__(self, file: BinaryIO) -> None: section_offset = sizeof(TDFileMeta) self.sections: List[TDSection] = [] self.sections_by_type: Dict[Type[TDSection], TDSection] = {} - for i in range(0, self.filemeta.section_count): + for _ in range(0, self.filemeta.section_count): hdr = TDSectionMeta.from_buffer_copy(self.buffer, section_offset) if hdr.tag == 1: self.sections.append(TDSourceSection(self.buffer, hdr)) @@ -469,12 +469,13 @@ def __init__(self, file: BinaryIO) -> None: self.sink_cache: Dict[int, TDSink] = {} self.fd_headers: List[Tuple[Path, TDFDHeader]] = list(self.read_fd_headers()) - self.fn_headers: List[Tuple[str, TDFnHeader]] = list(self.read_fn_headers()) + self.fn_headers: List[str] = list(self.read_fn_headers()) def _get_section(self, wanted_type: Type[TDSection]) -> TDSection: 
return self.sections_by_type[wanted_type] def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: + print("hi from fd_headers") sources = self.sections_by_type[TDSourceSection] strings = self.sections_by_type[TDStringSection] assert isinstance(sources, TDSourceSection) @@ -484,7 +485,8 @@ def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: (Path(strings.read_string(x.name_offset)), x) for x in sources.enumerate() ) - def read_fn_headers(self) -> Iterator[Tuple[str, TDFnHeader]]: + def read_fn_headers(self) -> Iterator[str]: + print("hi from fn_headers") functions = self.sections_by_type[TDFunctionsSection] strings = self.sections_by_type[TDStringSection] assert isinstance(functions, TDFunctionsSection) @@ -492,7 +494,7 @@ def read_fn_headers(self) -> Iterator[Tuple[str, TDFnHeader]]: for header in functions: name = strings.read_string(header.name_offset) - yield name, header + yield name def input_labels(self) -> Iterator[int]: """Enumerates all taint labels that are input labels (source taint)""" @@ -560,7 +562,9 @@ def taints(self) -> Taints: class TDProgramTrace(ProgramTrace): def __init__(self, file: BinaryIO) -> None: + print("hi!") self.tdfile: TDFile = TDFile(file) + print("HEOOOOLLLOOOOOO") self.tforest: TDTaintForest = TDTaintForest(self) self._inputs = None @@ -841,9 +845,8 @@ def run(self, args): print(f"{i}: {path}") if args.print_fn_headers: - for i, h in enumerate(tdfile.fn_headers): - name = h[0] - print(f"{i}: {name}") + for i, function_name in enumerate(tdfile.fn_headers): + print(f"{i}: {function_name}") if args.print_taint_sinks: for s in tdfile.sinks: diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 17a9f2fa..6bbbc639 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -6,7 +6,7 @@ from pathlib import Path from polytracker.taint_dag import ( - TDEvent, + # TDEvent, TDControlFlowLogSection, TDEnterFunctionEvent, TDLeaveFunctionEvent, diff --git a/tests/test_stdin.py b/tests/test_stdin.py index 
ef6c2034..ab40faf4 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -13,14 +13,16 @@ def test_stdin_read(instrumented_binary: Path, trace_file: Path, method: str): # Data to write to stdin, one byte at a time stdin_data = "abcdefghi\njklmnopqr" - + print("hi") subprocess.run( [str(instrumented_binary), method], input=stdin_data.encode("utf-8"), env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": str(1)}, ).check_returncode() + print("hurro") program_trace = polytracker.PolyTrackerTrace.load(trace_file) + assert False == True # Ensure /dev/stdin is in the list of inputs assert "/dev/stdin" in [x.path for x in program_trace.inputs] From 7d5e49be6c542e5f8b38207d569b13e01684ce52 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Mon, 9 Dec 2024 20:45:34 +0000 Subject: [PATCH 019/112] sketch adding function_id in the right place; adds print statements that should be removed before merge --- polytracker/src/taintdag/fnmapping.cpp | 24 ++++++++++++++++-------- polytracker/taint_dag.py | 18 ++++++++++++------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index c924beea..fa7e4718 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -16,17 +16,19 @@ namespace taintdag { namespace { using index_t = Functions::index_t; +using offset_t = Function::offset_t; } // namespace /* Maps to the function names recorded in the strings section, from * the function IDs recorded in cflog entry callstacks. +* This section should look like this: +* |offset|id|offset|id|... 
*/ std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { // Lock `mappings` - // std::cout << "BREAK 1" << std::endl; std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping of `name` + // See if we already have a mapping if (auto it{mappings.find(function_id)}; it != mappings.end()) { return it->second; } @@ -35,14 +37,20 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ if (!maybe_name_offset) { return {}; } - // Write a `Function` via `construct` - auto name_offset{*maybe_name_offset}; - auto maybe_ctx{construct(name_offset)}; - if (!maybe_ctx) { + + offset_t name_offset{*maybe_name_offset}; + auto maybe_offset_ctx{construct(name_offset)}; + if (!maybe_offset_ctx) { return {}; } - // Return index of `Function` in `Functions` - return mappings[function_id] = index(maybe_ctx->t); + + auto maybe_fn_ctx{construct((offset_t)function_id)}; + if (!maybe_fn_ctx) { + return {}; + } + + // Keep the function_id to offset mapping so we can check for it later + return mappings[function_id] = index(maybe_offset_ctx->t); } } // namespace taintdag \ No newline at end of file diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 79f72075..e7710cb6 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -129,7 +129,7 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) -class Event: +class CFEvent: callstack: List = None label: int = None @@ -137,7 +137,7 @@ def __init__(self, callstack): """Callstack at the point the event occurred""" self.callstack = callstack -class TDEnterFunctionEvent(Event): +class TDEnterFunctionEvent(CFEvent): """Emitted whenever execution enters a function. The callstack member is the callstack right before entering the function, having the function just entered as the last member of the callstack. 
@@ -152,7 +152,7 @@ def __eq__(self, __o: object) -> bool: return False -class TDLeaveFunctionEvent(Event): +class TDLeaveFunctionEvent(CFEvent): """Emitted whenever execution leaves a function. The callstack member is the callstack right before leaving the function, having the function about to leave as the last member of the callstack. @@ -167,7 +167,7 @@ def __eq__(self, __o: object) -> bool: return False -class TDTaintedControlFlowEvent(Event): +class TDTaintedControlFlowEvent(CFEvent): """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. Current callstack (including the function the control flow happened in) is available @@ -342,8 +342,14 @@ def invalid_fd(self): class TDFnHeader(Structure): - _fields_ = [("name_offset", c_uint32)] + # constructor for use with the mmap buffer + _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] + # name_offset: the offset/location in the strings table of the fn name + # function_id: the id written to the cflog + def __init__(self, name_offset: int, function_id: int): + self.name_offset = name_offset + self.function_id = function_id class TDNode: def __init__(self, affects_control_flow: bool = False): @@ -486,7 +492,7 @@ def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: ) def read_fn_headers(self) -> Iterator[str]: - print("hi from fn_headers") + print("get all the fn_headers") functions = self.sections_by_type[TDFunctionsSection] strings = self.sections_by_type[TDStringSection] assert isinstance(functions, TDFunctionsSection) From fe20c52229eb06a4ac447928c4c5cdcd3682522e Mon Sep 17 00:00:00 2001 From: kaoudis Date: Wed, 11 Dec 2024 23:06:48 +0000 Subject: [PATCH 020/112] refactor the strings table; all C++ tests but for one pass --- .github/workflows/build.yml | 8 +++- polytracker/include/taintdag/fnmapping.h | 2 +- polytracker/include/taintdag/string_table.h | 44 ++++++++++----------- 
polytracker/src/taintdag/fnmapping.cpp | 28 ++++--------- polytracker/taint_dag.py | 2 +- 5 files changed, 39 insertions(+), 45 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da74c34e..f3c729da 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,7 +57,13 @@ jobs: - name: Load image run: docker load --input /tmp/polytracker.tar - - name: Run tests + - name: C++ (write side) tests + uses: addnab/docker-run-action@v3 + with: + image: trailofbits/polytracker:latest + run: ./polytracker-build/unittests/src/taintdag/tests-taintdag + + - name: Python (integration) tests uses: addnab/docker-run-action@v3 with: image: trailofbits/polytracker:latest diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 50f14b46..759d636b 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -44,7 +44,7 @@ class Functions : public FixedSizeAlloc { private: StringTable &string_table; std::mutex mappings_mutex; - std::unordered_map mappings; + std::unordered_map mappings; }; } // namespace taintdag diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index 9146c427..f1dee520 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -17,12 +17,9 @@ #include "taintdag/util.h" namespace taintdag { - -template -struct StringTableBase : public SectionBase { - using offset_t = OffsetT; - using length_t = LengthT; +struct StringTable : public SectionBase { + using offset_t = uint32_t; + using length_t = uint16_t; static_assert(sizeof(length_t) <= sizeof(offset_t), "offset_t should be larger than or equal to length_t"); @@ -31,19 +28,19 @@ struct StringTableBase : public SectionBase { // Max string length is limited by either length-type or by maximum offset // that can be expressed. 
- static constexpr size_t max_string_len = + static constexpr size_t max_entry_size = std::min(static_cast(std::numeric_limits::max()), - max_offset - sizeof(length_t)); + max_offset - sizeof(length_t)); - static constexpr uint8_t tag{Tag}; - static constexpr size_t allocation_size{AllocationSize}; - static constexpr size_t align_of = alignof(length_t); + static constexpr uint8_t tag{3}; + static constexpr size_t allocation_size{0x100000}; + static constexpr size_t align_of = 1; template - StringTableBase(SectionArg of) : SectionBase{of.range} { - if (of.range.size() > max_offset) { - error_exit("Tried to use an allocation of size ", of.range.size(), - " max in current offset_t config is ", max_offset); + StringTable(SectionArg output_file) : SectionBase{output_file.range} { + if (output_file.range.size() > max_offset) { + error_exit("Tried to use an allocation of size ", output_file.range.size(), + " max in current offset_t config is ", max_offset); } } @@ -52,15 +49,20 @@ struct StringTableBase : public SectionBase { // string, but the offset to the size of it. Recover the string // by using `from_offset`. std::optional add_string(std::string_view sv) { - if (sv.size() > max_string_len) { - error_exit("Tried to store a string of size ", sv.size(), " max is ", - max_string_len); - // Doesn't return from here. 
+ if ((sv.size() + sizeof(length_t)) > max_entry_size) { + spdlog::info("Tried to store a string of size {0:d} but max is {1:d} (will truncate string)", sv.size(), max_entry_size); + + size_t to_truncate = max_entry_size - sizeof(length_t) - 1; + sv = sv.substr(0, to_truncate); + + if ((sv.size() + sizeof(length_t)) > max_entry_size) { + error_exit("Truncated string was too big: ", sv.size() + sizeof(length_t)); + } } auto len = allocated_len(sv.size()); if (auto write_context = write(len)) { - // prefix with length + // todo(kaoudis) this is possibly a type confusion issue resulting in truncation since size_t is bigger than the current length_t *reinterpret_cast(&*(write_context->mem.begin())) = sv.size(); // copy string @@ -135,6 +137,4 @@ struct StringTableBase : public SectionBase { } }; -using StringTable = StringTableBase<>; - } // namespace taintdag diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index fa7e4718..d4592406 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -16,20 +16,14 @@ namespace taintdag { namespace { using index_t = Functions::index_t; -using offset_t = Function::offset_t; } // namespace -/* Maps to the function names recorded in the strings section, from -* the function IDs recorded in cflog entry callstacks. -* This section should look like this: -* |offset|id|offset|id|... 
-*/ std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { // Lock `mappings` std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping - if (auto it{mappings.find(function_id)}; it != mappings.end()) { + // See if we already have a mapping of `name` + if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } // Write `name` into the string table section @@ -37,20 +31,14 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ if (!maybe_name_offset) { return {}; } - - offset_t name_offset{*maybe_name_offset}; - auto maybe_offset_ctx{construct(name_offset)}; - if (!maybe_offset_ctx) { + // Write a `Function` via `construct` + auto name_offset{*maybe_name_offset}; + auto maybe_ctx{construct(name_offset)}; + if (!maybe_ctx) { return {}; } - - auto maybe_fn_ctx{construct((offset_t)function_id)}; - if (!maybe_fn_ctx) { - return {}; - } - - // Keep the function_id to offset mapping so we can check for it later - return mappings[function_id] = index(maybe_offset_ctx->t); + // Return index of `Function` in `Functions` + return mappings[function_name] = index(maybe_ctx->t); } } // namespace taintdag \ No newline at end of file diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index e7710cb6..0ed9b211 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -97,7 +97,7 @@ class TDStringSection: """TDAG String Table section Interprets the String Table section in a TDAG file. - Corresponds to StringTableBase in string_table.h. + Corresponds to StringTable in string_table.h. 
""" def __init__(self, mem, hdr): From e5e0efda069b8612017058f528c5f86de0ea5874 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 15:34:34 +0000 Subject: [PATCH 021/112] refactors the string table somewhat - be more defensive about strings that are too long; todo come back to this; started also debugging why the stdin test is hanging --- polytracker/include/taintdag/fnmapping.h | 2 +- polytracker/include/taintdag/string_table.h | 9 +- polytracker/src/taintdag/fnmapping.cpp | 12 +- polytracker/taint_dag.py | 26 +-- tests/conftest.py | 3 - tests/test_stdin.py | 4 +- unittests/src/taintdag/tdag.cpp | 237 +++++++++++--------- 7 files changed, 150 insertions(+), 143 deletions(-) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 759d636b..50f14b46 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -44,7 +44,7 @@ class Functions : public FixedSizeAlloc { private: StringTable &string_table; std::mutex mappings_mutex; - std::unordered_map mappings; + std::unordered_map mappings; }; } // namespace taintdag diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index f1dee520..4368872d 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -34,15 +34,10 @@ struct StringTable : public SectionBase { static constexpr uint8_t tag{3}; static constexpr size_t allocation_size{0x100000}; - static constexpr size_t align_of = 1; + static constexpr size_t align_of = alignof(length_t); template - StringTable(SectionArg output_file) : SectionBase{output_file.range} { - if (output_file.range.size() > max_offset) { - error_exit("Tried to use an allocation of size ", output_file.range.size(), - " max in current offset_t config is ", max_offset); - } - } + StringTable(SectionArg output_file) : SectionBase{output_file.range} {} // Appends the string `sv` to the string table. 
// Returns the offset of the string entry. Note that this is not the diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index d4592406..0a2fb7bd 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -23,7 +23,7 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ // Lock `mappings` std::unique_lock mappings_lock(mappings_mutex); // See if we already have a mapping of `name` - if (auto it{mappings.find(function_name)}; it != mappings.end()) { + if (auto it{mappings.find(function_id)}; it != mappings.end()) { return it->second; } // Write `name` into the string table section @@ -32,13 +32,19 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ return {}; } // Write a `Function` via `construct` - auto name_offset{*maybe_name_offset}; + uint32_t name_offset{*maybe_name_offset}; auto maybe_ctx{construct(name_offset)}; if (!maybe_ctx) { return {}; } + + // auto maybe_fn_id_ctx{construct(function_id)}; + // if (!maybe_fn_id_ctx) { + // return {}; + // } + // Return index of `Function` in `Functions` - return mappings[function_name] = index(maybe_ctx->t); + return mappings[function_id] = index(maybe_ctx->t); } } // namespace taintdag \ No newline at end of file diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 0ed9b211..e554d663 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -105,10 +105,10 @@ def __init__(self, mem, hdr): self.align = hdr.align def read_string(self, offset): - n = c_uint16.from_buffer_copy(self.section[offset:]).value - assert len(self.section) >= offset + sizeof(c_uint16) + n + size_of_string = c_uint16.from_buffer_copy(self.section[offset:]).value + assert len(self.section) >= offset + sizeof(c_uint16) + size_of_string return str( - self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + n], + self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + 
size_of_string], "utf-8", ) @@ -343,13 +343,10 @@ def invalid_fd(self): class TDFnHeader(Structure): # constructor for use with the mmap buffer - _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] - - # name_offset: the offset/location in the strings table of the fn name - # function_id: the id written to the cflog - def __init__(self, name_offset: int, function_id: int): - self.name_offset = name_offset - self.function_id = function_id + _fields_ = [ + ("name_offset", c_uint32), + # ("function_id", c_uint32) + ] class TDNode: def __init__(self, affects_control_flow: bool = False): @@ -487,12 +484,12 @@ def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: assert isinstance(sources, TDSourceSection) assert isinstance(strings, TDStringSection) - yield from ( - (Path(strings.read_string(x.name_offset)), x) for x in sources.enumerate() - ) + for source in sources.enumerate(): + source_name = strings.read_string(source.name_offset) + print(source_name) + yield Path(source_name), source def read_fn_headers(self) -> Iterator[str]: - print("get all the fn_headers") functions = self.sections_by_type[TDFunctionsSection] strings = self.sections_by_type[TDStringSection] assert isinstance(functions, TDFunctionsSection) @@ -500,6 +497,7 @@ def read_fn_headers(self) -> Iterator[str]: for header in functions: name = strings.read_string(header.name_offset) + print(name) yield name def input_labels(self) -> Iterator[int]: diff --git a/tests/conftest.py b/tests/conftest.py index 8c1d09a2..cac9f56b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,11 +79,8 @@ def program_trace(input_file, trace_file, instrumented_binary, monkeypatch): monkeypatch.chdir(input_file.parent) monkeypatch.setenv("POLYDB", str(trace_file)) cmd = [ - # instrumented binary instrumented_binary, - # input data str(input_file), ] subprocess.check_call(cmd) - # Read the trace file return polytracker.PolyTrackerTrace.load(trace_file) diff --git a/tests/test_stdin.py 
b/tests/test_stdin.py index ab40faf4..ef6c2034 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -13,16 +13,14 @@ def test_stdin_read(instrumented_binary: Path, trace_file: Path, method: str): # Data to write to stdin, one byte at a time stdin_data = "abcdefghi\njklmnopqr" - print("hi") + subprocess.run( [str(instrumented_binary), method], input=stdin_data.encode("utf-8"), env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": str(1)}, ).check_returncode() - print("hurro") program_trace = polytracker.PolyTrackerTrace.load(trace_file) - assert False == True # Ensure /dev/stdin is in the list of inputs assert "/dev/stdin" in [x.path for x in program_trace.inputs] diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 6e6285c7..7afe706c 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -1,4 +1,5 @@ #include +#include #include "taintdag/outputfile.h" #include "taintdag/section.h" @@ -11,7 +12,7 @@ namespace taintdag { -TEST_CASE("Test TDAG", "Integration") { +TEST_CASE("Test TDAG", "[Integration]") { OutputFile tdg("filename.bin"); auto offset1 = tdg.section().add_string("Hello"); auto offset2 = tdg.section().add_string("World!"); @@ -116,7 +117,7 @@ TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { auto ctx = sb.write(1); REQUIRE(!ctx); - // If offset is requirested for out of bounds memory, just abort. Something + // If offset is requested for out of bounds memory, just abort. Something // is seriously wrong. 
REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), test::ErrorExit); @@ -238,70 +239,61 @@ TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { } // Dummy OutputFile, to allow retrieving the StringTable -struct DummyOf { - template T §ion() { return st; } +struct DummyOutputFile { + template T §ion() { return string_table; } - StringTable &st; + StringTable &string_table; }; -TEST_CASE("Taint sources basic usage", "[Sources]") { +TEST_CASE("The Sources and StringTable sections can be used to store source entries", "[Sources, StringTable]") { + OutputFile of{std::tmpnam(nullptr)}; + auto &sources_section{of.section()}; + auto &string_table{of.section()}; - const size_t max_sources = 4; - const size_t allocation_size = max_sources * sizeof(SourceEntry); - alignas(SourceEntry) uint8_t backing[allocation_size]; - - const size_t strings_size = 128; - uint8_t string_backing[strings_size]; - - // NOTE(hbrodin): .output_file arg is not used in StringTable so just - // construct an int. - int dummy = 1; - StringTable st( - SectionArg{.output_file = dummy, .range = string_backing}); - - DummyOf of{st}; - - Sources src{SectionArg{.output_file = of, .range = backing}}; - - // TODO(hbrodin): Refactor below. 
- - SECTION("Add and retrieve mappings") { + SECTION("Can add taint-source entries to the Sources section", "[Sources, StringTable]") { int fd = 3; - REQUIRE(!src.mapping_idx(fd)); + REQUIRE(!sources_section.mapping_idx(fd)); + + auto s1 = sources_section.add_source("test", fd, 122); + REQUIRE(s1.has_value()); - auto s1 = src.add_source("test", fd, 122); - REQUIRE(s1); - auto m = src.mapping_idx(fd); - REQUIRE(m); + auto m = sources_section.mapping_idx(fd); + REQUIRE(m.has_value()); REQUIRE(*s1 == *m); - auto m1 = src.get(*m); + auto m1 = sources_section.get(*m); REQUIRE(m1.fd == fd); - REQUIRE(m1.name(st) == "test"); + + REQUIRE(m1.name(string_table) == "test"); REQUIRE(m1.size == 122); int fd2 = 99; - auto s2 = src.add_source("test2", fd2, SourceEntry::InvalidSize); - REQUIRE(s2); - auto idx2 = src.mapping_idx(fd2); - REQUIRE(idx2); - auto m2 = src.get(*idx2); + auto s2 = sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); + REQUIRE(s2.has_value()); + + auto idx2 = sources_section.mapping_idx(fd2); + REQUIRE(idx2.has_value()); + + auto m2 = sources_section.get(*idx2); REQUIRE(m2.fd == fd2); - REQUIRE(m2.name(st) == "test2"); + REQUIRE(m2.name(string_table) == "test2"); + REQUIRE(m2.size == SourceEntry::InvalidSize); } - SECTION("Latest wins in case of multiple mappings for same fd") { - int fd = 1; - src.add_source("first", fd); - src.add_source("second", fd); + WHEN("Adding taint-sources to the Sources section and the string table") { + THEN("Latest wins in terms in case output_file has multiple mappings for the same fd") { + int fd = 1; + sources_section.add_source("first", fd); + sources_section.add_source("second", fd); - auto mm = src.mapping_idx(fd); - REQUIRE(mm); + auto mm = sources_section.mapping_idx(fd); + REQUIRE(mm); - auto m = src.get(*mm); - REQUIRE(m.fd == fd); - REQUIRE(m.name(st) == "second"); + auto m = sources_section.get(*mm); + REQUIRE(m.fd == fd); + REQUIRE(m.name(string_table) == "second"); + } } } @@ -309,84 +301,105 @@ 
TEST_CASE("StringTable add/iterate", "[StringTable]") { // To be able to capture error_exits test::ErrorExitReplace errthrow; - alignas(StringTable::length_t) uint8_t backing[64]; - - int dummy = 1; - StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - - SECTION("Initial properties") { - REQUIRE(StringTable::align_of == alignof(StringTable::length_t)); - REQUIRE(st.size() == 0); - REQUIRE(st.begin() == st.end()); + OutputFile of{std::tmpnam(nullptr)}; + auto &string_table{of.section()}; - REQUIRE(sizeof(StringTable::length_t) <= sizeof(StringTable::offset_t)); + SECTION("StringTable properties") { + // squish everything together as close as we can + REQUIRE(StringTable::align_of == 2UL); + // no elements in the string table to start + REQUIRE(string_table.size() == 0); + REQUIRE(string_table.begin() == string_table.end()); } - SECTION("Adding/retrieving") { - auto ofs = st.add_string("Hello"); - REQUIRE(ofs); - REQUIRE(st.from_offset(*ofs) == "Hello"); + WHEN("A string is added") { + THEN("It should also be retrievable from the offset of its length") { + auto ofs = string_table.add_string("Hello"); + REQUIRE(ofs); + REQUIRE(string_table.from_offset(*ofs) == "Hello"); - auto ofs2 = st.add_string("World"); - REQUIRE(ofs2); - REQUIRE(st.from_offset(*ofs2) == "World"); + auto ofs2 = string_table.add_string("World"); + REQUIRE(ofs2); + REQUIRE(string_table.from_offset(*ofs2) == "World"); + } } - SECTION("Iteration") { - st.add_string("a"); - st.add_string("b"); - st.add_string("c"); - st.add_string("d"); - - std::vector res; - std::copy(st.begin(), st.end(), std::back_inserter(res)); - REQUIRE(res.size() == 4); - REQUIRE(res[0] == "a"); - REQUIRE(res[1] == "b"); - REQUIRE(res[2] == "c"); - REQUIRE(res[3] == "d"); + WHEN("Multiple strings are added") { + THEN("They should be iterable using begin() and end()") { + string_table.add_string("a"); + string_table.add_string("b"); + string_table.add_string("c"); + string_table.add_string("d"); + + std::vector 
res; + std::copy(string_table.begin(), string_table.end(), std::back_inserter(res)); + REQUIRE(res.size() == 4); + REQUIRE(res[0] == "a"); + REQUIRE(res[1] == "b"); + REQUIRE(res[2] == "c"); + REQUIRE(res[3] == "d"); + } } - SECTION("Capacity") { - SECTION("Fill with one string") { - std::string s(sizeof(backing) - sizeof(StringTable::length_t), 'A'); - REQUIRE(st.add_string(s)); - std::string s2{1, 'B'}; - REQUIRE(!st.add_string(s2)); + WHEN("Adding to the string table") { + THEN("A string bigger than the maximum string size will be truncated and stored") { + spdlog::set_level(spdlog::level::debug); + + auto len = StringTable::max_entry_size + 10; + std::string too_big(len, 'A'); + REQUIRE_NOTHROW([&](){ + auto offset = string_table.add_string(too_big); + REQUIRE(offset.has_value()); + + std::string_view result = string_table.from_offset(offset.value()); + REQUIRE(result.size() + sizeof(StringTable::length_t) == StringTable::max_entry_size - 1); + }()); } - SECTION("Fill with many short strings") { + THEN("Can fill the string table with many short strings") { std::string s{"a"}; - size_t n = 0; - while (st.add_string(s)) { - ++n; - } - auto allocsize = sizeof(StringTable::length_t) + s.size(); - // Per string allocation size - if (auto rem = allocsize % StringTable::align_of; rem != 0) { - allocsize += StringTable::align_of - rem; + while (auto os = string_table.add_string(s)) { + if (!os.has_value()) { + break; + } + auto result = string_table.from_offset(os.value()); + REQUIRE(s.compare(result.data()) == 0); } - - REQUIRE(n == sizeof(backing) / allocsize); } - } - SECTION("Errors") { - // Trying to store a string larger than can be represented by the length_t - auto len = - static_cast(std::numeric_limits::max()) + - 1; - char const *strp = reinterpret_cast(&backing[0]); - REQUIRE_THROWS_AS(st.add_string({strp, len}), test::ErrorExit); - - // Allocation is larger than can be represented by the offset type. 
- auto alloc_size = - static_cast(std::numeric_limits::max()) + - 1; - auto span = StringTable::span_t{&backing[0], alloc_size}; - REQUIRE_THROWS_AS( - (StringTable{SectionArg{.output_file = dummy, .range = span}}), - test::ErrorExit); + THEN("Add a maximumly big string and will still be able to add other strings") { + auto size = StringTable::max_entry_size - sizeof(StringTable::length_t); + std::string s(size, 'A'); + REQUIRE_NOTHROW([&](){ + auto offset = string_table.add_string(s); + REQUIRE(offset.has_value()); + auto result = string_table.from_offset(offset.value()); + // no truncation happened this time + REQUIRE(result.size() == size); + REQUIRE(s.compare(result.data()) == 0); + }()); + + std::string s2{1, 'B'}; + auto os2 = string_table.add_string(s2); + REQUIRE(os2.has_value()); + + std::string s3("hello"); + auto os3 = string_table.add_string(s3); + REQUIRE(os3.has_value()); + } } } + + // TEST_CASE("An allocation that is larger than can be represented in the string table will result in error", "[StringTable]") { + // auto alloc_size = + // static_cast(std::numeric_limits::max()) + + // 1; + // alignas(StringTable::offset_t) uint8_t backing[64]; + // int dummy = 1; + // StringTable st{SectionArg{.output_file = dummy, .range = backing}}; + // auto span = StringTable::span_t{&backing[0], alloc_size}; + // REQUIRE_THROWS_AS( + // st, + // test::ErrorExit); + // } } // namespace taintdag \ No newline at end of file From 6702de5a149e00aa09b58611892eed1f425f69c8 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 19:52:23 +0000 Subject: [PATCH 022/112] a sketchy steel thread version of functions not coming from json file works; tests pass except unrelated --- examples/analysis/ubet/eval_nitro.py | 3 +- polytracker/include/taintdag/fnmapping.h | 10 +- polytracker/include/taintdag/section.h | 2 +- polytracker/src/taintdag/fnmapping.cpp | 29 ++--- polytracker/taint_dag.py | 157 +++++++++++++---------- tests/test_cf_log.py | 84 ++++++++++-- 
unittests/src/taintdag/tdag.cpp | 1 + 7 files changed, 183 insertions(+), 103 deletions(-) diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index c188aabb..c171091a 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -210,7 +210,6 @@ def run_instrumented(is_debug: bool, inputfile: Path, targetdir: Path): e = { "POLYDB": str(db), "POLYTRACKER_STDOUT_SINK": "1", - "POLYTRACKER_LOG_CONTROL_FLOW": "1", } ret = subprocess.run(args, env=e, stdout=subprocess.PIPE, stderr=subprocess.PIPE) os.rename(db, targetdir / db) @@ -275,7 +274,7 @@ def get_cflog_entires(tdfile, is_debug): map( lambda e: (input_offsets(e.label, tdfile), e.callstack), filter( - lambda e: isinstance(e, taint_dag.TDTaintedControlFlowEvent), cflog + lambda e: isinstance(e, taint_dag.TaintedControlFlowEvent), cflog ), ) ) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 50f14b46..685db4fa 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -21,14 +21,17 @@ namespace taintdag { struct Function { -public: using offset_t = StringTable::offset_t; offset_t name_offset; + uint32_t function_id; + + Function(offset_t name_ofs, uint32_t f_id) : + name_offset(name_ofs), function_id(f_id) {}; }; class Functions : public FixedSizeAlloc { public: - using index_t = uint16_t; + using index_t = StringTable::offset_t; static constexpr uint8_t tag{6}; static constexpr size_t allocation_size{std::numeric_limits::max() * @@ -44,7 +47,8 @@ class Functions : public FixedSizeAlloc { private: StringTable &string_table; std::mutex mappings_mutex; - std::unordered_map mappings; + // look up Function index in the Functions section by function name + std::unordered_map mappings; }; } // namespace taintdag diff --git a/polytracker/include/taintdag/section.h b/polytracker/include/taintdag/section.h index cc699007..ac2efe09 100644 --- 
a/polytracker/include/taintdag/section.h +++ b/polytracker/include/taintdag/section.h @@ -142,7 +142,7 @@ template struct FixedSizeAlloc : SectionBase { .t = *new (&*(write_context->mem.begin())) T{std::forward(args)...}}; } - // Failed to allocate memory + spdlog::error("Failed to allocate memory in the section for the object, so could not construct the object in the tdag section"); return {}; } diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 0a2fb7bd..f1dbf831 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -20,31 +20,26 @@ using index_t = Functions::index_t; } // namespace std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { - // Lock `mappings` std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping of `name` - if (auto it{mappings.find(function_id)}; it != mappings.end()) { + + if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } - // Write `name` into the string table section - auto maybe_name_offset{string_table.add_string(function_name)}; - if (!maybe_name_offset) { + + std::optional maybe_name_offset = string_table.add_string(function_name); + if (!maybe_name_offset.has_value()) { + spdlog::error("Could not write function name to strings table"); return {}; } - // Write a `Function` via `construct` - uint32_t name_offset{*maybe_name_offset}; - auto maybe_ctx{construct(name_offset)}; - if (!maybe_ctx) { + + auto maybe_ctx = construct(Function(maybe_name_offset.value(), function_id)); + if (!maybe_ctx.has_value()) { + spdlog::error("Could not write Function {0} with id {1:d}, string table ofs {2:d} to the tdag functions section", function_name, function_id, maybe_name_offset.value()); return {}; } - // auto maybe_fn_id_ctx{construct(function_id)}; - // if (!maybe_fn_id_ctx) { - // return {}; - // } - - // Return index of `Function` in `Functions` - 
return mappings[function_id] = index(maybe_ctx->t); + // Return index of the `Function` in `Functions` + return mappings[function_name] = index(maybe_ctx->t); } } // namespace taintdag \ No newline at end of file diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index e554d663..13f193f0 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -12,6 +12,7 @@ cast, ) +from cxxfilt import demangle from enum import Enum from pathlib import Path from mmap import mmap, PROT_READ @@ -94,10 +95,16 @@ def enumerate(self): class TDStringSection: - """TDAG String Table section + """TDAG String Table section. Interprets the String Table section in a TDAG file. - Corresponds to StringTable in string_table.h. + Corresponds to StringTableBase in string_table.h. + + The string table will contain information like the following: + - source names + - function names + - additional label metadata + Check usages of StringTableBase in the C++ ("write side") part of the codebase. """ def __init__(self, mem, hdr): @@ -105,10 +112,10 @@ def __init__(self, mem, hdr): self.align = hdr.align def read_string(self, offset): - size_of_string = c_uint16.from_buffer_copy(self.section[offset:]).value - assert len(self.section) >= offset + sizeof(c_uint16) + size_of_string + n = c_uint16.from_buffer_copy(self.section[offset:]).value + assert len(self.section) >= offset + sizeof(c_uint16) + n return str( - self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + size_of_string], + self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + n], "utf-8", ) @@ -129,68 +136,77 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) -class CFEvent: - callstack: List = None - label: int = None +class ControlFlowEvent: + callstack: List + label: int - def __init__(self, callstack): + def __init__(self, callstack: List, label: int = None): """Callstack at the point the event occurred""" self.callstack = callstack + 
self.label = label + + def __repr__(self, typ, callstack: List, label: int = None): + return f"{typ}: label {label}, callstack {callstack}" -class TDEnterFunctionEvent(CFEvent): +class CFEnterFunctionEvent(ControlFlowEvent): """Emitted whenever execution enters a function. The callstack member is the callstack right before entering the function, having the function just entered as the last member of the callstack. """ + def __init__(self, callstack: List): + super().__init__(callstack) + def __repr__(self) -> str: - return f"Enter: {self.callstack}" + ControlFlowEvent.__repr__(type(CFEnterFunctionEvent), self.callstack, None) def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDEnterFunctionEvent): + if isinstance(__o, CFEnterFunctionEvent): return self.callstack == __o.callstack return False -class TDLeaveFunctionEvent(CFEvent): +class CFLeaveFunctionEvent(ControlFlowEvent): """Emitted whenever execution leaves a function. The callstack member is the callstack right before leaving the function, having the function about to leave as the last member of the callstack. """ + def __init__(self, callstack: List): + super().__init__(callstack) + def __repr__(self) -> str: - return f"Leave: {self.callstack}" + ControlFlowEvent.__repr__(type(CFLeaveFunctionEvent), self.callstack, None) def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDLeaveFunctionEvent): + if isinstance(__o, CFLeaveFunctionEvent): return self.callstack == __o.callstack return False -class TDTaintedControlFlowEvent(CFEvent): +class TaintedControlFlowEvent(ControlFlowEvent): """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. 
Current callstack (including the function the control flow happened in) is available in the `callstack` member.""" - def __init__(self, callstack, label): - self.callstack = callstack - self.label = label + def __init__(self, callstack: List, label: int): + super().__init__(callstack, label) def __repr__(self) -> str: - return f"TaintedControlFlow label {self.label} callstack {self.callstack}" + ControlFlowEvent.__repr__(type(TaintedControlFlowEvent), self.callstack, self.label) def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDTaintedControlFlowEvent): + if isinstance(__o, TaintedControlFlowEvent): return self.label == __o.label and self.callstack == __o.callstack return False class TDControlFlowLogSection: - """TDAG Control flow log section + """TDAG Control flow log section. Interprets the control flow log section in a TDAG file. - Enables enumeration/random access of items + Enables enumeration/random access of items in the cflog. """ # NOTE: MUST correspond to the members in the `ControlFlowLog::EventType`` in `control_flog_log.h`. @@ -215,26 +231,30 @@ def _decode_varint(buffer): @staticmethod def _align_callstack(target_function_id, callstack): while callstack and callstack[-1] != target_function_id: - yield TDLeaveFunctionEvent(callstack[:]) + yield CFLeaveFunctionEvent(callstack[:]) callstack.pop() def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] - self.funcmapping = None - def __iter__(self): + def __iter__(self) -> Iterator[ControlFlowEvent]: + """Produces the cflog entries in order from the mmapped buffer.""" buffer = self.section callstack = [] while buffer: event = c_uint8.from_buffer_copy(buffer, 0).value buffer = buffer[1:] + + # A function id is a uint32_t that the functions + # section maps to an index into the strings table + # If you need function names, you should be working + # with the tdfile, rather than with a singular section + # of the tdag directly. 
function_id, buffer = TDControlFlowLogSection._decode_varint(buffer) - if self.funcmapping != None: - function_id = self.funcmapping[function_id] if event == TDControlFlowLogSection.ENTER_FUNCTION: callstack.append(function_id) - yield TDEnterFunctionEvent(callstack[:]) + yield CFEnterFunctionEvent(callstack[:]) elif event == TDControlFlowLogSection.LEAVE_FUNCTION: # Align call stack, if needed yield from TDControlFlowLogSection._align_callstack( @@ -242,7 +262,7 @@ def __iter__(self): ) # TODO(hbrodin): If the callstack doesn't contain function_id at all, this will break. - yield TDLeaveFunctionEvent(callstack[:]) + yield CFLeaveFunctionEvent(callstack[:]) callstack.pop() else: # Align call stack, if needed @@ -251,15 +271,11 @@ def __iter__(self): ) label, buffer = TDControlFlowLogSection._decode_varint(buffer) - yield TDTaintedControlFlowEvent(callstack[:], label) + yield TaintedControlFlowEvent(callstack[:], label) # Drain callstack with artifical TDLeaveFunction events (using a dummy function id that doesn't exist) yield from TDControlFlowLogSection._align_callstack(-1, callstack) - def function_id_mapping(self, id_to_name_array): - """This method stores an array used to translate from function id to symbolic names""" - self.funcmapping = id_to_name_array - class TDSinkSection: """TDAG Sinks section @@ -317,12 +333,15 @@ def __init__(self, mem, hdr): class TDFunctionsSection: + """This section holds the mapping between the function IDs stored in callstack form in the cflog section, and the function names stored in the string table. See fnmapping in the C++ part of the codebase for the "write" side part of Polytracker that pertains to this section. Each entry is an uint32_t as set in fnmapping.cpp, but a TDFnHeader will then contain *two* of these: the function_id and the name_offset. 
+ + Structure in memory: |offset|function id|...""" def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] def __iter__(self): - for offset in range(0, len(self.section), sizeof(TDFnHeader)): - yield TDFnHeader.from_buffer_copy(self.section, offset) + for entry in range(0, len(self.section), sizeof(TDFnHeader)): + yield TDFnHeader.from_buffer_copy(self.section, entry) class TDFDHeader(Structure): @@ -342,10 +361,11 @@ def invalid_fd(self): class TDFnHeader(Structure): - # constructor for use with the mmap buffer + # This corresponds to the Function inline constructor in fnmapping.h. + # Anything using Structure needs to be in sync with the corresponding C++. _fields_ = [ ("name_offset", c_uint32), - # ("function_id", c_uint32) + ("function_id", c_uint32) ] class TDNode: @@ -458,6 +478,8 @@ def __init__(self, file: BinaryIO) -> None: self.sections_by_type[TDFunctionsSection] = self.sections[-1] elif hdr.tag == 7: continue + # todo(kaoudis): change tag indices and remove this + # this will break compatibility with old tdags # self.sections.append(TDEventsSection(self.buffer, hdr)) # self.sections_by_type[TDEventsSection] = self.sections[-1] elif hdr.tag == 8: @@ -472,33 +494,44 @@ def __init__(self, file: BinaryIO) -> None: self.sink_cache: Dict[int, TDSink] = {} self.fd_headers: List[Tuple[Path, TDFDHeader]] = list(self.read_fd_headers()) - self.fn_headers: List[str] = list(self.read_fn_headers()) def _get_section(self, wanted_type: Type[TDSection]) -> TDSection: return self.sections_by_type[wanted_type] def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: - print("hi from fd_headers") sources = self.sections_by_type[TDSourceSection] strings = self.sections_by_type[TDStringSection] assert isinstance(sources, TDSourceSection) assert isinstance(strings, TDStringSection) for source in sources.enumerate(): - source_name = strings.read_string(source.name_offset) - print(source_name) - yield Path(source_name), source + yield 
Path(strings.read_string(source.name_offset)), source - def read_fn_headers(self) -> Iterator[str]: + @property + def mangled_fn_symbol_lookup(self) -> Dict[int, str]: + """Unordered! map of dynamically observed function IDs to clang symbols. You can demangle the symbols with cxxfilt.demangle.""" + lookup = {} functions = self.sections_by_type[TDFunctionsSection] - strings = self.sections_by_type[TDStringSection] assert isinstance(functions, TDFunctionsSection) + strings = self.sections_by_type[TDStringSection] assert isinstance(strings, TDStringSection) - for header in functions: - name = strings.read_string(header.name_offset) - print(name) - yield name + for entry in functions: + lookup[entry.function_id] = strings.read_string(entry.name_offset) + + return lookup + + def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: + """Presents the control flow log. Does not demangle symbols by default, for performance.""" + cflog_section = self.sections_by_type[TDControlFlowLogSection] + + if demangle_symbols: + for cflog_entry in cflog_section: + cflog_entry.callstack[:] = [demangle(self.mangled_fn_symbol_lookup[function_id]) for function_id in cflog_entry.callstack] + + yield cflog_entry + else: + cflog_section.__iter__() def input_labels(self) -> Iterator[int]: """Enumerates all taint labels that are input labels (source taint)""" @@ -566,9 +599,7 @@ def taints(self) -> Taints: class TDProgramTrace(ProgramTrace): def __init__(self, file: BinaryIO) -> None: - print("hi!") self.tdfile: TDFile = TDFile(file) - print("HEOOOOLLLOOOOOO") self.tforest: TDTaintForest = TDTaintForest(self) self._inputs = None @@ -803,13 +834,7 @@ def __init_arguments__(self, parser): "--print-fd-headers", "-f", action="store_true", - help="print file descriptor headers", - ) - parser.add_argument( - "--print-fn-headers", - "-x", - action="store_true", - help="print function headers", + help="print file descriptor headers (sources)", ) parser.add_argument( 
"--print-taint-sinks", @@ -848,10 +873,6 @@ def run(self, args): path = h[0] print(f"{i}: {path}") - if args.print_fn_headers: - for i, function_name in enumerate(tdfile.fn_headers): - print(f"{i}: {function_name}") - if args.print_taint_sinks: for s in tdfile.sinks: print(f"{s} -> {tdfile.decode_node(s.label)}") @@ -861,11 +882,9 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - for e in tdfile.events: - print(f"{e}") + for k,v in tdfile.mangled_fn_symbol_lookup: + print(f"function_id '{k}': function '{demangle(v)}'") if args.print_control_flow_log: - cflog = tdfile._get_section(TDControlFlowLogSection) - assert isinstance(cflog, TDControlFlowLogSection) - for obj in cflog: - print(f"{obj}") + for event in tdfile.cflog(demangle_symbols=True): + print(str(event)) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 6bbbc639..f3970968 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -6,23 +6,85 @@ from pathlib import Path from polytracker.taint_dag import ( - # TDEvent, + ControlFlowEvent, TDControlFlowLogSection, - TDEnterFunctionEvent, - TDLeaveFunctionEvent, - TDTaintedControlFlowEvent, + CFEnterFunctionEvent, + CFLeaveFunctionEvent, + TaintedControlFlowEvent, TDProgramTrace ) from polytracker import ProgramTrace from typing import List @pytest.mark.program_trace("test_fntrace.cpp") -def test_cf_log_fn_trace(program_trace: ProgramTrace): - assert isinstance(program_trace, TDProgramTrace) +def test_function_mapping(program_trace: ProgramTrace): + mangled_symbols = list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) - # we store the names in llvm mangled fashion but... 
- assert program_trace.tdfile.fn_headers == ["main", "_Z9factoriali"] + assert mangled_symbols == ["main", "_Z9factoriali"] + expected_names = ["main", "factorial(int)"] + for symbol in mangled_symbols: + assert cxxfilt.demangle(symbol) in expected_names - # you can easily unmangle them for human readable stack traces! - functionid_mapping: List[str] = list(map(cxxfilt.demangle, program_trace.tdfile.fn_headers)) - assert functionid_mapping == ["main", "factorial(int)"] +@pytest.mark.program_trace("test_fntrace.cpp") +def test_callstack_mapping(program_trace: ProgramTrace): + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + + for cflog_entry in cflog: + assert len(cflog_entry.callstack) > 0 + # a callstack entry (if not mapped and demangled) is just a function id + for callstack_entry in cflog_entry.callstack: + # when we look up the function id it should map to a name we traced + assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup + +@pytest.mark.program_trace("test_cf_log.cpp") +def test_cf_log(instrumented_binary: Path, trace_file: Path): + """Demonstrates how the cflog should work end to end, integrated with the fn mapping and the function symbols from the strings table.""" + # Data to write to stdin, one byte at a time + stdin_data = "abcdefgh" + + subprocess.run( + [str(instrumented_binary)], + input=stdin_data.encode("utf-8"), + env={ + "POLYDB": str(trace_file), + "POLYTRACKER_STDIN_SOURCE": "1", + }, + ) + + program_trace = polytracker.PolyTrackerTrace.load(trace_file) + + expected_seq = [ + CFEnterFunctionEvent(["main"]), + TaintedControlFlowEvent(["main"], 1), + TaintedControlFlowEvent(["main"], 2), + TaintedControlFlowEvent(["main"], 3), + TaintedControlFlowEvent(["main"], 4), + TaintedControlFlowEvent(["main"], 5), + TaintedControlFlowEvent(["main"], 6), + TaintedControlFlowEvent(["main"], 7), + TaintedControlFlowEvent(["main"], 8), + TaintedControlFlowEvent(["main"], 15), + 
TaintedControlFlowEvent(["main"], 3), + CFEnterFunctionEvent(["main", "f1(unsigned char)"]), + TaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), + CFEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + TaintedControlFlowEvent( + ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 + ), + CFLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + CFLeaveFunctionEvent(["main", "f1(unsigned char)"]), + CFLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit + ] + + cflog: List[ControlFlowEvent] = program_trace.tdfile.cflog(demangle_symbols=True) + for got, expected in zip(cflog, expected_seq): + assert got == expected + + if type(got) == TaintedControlFlowEvent: + assert got.label is not None + + assert len(got.callstack) > 0 + + for entry in cflog: + for callstack_entry in entry.callstack: + assert callstack_entry in list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) \ No newline at end of file diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 7afe706c..a00c18f3 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -343,6 +343,7 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { WHEN("Adding to the string table") { THEN("A string bigger than the maximum string size will be truncated and stored") { + // display the info logging spdlog::set_level(spdlog::level::debug); auto len = StringTable::max_entry_size + 10; From f6917a435d14705f7031721b695c1afcde1b1cdc Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 20:42:34 +0000 Subject: [PATCH 023/112] adds a cflog label test --- tests/test_cf_log.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index f3970968..0d8e962d 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -7,11 +7,11 @@ from polytracker.taint_dag import ( ControlFlowEvent, - TDControlFlowLogSection, 
CFEnterFunctionEvent, CFLeaveFunctionEvent, TaintedControlFlowEvent, - TDProgramTrace + TDControlFlowLogSection, + TDNode, ) from polytracker import ProgramTrace from typing import List @@ -36,6 +36,18 @@ def test_callstack_mapping(program_trace: ProgramTrace): # when we look up the function id it should map to a name we traced assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup +@pytest.mark.program_trace("test_fntrace.cpp") +def test_label_mapping(program_trace: ProgramTrace): + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + + for cflog_entry in cflog: + if cflog_entry.label is not None: + node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) + assert node.affects_control_flow + else: + assert cflog_entry.label is None + + @pytest.mark.program_trace("test_cf_log.cpp") def test_cf_log(instrumented_binary: Path, trace_file: Path): """Demonstrates how the cflog should work end to end, integrated with the fn mapping and the function symbols from the strings table.""" From 26a2ea1e84b03a35707cc6a3e82b79f34c07946f Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 21:13:18 +0000 Subject: [PATCH 024/112] first step toward removing json writer: don't write to the file --- .../src/passes/tainted_control_flow.cpp | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 2f20ea9f..16c81918 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -55,19 +55,15 @@ class FunctionMappingJSONWriter { } // namespace detail namespace { -uint32_t -get_or_add_mapping(uintptr_t key, std::unordered_map &m, - uint32_t &counter, std::string_view name, - polytracker::detail::FunctionMappingJSONWriter &js) { - if (auto it = m.find(key); it != m.end()) { - return it->second; - } else { - js.append(name); - 
return m[key] = counter++; + uint32_t get_or_add_mapping(uintptr_t key, std::unordered_map &mapping, uint32_t &counter) { + if (auto it = mapping.find(key); it != mapping.end()) { + return it->second; + } else { + return mapping[key] = counter++; + } } -} - } // namespace + void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val) { llvm::IRBuilder<> ir(&inst); @@ -81,9 +77,7 @@ void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, llvm::ConstantInt * TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { auto func_address = reinterpret_cast(&func); - std::string_view name = func.getName(); - auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_, - name, *function_mapping_writer_); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); } From 8df5eece60c7d1eaa5621ac1bc8a01a56af1b6cf Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 21:34:23 +0000 Subject: [PATCH 025/112] remove json writing frmo control flow low --- .../polytracker/passes/tainted_control_flow.h | 9 - .../src/passes/tainted_control_flow.cpp | 265 ++++++++---------- 2 files changed, 115 insertions(+), 159 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index e4d3d033..fb6463d4 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -13,9 +13,6 @@ #include namespace polytracker { -namespace detail { -struct FunctionMappingJSONWriter; -} class TaintedControlFlowPass : public llvm::PassInfoMixin, @@ -42,10 +39,6 @@ class TaintedControlFlowPass public: using function_id = uint32_t; - TaintedControlFlowPass(); - TaintedControlFlowPass(TaintedControlFlowPass &&); - ~TaintedControlFlowPass(); - 
llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); @@ -60,8 +53,6 @@ class TaintedControlFlowPass std::unordered_map function_ids_; function_id function_counter_{0}; - - std::unique_ptr function_mapping_writer_; }; } // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 16c81918..73f21552 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -27,33 +27,6 @@ static llvm::cl::list ignore_lists( namespace polytracker { -namespace detail { -// Helper type to produce the json file of function names by functionid -class FunctionMappingJSONWriter { -public: - FunctionMappingJSONWriter(std::string_view filename) - : file(filename.data(), std::ios::binary) { - file << "["; - } - - ~FunctionMappingJSONWriter() { - // Back up and erase the last ",\n" - file.seekp(-2, std::ios::cur); - file << "\n]\n"; - } - - void append(std::string_view name) { - // Will cause an additional ',' but don't care about that right now... - // The destructor will back up two steps and replace the ',' with a newline - // and array termination. 
- file << "\"" << name << "\",\n"; - } - -private: - std::ofstream file; -}; -} // namespace detail - namespace { uint32_t get_or_add_mapping(uintptr_t key, std::unordered_map &mapping, uint32_t &counter) { if (auto it = mapping.find(key); it != mapping.end()) { @@ -64,155 +37,147 @@ namespace { } } // namespace -void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); - } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); -} - -llvm::ConstantInt * -TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { - auto func_address = reinterpret_cast(&func); - auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); - return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); -} - -llvm::ConstantInt * -TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { - return get_function_id_const(*(i.getParent()->getParent())); -} - -void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; - } - - // we do not handle VectorTypes yet - if ((*(idx->getType())).isVectorTy()) { - continue; + void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, + llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); + } - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); + llvm::ConstantInt * + TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { + auto func_address = reinterpret_cast(&func); 
+ auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); + return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); + } - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + llvm::ConstantInt * + TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { + return get_function_id_const(*(i.getParent()->getParent())); } -} -void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; + void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + llvm::IRBuilder<> ir(&gep); + for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; + } + + // we do not handle VectorTypes yet + if ((*(idx->getType())).isVectorTy()) { + continue; + } + + auto callret = ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), + get_function_id_const(gep)}); + + idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + } } - llvm::IRBuilder<> ir(&bi); - auto cond = bi.getCondition(); + void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; + } - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + llvm::IRBuilder<> ir(&bi); + auto cond = bi.getCondition(); - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); -void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); + bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + } - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + llvm::IRBuilder<> ir(&si); + auto cond = 
si.getCondition(); - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. - if (si.getType()->isVectorTy()) { - return; + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); } - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + // TODO(hbrodin): Can't handle atm. + if (si.getType()->isVectorTy()) { + return; + } + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); -void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::LLVMContext *context = &mod.getContext(); - llvm::IRBuilder<> ir(*context); + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + } - cond_br_log_fn = mod.getOrInsertFunction( - "__polytracker_log_tainted_control_flow", - llvm::AttributeList::get( - mod.getContext(), - {{llvm::AttributeList::FunctionIndex, - llvm::Attribute::get(mod.getContext(), - llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); - enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); + cond_br_log_fn = mod.getOrInsertFunction( + "__polytracker_log_tainted_control_flow", + 
llvm::AttributeList::get( + mod.getContext(), + {{llvm::AttributeList::FunctionIndex, + llvm::Attribute::get(mod.getContext(), + llvm::Attribute::ReadNone)}}), + ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); - fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); + enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); - fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); -} + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); -void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { - if (func.isDeclaration()) { - return; + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); } - llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - - ir.CreateCall(fn_enter_log_fn, - { - get_function_id_const(func), - ir.CreateGlobalStringPtr(func.getName()) - } - ); -} - -void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); -} - -llvm::PreservedAnalyses -TaintedControlFlowPass::run(llvm::Module &mod, - llvm::ModuleAnalysisManager &mam) { - label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); - declareLoggingFunctions(mod); - auto fnsToIgnore{readIgnoreLists(ignore_lists)}; - - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fnsToIgnore.count(fname.str())) { - continue; - } else { - instrumentFunctionEnter(fn); - visit(fn); + + void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { + if (func.isDeclaration()) { + return; } + llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); + + ir.CreateCall(fn_enter_log_fn, + { + get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName()) + } + ); + } + + void 
TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { + llvm::IRBuilder<> ir(&ri); + ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); } - return llvm::PreservedAnalyses::none(); -} + llvm::PreservedAnalyses + TaintedControlFlowPass::run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam) { + label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); + declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + + for (auto &fn : mod) { + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); + } + } -TaintedControlFlowPass::TaintedControlFlowPass() - : function_mapping_writer_( - std::make_unique( - "functionid.json")) {} + return llvm::PreservedAnalyses::none(); + } -TaintedControlFlowPass::~TaintedControlFlowPass() = default; -TaintedControlFlowPass::TaintedControlFlowPass(TaintedControlFlowPass &&) = - default; } // namespace polytracker \ No newline at end of file From 3e69c3bdaa073d75bbdf859bd589de334796e6ad Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 22:09:11 +0000 Subject: [PATCH 026/112] remove apparently unused function defn from the TCF header --- polytracker/include/polytracker/passes/tainted_control_flow.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index fb6463d4..ddbd4c10 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -49,8 +49,6 @@ class TaintedControlFlowPass void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst &ri); - function_id function_mapping(llvm::Function &func); - std::unordered_map function_ids_; function_id function_counter_{0}; }; From 49710b99952f90a14811e8a450fd2bfe7d180804 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 
12 Dec 2024 22:10:17 +0000 Subject: [PATCH 027/112] remove functionality that calls function that doesn't exist in an effort to protect dfsan from terminating appropriately - not quite sure yet what to do instead but this isn't doing anything but making the test hang --- polytracker/src/polytracker/polytracker.cpp | 24 +-------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index bf1912b9..1dc3149d 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -13,27 +13,16 @@ EARLY_CONSTRUCT_EXTERN_GETTER(taintdag::PolyTracker, polytracker_tdag); static std::atomic_flag polytracker_init_flag = ATOMIC_FLAG_INIT; -static bool polytracker_is_initialized() { - return polytracker_init_flag.test(std::memory_order_relaxed); -} - static void polytracker_initialize() { polytracker_init_flag.test_and_set(std::memory_order_relaxed); } extern "C" dfsan_label __polytracker_union_table(const dfsan_label &l1, const dfsan_label &l2) { - if (!polytracker_is_initialized()) { - return 0; - } return get_polytracker_tdag().union_labels(l1, l2); } extern "C" void __polytracker_log_conditional_branch(dfsan_label label) { - if (!polytracker_is_initialized()) { - return; - } - if (label > 0) { get_polytracker_tdag().affects_control_flow(label); } @@ -42,9 +31,6 @@ extern "C" void __polytracker_log_conditional_branch(dfsan_label label) { extern "C" void __dfsw___polytracker_log_conditional_branch(uint64_t conditional, dfsan_label conditional_label) { - if (!polytracker_is_initialized()) { - return; - } __polytracker_log_conditional_branch(conditional_label); } @@ -60,28 +46,20 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, dfsan_label function_label, dfsan_label 
*ret_label) { - if (!polytracker_is_initialized()) { - return 0; - } if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, functionid); } + *ret_label = conditional_label; return conditional; } extern "C" void __polytracker_enter_function(uint32_t function_id, const char* function_name) { - if (!polytracker_is_initialized()) { - return; - } get_polytracker_tdag().enter_function(function_id); get_polytracker_tdag().record_function_name(function_id, std::string_view(function_name)); } extern "C" void __polytracker_leave_function(uint32_t function_id) { - if (!polytracker_is_initialized()) { - return; - } get_polytracker_tdag().leave_function(function_id); } \ No newline at end of file From 06954dd2a37a77a71b7d213f283f449b59f0d8aa Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 23:41:48 +0000 Subject: [PATCH 028/112] inserts missing semicolon --- polytracker/src/polytracker/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/src/polytracker/main.cpp b/polytracker/src/polytracker/main.cpp index 9f095409..4653afe6 100644 --- a/polytracker/src/polytracker/main.cpp +++ b/polytracker/src/polytracker/main.cpp @@ -87,7 +87,7 @@ polytrackers settings 3. Set rest to default if possible and error if no polypath. 
*/ void polytracker_get_settings() { - DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_db_name) + DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_db_name); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stderr_sink); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stdout_sink); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stdin_source); From 1116696914f8dcf2d1c0f65ebb74179bfac306a7 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Thu, 12 Dec 2024 23:56:37 +0000 Subject: [PATCH 029/112] remove extraneous header --- unittests/src/taintdag/tdag.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index a00c18f3..766fcda2 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -1,5 +1,4 @@ #include -#include #include "taintdag/outputfile.h" #include "taintdag/section.h" From 5dda1830cfc78be9e3dab8306765fb5964339cb4 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 20:25:12 +0000 Subject: [PATCH 030/112] gets rid of the argument --taint since we do that currently by default --- examples/Dockerfile-acropalypse.demo | 2 +- examples/Dockerfile-daedalus-nitf.demo | 2 +- examples/Dockerfile-daedalus-pdf.demo | 2 +- examples/Dockerfile-ffmpeg.demo | 2 +- examples/Dockerfile-file.demo | 2 +- examples/Dockerfile-jq.demo | 2 +- examples/Dockerfile-libjpeg.demo | 2 +- examples/Dockerfile-mupdf.demo | 2 +- examples/Dockerfile-nitro-nitf.demo | 2 +- examples/Dockerfile-openjpeg.demo | 2 +- examples/Dockerfile-poppler.demo | 4 +-- examples/Dockerfile-qpdf.demo | 2 +- examples/Dockerfile-xpdf.demo | 6 ++-- examples/analysis/ubet/Dockerfile.nitro | 4 +-- examples/analysis/ubet/build_nitro.sh | 4 +-- examples/analysis/ubet/eval.py | 2 +- examples/http/httpd/Dockerfile | 2 +- examples/http/picohttpparser/Dockerfile | 2 +- polytracker/build.py | 41 ++++++------------------- tests/conftest.py | 2 +- 20 files changed, 33 insertions(+), 56 deletions(-) diff --git 
a/examples/Dockerfile-acropalypse.demo b/examples/Dockerfile-acropalypse.demo index 95f88910..e4fd23db 100644 --- a/examples/Dockerfile-acropalypse.demo +++ b/examples/Dockerfile-acropalypse.demo @@ -27,5 +27,5 @@ RUN CPPFLAGS="-I$(pwd)/zlib-1.2.13/include" LDFLAGS="-L$(pwd)/zlib-1.2.13/lib" p RUN polytracker extract-bc -o pngtest.bc pngtest RUN llvm-link -o pngtest-linked.bc pngtest.bc libz.bc -RUN polytracker instrument-bc --taint --cflog pngtest-linked.bc -o instrumented.bc +RUN polytracker instrument-bc --cflog pngtest-linked.bc -o instrumented.bc RUN polytracker lower-bc instrumented.bc -t pngtest -o pngtest.instrumented diff --git a/examples/Dockerfile-daedalus-nitf.demo b/examples/Dockerfile-daedalus-nitf.demo index ebdfabeb..e8ae88ae 100644 --- a/examples/Dockerfile-daedalus-nitf.demo +++ b/examples/Dockerfile-daedalus-nitf.demo @@ -30,5 +30,5 @@ RUN cabal run ../../:daedalus -- compile-c++ nitf_main.ddl --out-dir=cpp_parser WORKDIR /polytracker/the_klondike/daedalus/formats/nitf/cpp_parser RUN polytracker build make parser && \ - polytracker instrument-targets --taint --cflog parser --ignore-lists gmp ssl libz && \ + polytracker instrument-targets --cflog parser --ignore-lists gmp ssl libz && \ mv parser.instrumented parser-track diff --git a/examples/Dockerfile-daedalus-pdf.demo b/examples/Dockerfile-daedalus-pdf.demo index f8d9edd5..816ee44e 100644 --- a/examples/Dockerfile-daedalus-pdf.demo +++ b/examples/Dockerfile-daedalus-pdf.demo @@ -30,5 +30,5 @@ WORKDIR /polytracker/the_klondike/daedalus/formats/pdf/new/c++ RUN polytracker build cmake -S . 
-B build RUN polytracker build cmake --build build --target parser-test -j$(nproc) -RUN polytracker instrument-targets --taint --cflog parser-test --ignore-lists gmp ssl libz +RUN polytracker instrument-targets --cflog parser-test --ignore-lists gmp ssl libz RUN mv parser-test.instrumented parser-test-track \ No newline at end of file diff --git a/examples/Dockerfile-ffmpeg.demo b/examples/Dockerfile-ffmpeg.demo index 91bcb336..a8d90d50 100644 --- a/examples/Dockerfile-ffmpeg.demo +++ b/examples/Dockerfile-ffmpeg.demo @@ -27,7 +27,7 @@ RUN ../configure --disable-everything \ --disable-asm RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog ffmpeg_g --ignore-lists libz +RUN polytracker instrument-targets --cflog ffmpeg_g --ignore-lists libz RUN mv ffmpeg_g.instrumented ffmpeg_track # Use the following command to transcode a `.mov` h264/aac file to an `.avi` raw/aac file diff --git a/examples/Dockerfile-file.demo b/examples/Dockerfile-file.demo index ceb94a1a..09c779f8 100644 --- a/examples/Dockerfile-file.demo +++ b/examples/Dockerfile-file.demo @@ -16,5 +16,5 @@ RUN git fetch --tags && \ RUN autoreconf -fiv RUN ./configure --prefix=/polytracker/the_klondike/bin/ --disable-shared RUN polytracker build make -j$((`nproc`+1)) install -RUN polytracker instrument-targets --taint --cflog file --ignore-lists libz +RUN polytracker instrument-targets --cflog file --ignore-lists libz RUN mv file.instrumented file_track diff --git a/examples/Dockerfile-jq.demo b/examples/Dockerfile-jq.demo index 27dabc9a..4d1c260f 100644 --- a/examples/Dockerfile-jq.demo +++ b/examples/Dockerfile-jq.demo @@ -11,4 +11,4 @@ WORKDIR /polytracker/the_klondike/jq RUN autoreconf -fi RUN ./configure --with-oniguruma=builtin CC=clang RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog jq \ No newline at end of file +RUN polytracker instrument-targets --cflog jq \ No newline at end of file diff --git 
a/examples/Dockerfile-libjpeg.demo b/examples/Dockerfile-libjpeg.demo index 80809ce2..09f341e2 100644 --- a/examples/Dockerfile-libjpeg.demo +++ b/examples/Dockerfile-libjpeg.demo @@ -18,6 +18,6 @@ WORKDIR /polytracker/the_klondike/jpeg-9e/build RUN ../configure LDFLAGS="-static" # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog djpeg +RUN polytracker instrument-targets --cflog djpeg # Create `djpeg_track` RUN mv djpeg.instrumented djpeg_track \ No newline at end of file diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index 12945516..ca59d1c0 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -22,7 +22,7 @@ WORKDIR /polytracker/the_klondike/mupdf RUN git checkout d00de0e96a4a5ec90ffc30837d40cd624a6a89e0 # Instrument mutool RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr/local build=release install -RUN polytracker instrument-targets --taint --cflog mutool +RUN polytracker instrument-targets --cflog mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is intended to be mounted at runtime VOLUME ["/workdir"] diff --git a/examples/Dockerfile-nitro-nitf.demo b/examples/Dockerfile-nitro-nitf.demo index e63c4201..d13b3181 100644 --- a/examples/Dockerfile-nitro-nitf.demo +++ b/examples/Dockerfile-nitro-nitf.demo @@ -18,6 +18,6 @@ RUN polytracker build cmake .. \ RUN polytracker build cmake --build . 
-j$((`nproc`+1)) --target show_nitf++ -RUN polytracker instrument-targets --taint --cflog show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_track diff --git a/examples/Dockerfile-openjpeg.demo b/examples/Dockerfile-openjpeg.demo index 7d758d7b..ba2cd59b 100644 --- a/examples/Dockerfile-openjpeg.demo +++ b/examples/Dockerfile-openjpeg.demo @@ -20,5 +20,5 @@ RUN polytracker extract-bc bin/opj_decompress -o opj_decompress.bc RUN polytracker extract-bc bin/libopenjp2.a -o libopenjp2.a.bc RUN llvm-link -only-needed opj_decompress.bc libopenjp2.a.bc -o exec.bc RUN polytracker opt-bc exec.bc -o exec.bc -RUN polytracker instrument-bc --taint --cflog exec.bc -o exec.bc -o exec.instrumented.bc +RUN polytracker instrument-bc --cflog exec.bc -o exec.bc -o exec.instrumented.bc RUN polytracker lower-bc exec.instrumented.bc -t opj_decompress -o opj_decompress_track diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index b67d50ff..1f964559 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -44,7 +44,7 @@ RUN polytracker build cmake -S . 
-B build \ RUN polytracker build cmake --build build -j$(nproc) # pdftotext (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --cflog pdftotext --ignore-lists freetype fontconfig +RUN polytracker instrument-targets --cflog pdftotext --ignore-lists freetype fontconfig # pdftops (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --cflog pdftops --ignore-lists freetype fontconfig \ No newline at end of file +RUN polytracker instrument-targets --cflog pdftops --ignore-lists freetype fontconfig \ No newline at end of file diff --git a/examples/Dockerfile-qpdf.demo b/examples/Dockerfile-qpdf.demo index 9360d7f2..c685579b 100644 --- a/examples/Dockerfile-qpdf.demo +++ b/examples/Dockerfile-qpdf.demo @@ -23,5 +23,5 @@ WORKDIR /polytracker/the_klondike/qpdf RUN polytracker build cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBUILD_STATIC_LIBS=ON #Instrument and build track target RUN polytracker build cmake --build build -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog qpdf --ignore-lists libz +RUN polytracker instrument-targets --cflog qpdf --ignore-lists libz RUN mv qpdf.instrumented qpdf_track diff --git a/examples/Dockerfile-xpdf.demo b/examples/Dockerfile-xpdf.demo index 95ab1972..e80fb571 100644 --- a/examples/Dockerfile-xpdf.demo +++ b/examples/Dockerfile-xpdf.demo @@ -91,8 +91,8 @@ RUN polytracker build make -j$(nproc) install # pdftops.instrumented, pdftotext.instrumented, and pdfinfo.instrumented # These commands are split up for timing / debugging purposes but you could # run them all as one big instrument-targets as well. 
-RUN polytracker instrument-targets --taint --cflog pdftotext --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --cflog pdftotext --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --cflog pdfinfo --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --cflog pdfinfo --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --cflog pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file +RUN polytracker instrument-targets --cflog pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file diff --git a/examples/analysis/ubet/Dockerfile.nitro b/examples/analysis/ubet/Dockerfile.nitro index 93ef1436..761346cb 100644 --- a/examples/analysis/ubet/Dockerfile.nitro +++ b/examples/analysis/ubet/Dockerfile.nitro @@ -30,7 +30,7 @@ RUN polytracker build cmake --build . -j$((`nproc`+1)) --target show_nitf++ --co RUN cp modules/c++/nitf/show_nitf++ nitro_Release -RUN polytracker instrument-targets --taint --cflog show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackRelease @@ -45,7 +45,7 @@ RUN polytracker build cmake ../.. \ RUN polytracker build cmake --build . 
-j$((`nproc`+1)) --clean-first --target show_nitf++ --config Debug RUN cp modules/c++/nitf/show_nitf++ nitro_Debug -RUN polytracker instrument-targets --taint --cflog show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackDebug diff --git a/examples/analysis/ubet/build_nitro.sh b/examples/analysis/ubet/build_nitro.sh index 2b53039e..6e88735d 100755 --- a/examples/analysis/ubet/build_nitro.sh +++ b/examples/analysis/ubet/build_nitro.sh @@ -14,7 +14,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP echo "Optmize bitcode" polytracker opt-bc --output O3.bc after_preoptO3.bc echo "Instrument optimized bitcode" -polytracker instrument-bc --cflog --taint --output instrumentedO3.bc O3.bc +polytracker instrument-bc --cflog --output instrumentedO3.bc O3.bc echo "Lower optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackRelease instrumentedO3.bc @@ -36,7 +36,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP cp after_preoptO0.bc O0.bc echo "Instrument non-optimized bitcode" -polytracker instrument-bc --cflog --taint --output instrumentedO0.bc O0.bc +polytracker instrument-bc --cflog --output instrumentedO0.bc O0.bc echo "Lower non-optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackDebug instrumentedO0.bc diff --git a/examples/analysis/ubet/eval.py b/examples/analysis/ubet/eval.py index 36344f34..0e219d18 100644 --- a/examples/analysis/ubet/eval.py +++ b/examples/analysis/ubet/eval.py @@ -37,7 +37,7 @@ def polytracker_build(cmdline): def polytracker_instrument(bin): - command = ["/usr/bin/env", "polytracker", "instrument-targets", "--taint", bin] + command = ["/usr/bin/env", "polytracker", "instrument-targets", "--cflog", bin] target_name = f"{bin}.instrumented" if not no_build: subprocess.call(command, cwd=src_dir) diff --git a/examples/http/httpd/Dockerfile b/examples/http/httpd/Dockerfile index cdf34a9c..46385bc6 
100644 --- a/examples/http/httpd/Dockerfile +++ b/examples/http/httpd/Dockerfile @@ -42,7 +42,7 @@ RUN CFLAGS="-I$(pwd)/srclib/pcre -I$(pwd)/srclib/expat/lib" \ LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog httpd +RUN polytracker instrument-targets --cflog httpd RUN mv httpd.instrumented httpd_track # overwrite binary to be installed with our polytracker-instrumented version RUN cp httpd_track httpd diff --git a/examples/http/picohttpparser/Dockerfile b/examples/http/picohttpparser/Dockerfile index f1d96b99..9e4624c9 100644 --- a/examples/http/picohttpparser/Dockerfile +++ b/examples/http/picohttpparser/Dockerfile @@ -10,7 +10,7 @@ COPY Makefile example_picohttpparser.c /polytracker/examples/http/picohttpparser # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --cflog example_picohttpparser +RUN polytracker instrument-targets --cflog example_picohttpparser RUN mv example_picohttpparser.instrumented example_picohttpparser_track # Note, the /workdir and /testcase directories are intended to be mounted at runtime diff --git a/polytracker/build.py b/polytracker/build.py index 0b4b0fb7..8e983f52 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -205,11 +205,7 @@ def _instrument_bitcode( str(POLY_PASS_PATH), ] - pass_pipeline: List[str] = [] - pass_pipeline.append("pt-taint") - - pass_pipeline += ["pt-dfsan", "pt-rm-fn-attr"] - + pass_pipeline: List[str] = ["pt-taint", "pt-dfsan", "pt-rm-fn-attr"] cmd.append(f"-passes={','.join(pass_pipeline)}") # ignore lists for `pt-taint` @@ -316,12 +312,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="output bitcode file", ) - parser.add_argument( - "--taint", - action="store_true", - help="instrument with taint tracking", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -330,13 +320,10 @@ def 
__init_arguments__(self, parser: argparse.ArgumentParser): ) def run(self, args: argparse.Namespace): - if args.taint: - _instrument_bitcode( - args.input, - args.output, - args.ignore_lists) - else: - raise ValueError("No action was specified. Try using the argument --taint?") + _instrument_bitcode( + args.input, + args.output, + args.ignore_lists) class LowerBitcode(Command): @@ -393,12 +380,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="path to blight journal", ) - parser.add_argument( - "--taint", - action="store_true", - help="instrument with taint tracking", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -413,9 +394,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): ) def run(self, args: argparse.Namespace): - if not args.taint and not args.cflog: - raise ValueError("Did you specify an action? Try --taint or --cflog") - for target in args.targets: blight_cmds = _read_blight_journal(args.journal_path) target_cmd, target_path = _find_target(target, blight_cmds) @@ -431,9 +409,8 @@ def run(self, args: argparse.Namespace): _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") - if args.taint: - _instrument_bitcode( - input_bitcode=opt_bc, - output_bitcode=inst_bc_path, - ignore_lists=args.ignore_lists) + _instrument_bitcode( + input_bitcode=opt_bc, + output_bitcode=inst_bc_path, + ignore_lists=args.ignore_lists) _lower_bitcode(inst_bc_path, Path(inst_bc_path.stem), target_cmd) diff --git a/tests/conftest.py b/tests/conftest.py index cac9f56b..5fa31b3d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,7 +35,7 @@ def build(target: Path, binary: Path) -> None: def instrument(target: str) -> None: - cmd = ["instrument-targets", "--taint", "--cflog", target] + cmd = ["instrument-targets", "--cflog", target] run_polytracker(cmd) From 581f21cbf4fb1d1db77dab9010c72702a2478234 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 21:57:24 +0000 Subject: [PATCH 
031/112] add guards since for some reason the new functions section means the old cflog section doesn't read the same way --- polytracker/taint_dag.py | 64 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 13f193f0..a318917a 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -136,29 +136,18 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) -class ControlFlowEvent: - callstack: List - label: int - def __init__(self, callstack: List, label: int = None): - """Callstack at the point the event occurred""" - self.callstack = callstack - self.label = label - - def __repr__(self, typ, callstack: List, label: int = None): - return f"{typ}: label {label}, callstack {callstack}" - -class CFEnterFunctionEvent(ControlFlowEvent): +class CFEnterFunctionEvent: """Emitted whenever execution enters a function. The callstack member is the callstack right before entering the function, having the function just entered as the last member of the callstack. """ def __init__(self, callstack: List): - super().__init__(callstack) + self.callstack = callstack def __repr__(self) -> str: - ControlFlowEvent.__repr__(type(CFEnterFunctionEvent), self.callstack, None) + return f"CFEnterFunctionEvent: {self.callstack}" def __eq__(self, __o: object) -> bool: if isinstance(__o, CFEnterFunctionEvent): @@ -166,17 +155,17 @@ def __eq__(self, __o: object) -> bool: return False -class CFLeaveFunctionEvent(ControlFlowEvent): +class CFLeaveFunctionEvent: """Emitted whenever execution leaves a function. The callstack member is the callstack right before leaving the function, having the function about to leave as the last member of the callstack. 
""" def __init__(self, callstack: List): - super().__init__(callstack) + self.callstack = callstack def __repr__(self) -> str: - ControlFlowEvent.__repr__(type(CFLeaveFunctionEvent), self.callstack, None) + return f"CFLeaveFunctionEvent: {self.callstack}" def __eq__(self, __o: object) -> bool: if isinstance(__o, CFLeaveFunctionEvent): @@ -184,17 +173,18 @@ def __eq__(self, __o: object) -> bool: return False -class TaintedControlFlowEvent(ControlFlowEvent): +class TaintedControlFlowEvent: """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. Current callstack (including the function the control flow happened in) is available in the `callstack` member.""" - def __init__(self, callstack: List, label: int): - super().__init__(callstack, label) + def __init__(self, callstack: List, label: int = None): + self.callstack = callstack + self.label = label def __repr__(self) -> str: - ControlFlowEvent.__repr__(type(TaintedControlFlowEvent), self.callstack, self.label) + return f"TaintedControlFlowEvent: {self.label}, {self.callstack}" def __eq__(self, __o: object) -> bool: if isinstance(__o, TaintedControlFlowEvent): @@ -202,6 +192,13 @@ def __eq__(self, __o: object) -> bool: return False +ControlFlowEvent = Union[ + CFEnterFunctionEvent, + CFLeaveFunctionEvent, + TaintedControlFlowEvent, +] + + class TDControlFlowLogSection: """TDAG Control flow log section. 
@@ -495,9 +492,6 @@ def __init__(self, file: BinaryIO) -> None: self.fd_headers: List[Tuple[Path, TDFDHeader]] = list(self.read_fd_headers()) - def _get_section(self, wanted_type: Type[TDSection]) -> TDSection: - return self.sections_by_type[wanted_type] - def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: sources = self.sections_by_type[TDSourceSection] strings = self.sections_by_type[TDStringSection] @@ -523,7 +517,9 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: """Presents the control flow log. Does not demangle symbols by default, for performance.""" + print(self.sections_by_type.keys()) cflog_section = self.sections_by_type[TDControlFlowLogSection] + assert isinstance(cflog_section, TDControlFlowLogSection) if demangle_symbols: for cflog_entry in cflog_section: @@ -531,7 +527,7 @@ def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: yield cflog_entry else: - cflog_section.__iter__() + cflog_section() def input_labels(self) -> Iterator[int]: """Enumerates all taint labels that are input labels (source taint)""" @@ -853,14 +849,14 @@ def __init_arguments__(self, parser): "--print-function-trace", "-t", action="store_true", - help="print function trace events", + help="print function trace", ) parser.add_argument( "--print-control-flow-log", "-c", action="store_true", - help="print function trace events", + help="print control flow log events", ) def run(self, args): @@ -882,9 +878,15 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - for k,v in tdfile.mangled_fn_symbol_lookup: - print(f"function_id '{k}': function '{demangle(v)}'") + if TDFunctionsSection in tdfile.sections and len(tdfile.mangled_fn_symbol_lookup) > 0: + for k,v in tdfile.mangled_fn_symbol_lookup: + print(f"function_id '{k}': function '{demangle(v)}'") + else: + print("No Functions section could be read from the tdag!") 
if args.print_control_flow_log: - for event in tdfile.cflog(demangle_symbols=True): - print(str(event)) + if TDControlFlowLogSection in tdfile.sections: + for event in tdfile.cflog(demangle_symbols=True): + print(event) + else: + print("No Control Flow Log section could be read from the tdag! Consider trying to read it with an earlier Polytracker version?") From 2625d02c1377f76bbfa28651b0a15d149952557d Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 22:29:24 +0000 Subject: [PATCH 032/112] retain (deprecated) backward compatibility to be able to read older tdags that don't have the function mapping in them --- polytracker/taint_dag.py | 49 ++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index a318917a..036f677e 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -27,6 +27,7 @@ c_uint16, sizeof, ) +from typing_extensions import deprecated from .plugins import Command from .repl import PolyTrackerREPL @@ -93,6 +94,27 @@ def enumerate(self): for offset in range(0, len(self.mem), sizeof(TDFDHeader)): yield TDFDHeader.from_buffer_copy(self.mem[offset:]) +@deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") +class TDEvent(Structure): + """This is an old version of the ControlFlowEvent kept for backward compatibility only""" + _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] + + class Kind(Enum): + ENTRY = 0 + EXIT = 1 + + def __repr__(self) -> str: + return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" + +@deprecated("Use TDControlFlowLog instead, TDEvents section is no longer written") +class TDEventsSection: + """This is an old version of the CFLog kept for backward compatibility only""" + def __init__(self, mem, hdr): + self.section = mem[hdr.offset : hdr.offset + hdr.size] + + def __iter__(self): + for offset in range(0, len(self.section), sizeof(TDEvent)): + yield 
TDEvent.from_buffer_copy(self.section, offset) class TDStringSection: """TDAG String Table section. @@ -431,6 +453,7 @@ def __repr__(self) -> str: TDSinkSection, TDSourceIndexSection, TDFunctionsSection, + TDEventsSection, TDControlFlowLogSection, ] @@ -474,11 +497,8 @@ def __init__(self, file: BinaryIO) -> None: self.sections.append(TDFunctionsSection(self.buffer, hdr)) self.sections_by_type[TDFunctionsSection] = self.sections[-1] elif hdr.tag == 7: - continue - # todo(kaoudis): change tag indices and remove this - # this will break compatibility with old tdags - # self.sections.append(TDEventsSection(self.buffer, hdr)) - # self.sections_by_type[TDEventsSection] = self.sections[-1] + self.sections.append(TDEventsSection(self.buffer, hdr)) + self.sections_by_type[TDEventsSection] = self.sections[-1] elif hdr.tag == 8: self.sections.append(TDControlFlowLogSection(self.buffer, hdr)) self.sections_by_type[TDControlFlowLogSection] = self.sections[-1] @@ -515,15 +535,22 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: return lookup + def _maybe_demangle(self, function_id: int) -> Union[str, int]: + """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary. """ + maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) + if maybe_symbol is not None: + return demangle(maybe_symbol) + else: + return function_id + def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: """Presents the control flow log. 
Does not demangle symbols by default, for performance.""" - print(self.sections_by_type.keys()) cflog_section = self.sections_by_type[TDControlFlowLogSection] assert isinstance(cflog_section, TDControlFlowLogSection) if demangle_symbols: for cflog_entry in cflog_section: - cflog_entry.callstack[:] = [demangle(self.mangled_fn_symbol_lookup[function_id]) for function_id in cflog_entry.callstack] + cflog_entry.callstack[:] = [self._maybe_demangle(function_id) for function_id in cflog_entry.callstack] yield cflog_entry else: @@ -882,11 +909,13 @@ def run(self, args): for k,v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: - print("No Functions section could be read from the tdag!") + print("Error: no Functions section could be read from the tdag!") + print(f"Sections that could be read: {tdfile.sections.keys()}") if args.print_control_flow_log: - if TDControlFlowLogSection in tdfile.sections: + if TDControlFlowLogSection in tdfile.sections_by_type.keys(): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print("No Control Flow Log section could be read from the tdag! Consider trying to read it with an earlier Polytracker version?") + print("Error: no Control Flow Log section could be read from the tdag! 
Consider trying to read it with an earlier Polytracker version?") + print(f"Sections that could be read: {tdfile.sections}") From ec28b4acfb8f992533df043eaa7df017515a531a Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 22:30:16 +0000 Subject: [PATCH 033/112] fix sections reference --- polytracker/taint_dag.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 036f677e..5ee5017e 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -905,12 +905,12 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - if TDFunctionsSection in tdfile.sections and len(tdfile.mangled_fn_symbol_lookup) > 0: + if TDFunctionsSection in tdfile.sections_by_type.keys() and len(tdfile.mangled_fn_symbol_lookup) > 0: for k,v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: print("Error: no Functions section could be read from the tdag!") - print(f"Sections that could be read: {tdfile.sections.keys()}") + print(f"Sections that could be read: {tdfile.sections}") if args.print_control_flow_log: if TDControlFlowLogSection in tdfile.sections_by_type.keys(): From a6a4db1606208a2df58977e50e3021ab9f165f63 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 22:31:06 +0000 Subject: [PATCH 034/112] remove slightly inaccurate message --- polytracker/taint_dag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 5ee5017e..02ae0fbb 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -917,5 +917,5 @@ def run(self, args): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print("Error: no Control Flow Log section could be read from the tdag! 
Consider trying to read it with an earlier Polytracker version?") + print("Error: no Control Flow Log section could be read from the tdag!") print(f"Sections that could be read: {tdfile.sections}") From d9d1702b2ddbb48f0d15fa327f2642b7b88f6ef2 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 23:03:00 +0000 Subject: [PATCH 035/112] related to previous change, fix test --- tests/test_cf_log.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 0d8e962d..ae6d80a2 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -41,11 +41,12 @@ def test_label_mapping(program_trace: ProgramTrace): cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] for cflog_entry in cflog: - if cflog_entry.label is not None: + if type(cflog_entry) == TaintedControlFlowEvent: + assert hasattr(cflog_entry, 'label') node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) assert node.affects_control_flow else: - assert cflog_entry.label is None + assert not hasattr(cflog_entry, 'label') @pytest.mark.program_trace("test_cf_log.cpp") From 0d84bb862897698756f9bae0c6af029fc0473ea1 Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 23:03:23 +0000 Subject: [PATCH 036/112] slightly clearer label repr for tainted event --- polytracker/taint_dag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 02ae0fbb..24aeedd3 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -206,7 +206,7 @@ def __init__(self, callstack: List, label: int = None): self.label = label def __repr__(self) -> str: - return f"TaintedControlFlowEvent: {self.label}, {self.callstack}" + return f"TaintedControlFlowEvent: taint label {self.label} | {self.callstack}" def __eq__(self, __o: object) -> bool: if isinstance(__o, TaintedControlFlowEvent): From e9a864f3123cb5d85a328172ddafe23e819c7c7a Mon 
Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 23:04:03 +0000 Subject: [PATCH 037/112] fix weird compiler warning that is oddly in the region I was just looking at to debug the failing stdin tests --- .../sanitizer_common/sanitizer_posix_libcdep.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index d29438cf..9238b0f5 100644 --- a/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -164,8 +164,12 @@ bool SupportsColoredOutput(fd_t fd) { } #if !SANITIZER_GO -// TODO(glider): different tools may require different altstack size. -static const uptr kAltStackSize = SIGSTKSZ * 4; // SIGSTKSZ is not enough. +static uptr GetAltStackSize() { + // Note: since GLIBC_2.31, SIGSTKSZ may be a function call, so this may be + // more costly that you think. However GetAltStackSize is only call 2-3 times + // per thread so don't cache the evaluation. + return SIGSTKSZ * 4; +} void SetAlternateSignalStack() { stack_t altstack, oldstack; @@ -176,10 +180,10 @@ void SetAlternateSignalStack() { // TODO(glider): the mapped stack should have the MAP_STACK flag in the // future. It is not required by man 2 sigaltstack now (they're using // malloc()). - void* base = MmapOrDie(kAltStackSize, __func__); + void* base = MmapOrDie(GetAltStackSize(), __func__); altstack.ss_sp = (char*) base; altstack.ss_flags = 0; - altstack.ss_size = kAltStackSize; + altstack.ss_size = GetAltStackSize(); CHECK_EQ(0, sigaltstack(&altstack, nullptr)); } @@ -187,7 +191,7 @@ void UnsetAlternateSignalStack() { stack_t altstack, oldstack; altstack.ss_sp = nullptr; altstack.ss_flags = SS_DISABLE; - altstack.ss_size = kAltStackSize; // Some sane value required on Darwin. 
+ altstack.ss_size = GetAltStackSize(); // Some sane value required on Darwin. CHECK_EQ(0, sigaltstack(&altstack, &oldstack)); UnmapOrDie(oldstack.ss_sp, oldstack.ss_size); } From 8c7c6aa8689e4a0b8b91281c9671205beb53a41d Mon Sep 17 00:00:00 2001 From: kaoudis Date: Fri, 13 Dec 2024 23:08:37 +0000 Subject: [PATCH 038/112] instrument-targets msg improvement while I'm thinking about ordering of cmd --- polytracker/build.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polytracker/build.py b/polytracker/build.py index 8e983f52..7232bd89 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -363,7 +363,7 @@ def run(self, args: argparse.Namespace): class InstrumentTargets(Command): name = "instrument-targets" - help = "instruments blight journal build targets with polytracker" + help = "instruments blight journal build targets with polytracker for dynamic taint analysis" def __init_arguments__(self, parser: argparse.ArgumentParser): parser.add_argument( @@ -390,7 +390,7 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): parser.add_argument( "--cflog", action="store_true", - help="instrument with function tracing and control affecting dataflow logging", + help="also instrument with function tracing and control affecting dataflow logging IN ADDITION TO the default dynamic taint analysis instrumentation passes", ) def run(self, args: argparse.Namespace): From 5dd7208d4442aa7b18fbc5182af605c73a2bbe19 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:20:46 +0000 Subject: [PATCH 039/112] fix seg faulting test binary: a string_view over unowned data may present a null pointer, and literal comparison is not intended for strings - use strncmp instead --- tests/test_stdin.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tests/test_stdin.cpp b/tests/test_stdin.cpp index 105b041f..bfd1ff91 100644 --- a/tests/test_stdin.cpp +++ b/tests/test_stdin.cpp @@ -1,7 
+1,7 @@ -#include #include #include -#include +#include +#include int stdin_read() { char inbyte; @@ -97,26 +97,27 @@ int stdin_getchar_unlocked() { } // Reads from stdin using different methods based on argv[1] -// the following functions can be used -// read, int main(int argc, char *argv[]) { - assert(argc == 2); - std::string_view method{argv[1]}; - - if (method == "read") { + if (argc != 2) { + exit(EXIT_FAILURE); + } + + if (std::strncmp(argv[1], "read", 4) == 0) { + printf("got read\n"); stdin_read(); - } else if (method == "fread") { + } else if (std::strncmp(argv[1], "fread", 5) == 0) { stdin_fread(); - } else if (method == "getc") { + } else if (std::strncmp(argv[1], "getc", 4) == 0) { stdin_getc(); - } else if (method == "getc_unlocked") { + } else if (std::strncmp(argv[1], "getc_unlocked", 13) == 0) { stdin_getc_unlocked(); - } else if (method == "getchar") { + } else if (std::strncmp(argv[1], "getchar", 7) == 0) { stdin_getchar(); - } else if (method == "getchar_unlocked") { + } else if (std::strncmp(argv[1], "getchar_unlocked", 16) == 0) { stdin_getchar_unlocked(); - } else if (method == "fgetc") { + } else if (std::strncmp(argv[1], "fgetc", 5) == 0) { stdin_fgetc(); } - return 0; + + exit(EXIT_SUCCESS); } \ No newline at end of file From 4eb2abe2d64def94cd7f6bf31b7d4e8e7345c1a1 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:21:23 +0000 Subject: [PATCH 040/112] add a C++ standard to C++ test binary building so test binaries behave more uniformly across similar platforms --- tests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5fa31b3d..ee0e2eaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,10 +26,11 @@ def build(target: Path, binary: Path) -> None: cmd = ["build"] if target.suffix == ".cpp": - cmd.append("clang++") + cmd += ["clang++", "-std=c++20"] else: cmd.append("clang") + # debugging and want symbols? 
add -O0 here cmd += ["-g", "-o", str(binary), str(target)] run_polytracker(cmd) From 6828c79b93914437a8d0b0b5533baf16564b4dcc Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:22:15 +0000 Subject: [PATCH 041/112] moves DRY usage out of the test framework and into the called code so that it is easier to debug a singular failing test or many without commenting out half the test setup code, which is gross --- tests/test_stdin.py | 98 +++++++++++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 25 deletions(-) diff --git a/tests/test_stdin.py b/tests/test_stdin.py index ef6c2034..ce7b15b4 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -2,43 +2,91 @@ import subprocess import polytracker +from polytracker import taint_dag + from pathlib import Path +from random import choice +from string import printable +# Ensure stdin reads in multiple ways are verified +# examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw -@pytest.mark.program_trace("test_stdin.cpp") -@pytest.mark.parametrize( - "method", - ["read", "fread", "getc", "getc_unlocked", "getchar", "getchar_unlocked", "fgetc"], -) -def test_stdin_read(instrumented_binary: Path, trace_file: Path, method: str): - # Data to write to stdin, one byte at a time - stdin_data = "abcdefghi\njklmnopqr" - - subprocess.run( - [str(instrumented_binary), method], - input=stdin_data.encode("utf-8"), - env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": str(1)}, - ).check_returncode() - program_trace = polytracker.PolyTrackerTrace.load(trace_file) - - # Ensure /dev/stdin is in the list of inputs - assert "/dev/stdin" in [x.path for x in program_trace.inputs] - - n = 0 +_stdin_data = '\n'.join(choice(printable) for _ in range(40)).encode("utf-8") + +def _run(instrumented_binary: Path, trace_file: Path, method: str) -> None: + """It's important to split out any DRY from the test framework so it's possible to see when an 
individual test fails.""" + try: + subprocess.run( + args=[str(instrumented_binary), method], + env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1"}, + stderr=subprocess.STDOUT, + input=_stdin_data, + close_fds=False, + check=True + ) + except subprocess.CalledProcessError as e: + # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode + print(f"Error code: {e.returncode}") + print(f"Got back: {e.output}") + print(f"stdout: {e.stdout}") + print(f"stderr: {e.stderr}") + +def _test_out(program_trace: taint_dag.TDProgramTrace) -> None: + assert "/dev/stdin" in [input.path for input in program_trace.inputs] + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) assert isinstance(src_node, polytracker.taint_dag.TDSourceNode) # Requires that offsets are ordered according to read - assert src_node.offset == n + assert src_node.offset == expected_offset # Ensure all source labels originate from stdin assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") - n += 1 + expected_offset += 1 # Should be as many source labels as the length of stdin_data - assert n == len(stdin_data) + assert expected_offset == len(_stdin_data) +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_read(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "read") + # if running the instrumented binary fails before trace creation, we might have no tdag out. 
+ program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) -# Ensure stdin reads in multiple ways are verified -# examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_fread(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "fread") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getc(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "getc") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "getc_unlocked") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "getchar") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "getchar_unlocked") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + _test_out(program_trace) + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): + _run(instrumented_binary, trace_file, "fgetc") + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + 
_test_out(program_trace) From a9dac537d8b7cc9a8e363ab36d86dd5673596bd9 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:24:04 +0000 Subject: [PATCH 042/112] adds explanatory comment --- tests/test_stdin.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_stdin.py b/tests/test_stdin.py index ce7b15b4..5d9ae254 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -32,6 +32,7 @@ def _run(instrumented_binary: Path, trace_file: Path, method: str) -> None: print(f"stderr: {e.stderr}") def _test_out(program_trace: taint_dag.TDProgramTrace) -> None: + """Test the resulting tdag program trace, checking its inputs to make sure we worked with tainted stdin""" assert "/dev/stdin" in [input.path for input in program_trace.inputs] expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): From 6d4c0ec19ead813254f3bc0b6e0af2be0dab2f03 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:28:53 +0000 Subject: [PATCH 043/112] removes extraneous comments --- tests/test_stdin.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/test_stdin.py b/tests/test_stdin.py index 5d9ae254..cdfe1bba 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -32,27 +32,23 @@ def _run(instrumented_binary: Path, trace_file: Path, method: str) -> None: print(f"stderr: {e.stderr}") def _test_out(program_trace: taint_dag.TDProgramTrace) -> None: - """Test the resulting tdag program trace, checking its inputs to make sure we worked with tainted stdin""" + """Test the resulting tdag program trace, checking its inputs to make sure + we tainted and tracked every byte of stdin. 
Offsets must be ordered as they + were read.""" assert "/dev/stdin" in [input.path for input in program_trace.inputs] expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) assert isinstance(src_node, polytracker.taint_dag.TDSourceNode) - - # Requires that offsets are ordered according to read assert src_node.offset == expected_offset - - # Ensure all source labels originate from stdin assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") expected_offset += 1 - # Should be as many source labels as the length of stdin_data assert expected_offset == len(_stdin_data) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_read(instrumented_binary: Path, trace_file: Path): _run(instrumented_binary, trace_file, "read") - # if running the instrumented binary fails before trace creation, we might have no tdag out. program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) _test_out(program_trace) From 5109db7b99c93d7b13fd4dd6d00432e85029f55a Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 21 Feb 2025 16:43:16 +0000 Subject: [PATCH 044/112] minor naming cleanup --- tests/test_stdin.py | 76 ++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 43 deletions(-) diff --git a/tests/test_stdin.py b/tests/test_stdin.py index cdfe1bba..c8f37875 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -13,29 +13,26 @@ _stdin_data = '\n'.join(choice(printable) for _ in range(40)).encode("utf-8") -def _run(instrumented_binary: Path, trace_file: Path, method: str) -> None: - """It's important to split out any DRY from the test framework so it's possible to see when an individual test fails.""" - try: - subprocess.run( - args=[str(instrumented_binary), method], - env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1"}, - stderr=subprocess.STDOUT, - input=_stdin_data, - close_fds=False, - check=True - ) - 
except subprocess.CalledProcessError as e: - # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode - print(f"Error code: {e.returncode}") - print(f"Got back: {e.output}") - print(f"stdout: {e.stdout}") - print(f"stderr: {e.stderr}") +def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) -> None: + """Rather than using pytest.mark.parametrize on this setup function, split + out DRY from the test framework so it's easy to see when an individual test + fails.""" + # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode + subprocess.run( + args=[str(instrumented_binary), method], + env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1"}, + stderr=subprocess.STDOUT, + input=_stdin_data, + close_fds=False, + ).check_returncode() -def _test_out(program_trace: taint_dag.TDProgramTrace) -> None: - """Test the resulting tdag program trace, checking its inputs to make sure - we tainted and tracked every byte of stdin. Offsets must be ordered as they - were read.""" +def _test_trace(trace_file: Path) -> None: + """Test the tdag output, checking its inputs to make sure we tainted and + tracked every byte of stdin. 
Offsets must be ordered as they were read.""" + + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) assert "/dev/stdin" in [input.path for input in program_trace.inputs] + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) @@ -43,47 +40,40 @@ def _test_out(program_trace: taint_dag.TDProgramTrace) -> None: assert src_node.offset == expected_offset assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") expected_offset += 1 - + assert expected_offset == len(_stdin_data) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_read(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "read") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "read") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fread(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "fread") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "fread") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "getc") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "getc") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "getc_unlocked") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - 
_test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "getc_unlocked") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "getchar") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "getchar") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "getchar_unlocked") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "getchar_unlocked") + _test_trace(trace_file) @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): - _run(instrumented_binary, trace_file, "fgetc") - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) - _test_out(program_trace) + _create_tdag_trace(instrumented_binary, trace_file, "fgetc") + _test_trace(trace_file) From 34011d4d82bb7dbf14364c889725b4911c032c7e Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:17:58 +0000 Subject: [PATCH 045/112] finish commented out tdag test --- unittests/src/taintdag/tdag.cpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 766fcda2..4793e5f1 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -390,16 +390,15 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } } - // TEST_CASE("An allocation that is larger than can be represented in the string table will result in error", "[StringTable]") { - // auto alloc_size = - // 
static_cast(std::numeric_limits::max()) + - // 1; - // alignas(StringTable::offset_t) uint8_t backing[64]; - // int dummy = 1; - // StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - // auto span = StringTable::span_t{&backing[0], alloc_size}; - // REQUIRE_THROWS_AS( - // st, - // test::ErrorExit); - // } + TEST_CASE("An allocation that is larger than can be represented in the string table will result in truncation and does not prevent adding more strings", "[StringTable]") { + auto alloc_size = + static_cast(std::numeric_limits::max()) + + 1; + alignas(StringTable::offset_t) uint8_t backing[64]; + int dummy = 1; + StringTable st{SectionArg{.output_file = dummy, .range = backing}}; + auto span = StringTable::span_t{&backing[0], alloc_size}; + std::string_view tinystring{"eep"}; + REQUIRE_NOTHROW(st.add_string(tinystring)); + } } // namespace taintdag \ No newline at end of file From 412b786772ad3c7177fc862ccef30a86e62fd740 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:28:39 +0000 Subject: [PATCH 046/112] add test inputs locale to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 798be876..7019f8c5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ __pycache__ /Default/ polytracker.egg-info /polytracker.egg-info -compile_commands.json \ No newline at end of file +compile_commands.json +/test_inputs/* From 5f67bb115392a0772b042b8a5413faefab668fad Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:37:53 +0000 Subject: [PATCH 047/112] trunk fmt go brrr --- examples/analysis/ubet/eval.py | 5 +- examples/analysis/ubet/eval_nitro.py | 17 +- polytracker/include/taintdag/fnmapping.h | 7 +- polytracker/include/taintdag/polytracker.h | 3 +- polytracker/include/taintdag/section.h | 3 +- polytracker/include/taintdag/string_table.h | 12 +- .../src/passes/tainted_control_flow.cpp | 261 +++++++++--------- 
polytracker/src/polytracker/polytracker.cpp | 6 +- polytracker/src/taintdag/fnmapping.cpp | 10 +- polytracker/src/taintdag/polytracker.cpp | 3 +- polytracker/taint_dag.py | 81 +++--- tests/conftest.py | 9 +- tests/test_cf_log.py | 35 ++- tests/test_program_trace.py | 11 +- tests/test_stdin.cpp | 4 +- tests/test_stdin.py | 41 ++- unittests/src/taintdag/tdag.cpp | 56 ++-- 17 files changed, 310 insertions(+), 254 deletions(-) diff --git a/examples/analysis/ubet/eval.py b/examples/analysis/ubet/eval.py index 0e219d18..c8ef1328 100644 --- a/examples/analysis/ubet/eval.py +++ b/examples/analysis/ubet/eval.py @@ -1,14 +1,13 @@ # /usr/bin/python import os import random -import sys import subprocess -from typing import List, Tuple +import sys from pathlib import Path +from typing import List, Tuple from polytracker import PolyTrackerTrace - src_arg = Path(sys.argv[1]) no_build = "nobuild" == sys.argv[2] if len(sys.argv) > 2 else False src_dir = src_arg.parent diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index c171091a..062052f8 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -1,19 +1,20 @@ import argparse -from collections import defaultdict -import subprocess import os +import subprocess import sys -from typing import Optional, Set, Iterator, Tuple, Dict -from polytracker import PolyTrackerTrace, taint_dag -from polytracker.taint_dag import TDFile, TDNode, TDSourceNode, TDUnionNode, TDRangeNode -from polytracker.mapping import InputOutputMapping +from collections import defaultdict +from functools import partialmethod from pathlib import Path +from typing import Dict, Iterator, Optional, Set, Tuple + +import cxxfilt # To Silence TQDM! 
from tqdm import tqdm -from functools import partialmethod -import cxxfilt +from polytracker import PolyTrackerTrace, taint_dag +from polytracker.mapping import InputOutputMapping +from polytracker.taint_dag import TDFile, TDNode, TDRangeNode, TDSourceNode, TDUnionNode tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 685db4fa..09d50d5e 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -25,8 +25,8 @@ struct Function { offset_t name_offset; uint32_t function_id; - Function(offset_t name_ofs, uint32_t f_id) : - name_offset(name_ofs), function_id(f_id) {}; + Function(offset_t name_ofs, uint32_t f_id) + : name_offset(name_ofs), function_id(f_id){}; }; class Functions : public FixedSizeAlloc { @@ -42,7 +42,8 @@ class Functions : public FixedSizeAlloc { : FixedSizeAlloc{of.range}, string_table{of.output_file.template section()} {} - std::optional add_mapping(uint32_t function_id, std::string_view function_name); + std::optional add_mapping(uint32_t function_id, + std::string_view function_name); private: StringTable &string_table; diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index a1afd679..7e10f69b 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -65,7 +65,8 @@ class PolyTracker { void leave_function(uint32_t function_id); // Log function name - void record_function_name(uint32_t function_id, std::string_view function_name); + void record_function_name(uint32_t function_id, + std::string_view function_name); // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); diff --git a/polytracker/include/taintdag/section.h b/polytracker/include/taintdag/section.h index ac2efe09..d4682026 100644 --- a/polytracker/include/taintdag/section.h 
+++ b/polytracker/include/taintdag/section.h @@ -142,7 +142,8 @@ template struct FixedSizeAlloc : SectionBase { .t = *new (&*(write_context->mem.begin())) T{std::forward(args)...}}; } - spdlog::error("Failed to allocate memory in the section for the object, so could not construct the object in the tdag section"); + spdlog::error("Failed to allocate memory in the section for the object, so " + "could not construct the object in the tdag section"); return {}; } diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index 4368872d..7b064a41 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -30,7 +30,7 @@ struct StringTable : public SectionBase { // that can be expressed. static constexpr size_t max_entry_size = std::min(static_cast(std::numeric_limits::max()), - max_offset - sizeof(length_t)); + max_offset - sizeof(length_t)); static constexpr uint8_t tag{3}; static constexpr size_t allocation_size{0x100000}; @@ -45,19 +45,23 @@ struct StringTable : public SectionBase { // by using `from_offset`. 
std::optional add_string(std::string_view sv) { if ((sv.size() + sizeof(length_t)) > max_entry_size) { - spdlog::info("Tried to store a string of size {0:d} but max is {1:d} (will truncate string)", sv.size(), max_entry_size); + spdlog::info("Tried to store a string of size {0:d} but max is {1:d} " + "(will truncate string)", + sv.size(), max_entry_size); size_t to_truncate = max_entry_size - sizeof(length_t) - 1; sv = sv.substr(0, to_truncate); if ((sv.size() + sizeof(length_t)) > max_entry_size) { - error_exit("Truncated string was too big: ", sv.size() + sizeof(length_t)); + error_exit("Truncated string was too big: ", + sv.size() + sizeof(length_t)); } } auto len = allocated_len(sv.size()); if (auto write_context = write(len)) { - // todo(kaoudis) this is possibly a type confusion issue resulting in truncation since size_t is bigger than the current length_t + // todo(kaoudis) this is possibly a type confusion issue resulting in + // truncation since size_t is bigger than the current length_t *reinterpret_cast(&*(write_context->mem.begin())) = sv.size(); // copy string diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 73f21552..e4336697 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -22,162 +22,163 @@ static llvm::cl::list ignore_lists( "pt-ftrace-ignore-list", - llvm::cl::desc( - "File that specifies functions that pt-tcf should ignore")); + llvm::cl::desc("File that specifies functions that pt-tcf should ignore")); namespace polytracker { namespace { - uint32_t get_or_add_mapping(uintptr_t key, std::unordered_map &mapping, uint32_t &counter) { - if (auto it = mapping.find(key); it != mapping.end()) { - return it->second; - } else { - return mapping[key] = counter++; - } +uint32_t get_or_add_mapping(uintptr_t key, + std::unordered_map &mapping, + uint32_t &counter) { + if (auto it = mapping.find(key); it != mapping.end()) { 
+ return it->second; + } else { + return mapping[key] = counter++; } +} } // namespace - void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); - } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); - } - - llvm::ConstantInt * - TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { - auto func_address = reinterpret_cast(&func); - auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); - return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); - } - - llvm::ConstantInt * - TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { - return get_function_id_const(*(i.getParent()->getParent())); +void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, + llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } - - void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; - } - - // we do not handle VectorTypes yet - if ((*(idx->getType())).isVectorTy()) { - continue; - } - - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); - - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); +} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { + auto func_address = reinterpret_cast(&func); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); + return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); 
+} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { + return get_function_id_const(*(i.getParent()->getParent())); +} + +void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + llvm::IRBuilder<> ir(&gep); + for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; } - } - void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; + // we do not handle VectorTypes yet + if ((*(idx->getType())).isVectorTy()) { + continue; } - llvm::IRBuilder<> ir(&bi); - auto cond = bi.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + auto callret = ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), + get_function_id_const(gep)}); - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + idx = ir.CreateSExtOrTrunc(callret, idx->getType()); } +} - void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; } - void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. 
- if (si.getType()->isVectorTy()) { - return; - } - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); + llvm::IRBuilder<> ir(&bi); + auto cond = bi.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); - } + bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} - void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::LLVMContext *context = &mod.getContext(); - llvm::IRBuilder<> ir(*context); +void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); - cond_br_log_fn = mod.getOrInsertFunction( - "__polytracker_log_tainted_control_flow", - llvm::AttributeList::get( - mod.getContext(), - {{llvm::AttributeList::FunctionIndex, - llvm::Attribute::get(mod.getContext(), - llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} - fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); - - fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); +void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + // TODO(hbrodin): Can't handle atm. 
+ if (si.getType()->isVectorTy()) { + return; } - - void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { - if (func.isDeclaration()) { - return; - } - llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - - ir.CreateCall(fn_enter_log_fn, - { - get_function_id_const(func), - ir.CreateGlobalStringPtr(func.getName()) - } - ); + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} + +void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); + + cond_br_log_fn = mod.getOrInsertFunction( + "__polytracker_log_tainted_control_flow", + llvm::AttributeList::get( + mod.getContext(), + {{llvm::AttributeList::FunctionIndex, + llvm::Attribute::get(mod.getContext(), + llvm::Attribute::ReadNone)}}), + ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + + enter_log_fn_type = llvm::FunctionType::get( + llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), + llvm::Type::getInt8PtrTy(*context)); + + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", + enter_log_fn_type); + + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", + ir.getVoidTy(), ir.getInt32Ty()); +} + +void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { + if (func.isDeclaration()) { + return; } - - void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); - } - - llvm::PreservedAnalyses - TaintedControlFlowPass::run(llvm::Module &mod, - llvm::ModuleAnalysisManager &mam) { - label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); - declareLoggingFunctions(mod); - auto 
fnsToIgnore{readIgnoreLists(ignore_lists)}; - - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fnsToIgnore.count(fname.str())) { - continue; - } else { - instrumentFunctionEnter(fn); - visit(fn); - } + llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); + + ir.CreateCall(fn_enter_log_fn, {get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName())}); +} + +void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { + llvm::IRBuilder<> ir(&ri); + ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); +} + +llvm::PreservedAnalyses +TaintedControlFlowPass::run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam) { + label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); + declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + + for (auto &fn : mod) { + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); } - - return llvm::PreservedAnalyses::none(); } + return llvm::PreservedAnalyses::none(); +} + } // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 1dc3149d..3ced27b2 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -55,9 +55,11 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id, const char* function_name) { +extern "C" void __polytracker_enter_function(uint32_t function_id, + const char *function_name) { get_polytracker_tdag().enter_function(function_id); - get_polytracker_tdag().record_function_name(function_id, std::string_view(function_name)); + get_polytracker_tdag().record_function_name(function_id, + std::string_view(function_name)); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git 
a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index f1dbf831..c57e4ed5 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -19,14 +19,16 @@ using index_t = Functions::index_t; } // namespace -std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { +std::optional Functions::add_mapping(uint32_t function_id, + std::string_view function_name) { std::unique_lock mappings_lock(mappings_mutex); if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } - std::optional maybe_name_offset = string_table.add_string(function_name); + std::optional maybe_name_offset = + string_table.add_string(function_name); if (!maybe_name_offset.has_value()) { spdlog::error("Could not write function name to strings table"); return {}; @@ -34,7 +36,9 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ auto maybe_ctx = construct(Function(maybe_name_offset.value(), function_id)); if (!maybe_ctx.has_value()) { - spdlog::error("Could not write Function {0} with id {1:d}, string table ofs {2:d} to the tdag functions section", function_name, function_id, maybe_name_offset.value()); + spdlog::error("Could not write Function {0} with id {1:d}, string table " + "ofs {2:d} to the tdag functions section", + function_name, function_id, maybe_name_offset.value()); return {}; } diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index ff540f9e..fa905187 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -178,7 +178,8 @@ void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { output_file_.section().tainted_control_flow(lbl, function_id); } -void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { +void PolyTracker::record_function_name(uint32_t function_id, + std::string_view 
function_name) { output_file_.section().add_mapping(function_id, function_name); } diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 24aeedd3..56d6a4bb 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -1,47 +1,47 @@ +from ctypes import ( + Structure, + c_char, + c_int32, + c_int64, + c_uint8, + c_uint16, + c_uint32, + c_uint64, + sizeof, +) +from enum import Enum +from mmap import PROT_READ, mmap +from pathlib import Path from typing import ( BinaryIO, - Union, + Dict, Iterable, Iterator, - Optional, - Dict, - Tuple, List, + Optional, Set, + Tuple, Type, + Union, cast, ) from cxxfilt import demangle -from enum import Enum -from pathlib import Path -from mmap import mmap, PROT_READ -from ctypes import ( - Structure, - c_char, - c_int64, - c_uint64, - c_int32, - c_uint32, - c_uint8, - c_uint16, - sizeof, -) from typing_extensions import deprecated +from .inputs import Input from .plugins import Command -from .repl import PolyTrackerREPL from .polytracker import ProgramTrace -from .inputs import Input +from .repl import PolyTrackerREPL from .taint_forest import TaintForest, TaintForestNode from .tracing import ( BasicBlock, ByteOffset, Function, TaintAccess, - TraceEvent, TaintOutput, Taints, + TraceEvent, ) @@ -94,9 +94,11 @@ def enumerate(self): for offset in range(0, len(self.mem), sizeof(TDFDHeader)): yield TDFDHeader.from_buffer_copy(self.mem[offset:]) + @deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") class TDEvent(Structure): """This is an old version of the ControlFlowEvent kept for backward compatibility only""" + _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] class Kind(Enum): @@ -106,9 +108,11 @@ class Kind(Enum): def __repr__(self) -> str: return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" + @deprecated("Use TDControlFlowLog instead, TDEvents section is no longer written") class TDEventsSection: """This is an old version of the CFLog kept for backward compatibility 
only""" + def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -116,6 +120,7 @@ def __iter__(self): for offset in range(0, len(self.section), sizeof(TDEvent)): yield TDEvent.from_buffer_copy(self.section, offset) + class TDStringSection: """TDAG String Table section. @@ -355,6 +360,7 @@ class TDFunctionsSection: """This section holds the mapping between the function IDs stored in callstack form in the cflog section, and the function names stored in the string table. See fnmapping in the C++ part of the codebase for the "write" side part of Polytracker that pertains to this section. Each entry is an uint32_t as set in fnmapping.cpp, but a TDFnHeader will then contain *two* of these: the function_id and the name_offset. Structure in memory: |offset|function id|...""" + def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -382,10 +388,8 @@ def invalid_fd(self): class TDFnHeader(Structure): # This corresponds to the Function inline constructor in fnmapping.h. # Anything using Structure needs to be in sync with the corresponding C++. - _fields_ = [ - ("name_offset", c_uint32), - ("function_id", c_uint32) - ] + _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] + class TDNode: def __init__(self, affects_control_flow: bool = False): @@ -445,7 +449,6 @@ def __repr__(self) -> str: return f"TDSink fdidx: {self.fdidx} offset: {self.offset} label: {self.label}" - TDSection = Union[ TDLabelSection, TDSourceSection, @@ -536,21 +539,24 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: return lookup def _maybe_demangle(self, function_id: int) -> Union[str, int]: - """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary. 
""" + """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary.""" maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) if maybe_symbol is not None: return demangle(maybe_symbol) else: return function_id - def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: + def cflog(self, demangle_symbols: bool = False) -> Iterator[ControlFlowEvent]: """Presents the control flow log. Does not demangle symbols by default, for performance.""" cflog_section = self.sections_by_type[TDControlFlowLogSection] assert isinstance(cflog_section, TDControlFlowLogSection) if demangle_symbols: for cflog_entry in cflog_section: - cflog_entry.callstack[:] = [self._maybe_demangle(function_id) for function_id in cflog_entry.callstack] + cflog_entry.callstack[:] = [ + self._maybe_demangle(function_id) + for function_id in cflog_entry.callstack + ] yield cflog_entry else: @@ -863,13 +869,13 @@ def __init_arguments__(self, parser): "--print-taint-sinks", "-s", action="store_true", - help="print taint sinks", + help="print taint sinks (very slow for large traces)", ) parser.add_argument( "--print-taint-nodes", "-n", action="store_true", - help="print taint nodes", + help="print taint nodes (very slow for large traces)", ) parser.add_argument( @@ -883,7 +889,7 @@ def __init_arguments__(self, parser): "--print-control-flow-log", "-c", action="store_true", - help="print control flow log events", + help="print control flow log events (very slow for large traces)", ) def run(self, args): @@ -905,8 +911,11 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - if TDFunctionsSection in tdfile.sections_by_type.keys() and len(tdfile.mangled_fn_symbol_lookup) > 0: - for k,v in tdfile.mangled_fn_symbol_lookup: + if ( + 
TDFunctionsSection in tdfile.sections_by_type.keys() + and len(tdfile.mangled_fn_symbol_lookup) > 0 + ): + for k, v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: print("Error: no Functions section could be read from the tdag!") @@ -917,5 +926,7 @@ def run(self, args): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print("Error: no Control Flow Log section could be read from the tdag!") + print( + "Error: no Control Flow Log section could be read from the tdag!" + ) print(f"Sections that could be read: {tdfile.sections}") diff --git a/tests/conftest.py b/tests/conftest.py index ee0e2eaa..627da284 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,12 @@ -import sys -import pytest import subprocess -import polytracker - +import sys from pathlib import Path from typing import List +import pytest + +import polytracker + def pytest_configure(config): config.addinivalue_line( diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index ae6d80a2..d67b6d22 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -1,20 +1,21 @@ +import subprocess +from pathlib import Path +from typing import List + import cxxfilt import pytest -import subprocess import polytracker -from pathlib import Path - +from polytracker import ProgramTrace from polytracker.taint_dag import ( - ControlFlowEvent, CFEnterFunctionEvent, CFLeaveFunctionEvent, + ControlFlowEvent, TaintedControlFlowEvent, TDControlFlowLogSection, TDNode, ) -from polytracker import ProgramTrace -from typing import List + @pytest.mark.program_trace("test_fntrace.cpp") def test_function_mapping(program_trace: ProgramTrace): @@ -25,9 +26,12 @@ def test_function_mapping(program_trace: ProgramTrace): for symbol in mangled_symbols: assert cxxfilt.demangle(symbol) in expected_names + @pytest.mark.program_trace("test_fntrace.cpp") def test_callstack_mapping(program_trace: ProgramTrace): - cflog: TDControlFlowLogSection = 
program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] for cflog_entry in cflog: assert len(cflog_entry.callstack) > 0 @@ -36,17 +40,20 @@ def test_callstack_mapping(program_trace: ProgramTrace): # when we look up the function id it should map to a name we traced assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup + @pytest.mark.program_trace("test_fntrace.cpp") def test_label_mapping(program_trace: ProgramTrace): - cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] for cflog_entry in cflog: if type(cflog_entry) == TaintedControlFlowEvent: - assert hasattr(cflog_entry, 'label') + assert hasattr(cflog_entry, "label") node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) assert node.affects_control_flow else: - assert not hasattr(cflog_entry, 'label') + assert not hasattr(cflog_entry, "label") @pytest.mark.program_trace("test_cf_log.cpp") @@ -81,9 +88,7 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): CFEnterFunctionEvent(["main", "f1(unsigned char)"]), TaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), CFEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TaintedControlFlowEvent( - ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 - ), + TaintedControlFlowEvent(["main", "f1(unsigned char)", "f2(unsigned char)"], 7), CFLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), CFLeaveFunctionEvent(["main", "f1(unsigned char)"]), CFLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit @@ -100,4 +105,6 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): for entry in cflog: for callstack_entry in entry.callstack: - assert callstack_entry in 
list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) \ No newline at end of file + assert callstack_entry in list( + program_trace.tdfile.mangled_fn_symbol_lookup.values() + ) diff --git a/tests/test_program_trace.py b/tests/test_program_trace.py index b6d765da..2ec6906e 100644 --- a/tests/test_program_trace.py +++ b/tests/test_program_trace.py @@ -1,14 +1,10 @@ from collections import defaultdict -import pytest from subprocess import CalledProcessError from typing import Dict, Union -from polytracker import ( - BasicBlockEntry, - FunctionEntry, - FunctionReturn, - ProgramTrace, -) +import pytest + +from polytracker import BasicBlockEntry, FunctionEntry, FunctionReturn, ProgramTrace @pytest.mark.skip(reason="taint_dag does not support traces yet") @@ -182,6 +178,7 @@ def test_cxx_global_object(program_trace: ProgramTrace): assert taints[0].offset == 1 assert taints[0].length == 1 + @pytest.mark.skip(reason="the Taint Forest is currently not implemented") @pytest.mark.program_trace("test_simple_union.cpp", input="ABCDEFGH\n11235878\n") def test_taint_forest(program_trace: ProgramTrace): diff --git a/tests/test_stdin.cpp b/tests/test_stdin.cpp index bfd1ff91..632f59b0 100644 --- a/tests/test_stdin.cpp +++ b/tests/test_stdin.cpp @@ -1,7 +1,7 @@ #include -#include #include #include +#include int stdin_read() { char inbyte; @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { if (argc != 2) { exit(EXIT_FAILURE); } - + if (std::strncmp(argv[1], "read", 4) == 0) { printf("got read\n"); stdin_read(); diff --git a/tests/test_stdin.py b/tests/test_stdin.py index c8f37875..f117055c 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -1,20 +1,23 @@ -import pytest import subprocess - -import polytracker -from polytracker import taint_dag - from pathlib import Path from random import choice from string import printable +import pytest + +import polytracker +from polytracker import taint_dag + # Ensure stdin reads in multiple ways are verified # examples: getc, 
fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw -_stdin_data = '\n'.join(choice(printable) for _ in range(40)).encode("utf-8") +_stdin_data = "\n".join(choice(printable) for _ in range(40)).encode("utf-8") + -def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) -> None: - """Rather than using pytest.mark.parametrize on this setup function, split +def _create_tdag_trace( + instrumented_binary: Path, trace_file: Path, method: str +) -> None: + """Rather than using pytest.mark.parametrize on this setup function, split out DRY from the test framework so it's easy to see when an individual test fails.""" # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode @@ -26,13 +29,16 @@ def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) close_fds=False, ).check_returncode() + def _test_trace(trace_file: Path) -> None: - """Test the tdag output, checking its inputs to make sure we tainted and + """Test the tdag output, checking its inputs to make sure we tainted and tracked every byte of stdin. 
Offsets must be ordered as they were read.""" - - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load( + trace_file + ) assert "/dev/stdin" in [input.path for input in program_trace.inputs] - + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) @@ -40,39 +46,46 @@ def _test_trace(trace_file: Path) -> None: assert src_node.offset == expected_offset assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") expected_offset += 1 - + assert expected_offset == len(_stdin_data) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_read(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "read") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fread(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fread") _test_trace(trace_file) - + + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getc") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getc_unlocked") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar_unlocked") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def 
test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fgetc") diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 4793e5f1..8554355f 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -1,11 +1,11 @@ #include +#include "taintdag/labels.h" #include "taintdag/outputfile.h" #include "taintdag/section.h" #include "taintdag/storage.h" #include "taintdag/string_table.h" #include "taintdag/taint_source.h" -#include "taintdag/labels.h" #include "utils.h" @@ -244,12 +244,15 @@ struct DummyOutputFile { StringTable &string_table; }; -TEST_CASE("The Sources and StringTable sections can be used to store source entries", "[Sources, StringTable]") { +TEST_CASE( + "The Sources and StringTable sections can be used to store source entries", + "[Sources, StringTable]") { OutputFile of{std::tmpnam(nullptr)}; auto &sources_section{of.section()}; auto &string_table{of.section()}; - SECTION("Can add taint-source entries to the Sources section", "[Sources, StringTable]") { + SECTION("Can add taint-source entries to the Sources section", + "[Sources, StringTable]") { int fd = 3; REQUIRE(!sources_section.mapping_idx(fd)); @@ -267,7 +270,8 @@ TEST_CASE("The Sources and StringTable sections can be used to store source entr REQUIRE(m1.size == 122); int fd2 = 99; - auto s2 = sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); + auto s2 = + sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); REQUIRE(s2.has_value()); auto idx2 = sources_section.mapping_idx(fd2); @@ -281,7 +285,8 @@ TEST_CASE("The Sources and StringTable sections can be used to store source entr } WHEN("Adding taint-sources to the Sources section and the string table") { - THEN("Latest wins in terms in case output_file has multiple mappings for the same fd") { + THEN("Latest wins in terms in case output_file has multiple mappings for " + "the same fd") { int fd = 1; 
sources_section.add_source("first", fd); sources_section.add_source("second", fd); @@ -331,7 +336,8 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { string_table.add_string("d"); std::vector res; - std::copy(string_table.begin(), string_table.end(), std::back_inserter(res)); + std::copy(string_table.begin(), string_table.end(), + std::back_inserter(res)); REQUIRE(res.size() == 4); REQUIRE(res[0] == "a"); REQUIRE(res[1] == "b"); @@ -341,18 +347,20 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } WHEN("Adding to the string table") { - THEN("A string bigger than the maximum string size will be truncated and stored") { + THEN("A string bigger than the maximum string size will be truncated and " + "stored") { // display the info logging spdlog::set_level(spdlog::level::debug); auto len = StringTable::max_entry_size + 10; std::string too_big(len, 'A'); - REQUIRE_NOTHROW([&](){ + REQUIRE_NOTHROW([&]() { auto offset = string_table.add_string(too_big); REQUIRE(offset.has_value()); std::string_view result = string_table.from_offset(offset.value()); - REQUIRE(result.size() + sizeof(StringTable::length_t) == StringTable::max_entry_size - 1); + REQUIRE(result.size() + sizeof(StringTable::length_t) == + StringTable::max_entry_size - 1); }()); } @@ -367,10 +375,11 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } } - THEN("Add a maximumly big string and will still be able to add other strings") { + THEN("Add a maximumly big string and will still be able to add other " + "strings") { auto size = StringTable::max_entry_size - sizeof(StringTable::length_t); std::string s(size, 'A'); - REQUIRE_NOTHROW([&](){ + REQUIRE_NOTHROW([&]() { auto offset = string_table.add_string(s); REQUIRE(offset.has_value()); auto result = string_table.from_offset(offset.value()); @@ -390,15 +399,18 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } } - TEST_CASE("An allocation that is larger than can be represented in the string table will result in truncation 
and does not prevent adding more strings", "[StringTable]") { - auto alloc_size = - static_cast(std::numeric_limits::max()) + - 1; - alignas(StringTable::offset_t) uint8_t backing[64]; - int dummy = 1; - StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - auto span = StringTable::span_t{&backing[0], alloc_size}; - std::string_view tinystring{"eep"}; - REQUIRE_NOTHROW(st.add_string(tinystring)); - } +TEST_CASE( + "An allocation that is larger than can be represented in the string table " + "will result in truncation and does not prevent adding more strings", + "[StringTable]") { + auto alloc_size = + static_cast(std::numeric_limits::max()) + + 1; + alignas(StringTable::offset_t) uint8_t backing[64]; + int dummy = 1; + StringTable st{SectionArg{.output_file = dummy, .range = backing}}; + auto span = StringTable::span_t{&backing[0], alloc_size}; + std::string_view tinystring{"eep"}; + REQUIRE_NOTHROW(st.add_string(tinystring)); +} } // namespace taintdag \ No newline at end of file From 45ce1cf30d8cc3b812671d88c209b7d2551c6a3d Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:38:41 +0000 Subject: [PATCH 048/112] fix some, but not all, bitrot in Dockerfiles not included in the polytracker paper eval --- examples/Dockerfile-listgen.demo | 4 ++-- examples/Dockerfile-mupdf.demo | 4 ++-- examples/Dockerfile-pdfium.demo | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/Dockerfile-listgen.demo b/examples/Dockerfile-listgen.demo index 0b1417e8..6f1aa272 100644 --- a/examples/Dockerfile-listgen.demo +++ b/examples/Dockerfile-listgen.demo @@ -11,7 +11,7 @@ RUN apt update #Update pkg-config/util-linux (needed for FontConfig) RUN apt update RUN apt install pkg-config uuid-dev gperf libtool \ - gettext autopoint autoconf -y + gettext autopoint autoconf wget -y RUN apt-get install python3-dev @@ -38,7 +38,7 @@ RUN make -j5 install WORKDIR /polytracker/the_klondike #zlib -RUN wget 
https://www.zlib.net/zlib-1.2.11.tar.gz +RUN wget https://www.zlib.net/fossils/zlib-1.2.11.tar.gz RUN tar -xzvf zlib-1.2.11.tar.gz WORKDIR zlib-1.2.11 RUN ./configure --prefix=/usr && make -j$(nproc) test && make -j$(nproc) install diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index ca59d1c0..afab29d9 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -25,5 +25,5 @@ RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr RUN polytracker instrument-targets --cflog mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is intended to be mounted at runtime -VOLUME ["/workdir"] -WORKDIR /workdir \ No newline at end of file +# VOLUME ["/workdir"] +# WORKDIR /workdir \ No newline at end of file diff --git a/examples/Dockerfile-pdfium.demo b/examples/Dockerfile-pdfium.demo index 6d536b52..77942a3a 100644 --- a/examples/Dockerfile-pdfium.demo +++ b/examples/Dockerfile-pdfium.demo @@ -9,6 +9,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata \ RUN DEBIAN_FRONTEND=noninteractive apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + curl \ git \ pkg-config \ sudo \ From 78576f5b2e8534e2f268318468d158e53737d71b Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:44:59 +0000 Subject: [PATCH 049/112] removes docs todo --- docs/tdag.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/tdag.md b/docs/tdag.md index 8fbbb4ca..f41e35c6 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -37,9 +37,9 @@ Some specifics: - [Sources](../polytracker/src/taint_sources/taint_sources.cpp) contains source labels (byte offsets into the input) - The Source Label Index is a bitmap that defines how to index the sources section. 
- [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) -- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table is used in conjunction with the fnmapping to put together an earlier version of the control flow log used for grammar extraction. Note that the string table also contains other things! -- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this contains an early version of the function list part of the control flow log used for grammar extraction -- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. +- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. +- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. 
## TDAG Contents From 794ad8bb11bd84b6c8d080a241d195f78f88926c Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:46:30 +0000 Subject: [PATCH 050/112] trunk lint --- docs/tdag.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tdag.md b/docs/tdag.md index f41e35c6..fa6a6f24 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -39,7 +39,7 @@ Some specifics: - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) - [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. -- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. 
## TDAG Contents From e855aadbe319a61f0c12ba03c22dee139f7ab6e6 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:47:44 +0000 Subject: [PATCH 051/112] removes stray todo --- docs/tdag.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tdag.md b/docs/tdag.md index fa6a6f24..4a5105d0 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -37,7 +37,7 @@ Some specifics: - [Sources](../polytracker/src/taint_sources/taint_sources.cpp) contains source labels (byte offsets into the input) - The Source Label Index is a bitmap that defines how to index the sources section. - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) -- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. +- [Strings](../polytracker/include/taintdag/string_table.h) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. - [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. 
see test_cf_log.py for details of how this looks/works. From 1ce1041e481dd470c4997779342d7405851b3f4a Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 16:50:53 +0000 Subject: [PATCH 052/112] stems out the cflog-instrumented bitcode so that it is comparable to the fully instrumented bitcode --- polytracker/build.py | 60 +++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/polytracker/build.py b/polytracker/build.py index 7232bd89..72cdb9f4 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -1,9 +1,9 @@ import argparse -import subprocess -import os import json +import os +import subprocess from pathlib import Path -from typing import List, Dict, Tuple +from typing import Dict, List, Tuple from .plugins import Command @@ -148,9 +148,8 @@ def _optimize_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: def _preopt_instrument_bitcode( - input_bitcode: Path, - output_bitcode: Path, - ignore_lists: List[str]) -> None: + input_bitcode: Path, output_bitcode: Path, ignore_lists: List[str] +) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" ) @@ -182,9 +181,7 @@ def _preopt_instrument_bitcode( def _instrument_bitcode( - input_bitcode: Path, - output_bitcode: Path, - ignore_lists: List[str] + input_bitcode: Path, output_bitcode: Path, ignore_lists: List[str] ) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" @@ -319,11 +316,31 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="specify additional ignore lists to polytracker", ) + parser.add_argument( + "--cflog", + action="store_true", + help="also instrument with function tracing and control affecting dataflow logging IN ADDITION TO the default dynamic taint analysis instrumentation passes", + ) + def run(self, args: argparse.Namespace): - _instrument_bitcode( - args.input, - args.output, - 
args.ignore_lists) + if args.cflog: + cflog_output = Path(f"{args.output.stem}.cflog_instrumented.bc") + _preopt_instrument_bitcode( + input_bitcode=args.input, + output_bitcode=cflog_output, + ignore_lists=args.ignore_lists, + ) + _instrument_bitcode( + input_bitcode=cflog_output, + output_bitcode=args.output, + ignore_lists=args.ignore_lists, + ) + else: + _instrument_bitcode( + input_bitcode=args.input, + output_bitcode=args.output, + ignore_lists=args.ignore_lists, + ) class LowerBitcode(Command): @@ -401,16 +418,23 @@ def run(self, args: argparse.Namespace): opt_bc = bc_path.with_suffix(".opt.bc") _extract_bitcode(target_path, bc_path) if args.cflog: - # Control affecting data flow logging happens before optimization + # Control affecting data flow logging instrumentation happens + # before optimization + cflog_bc_path = Path(f"{bc_path.stem}.cflog_instrumented.bc") _preopt_instrument_bitcode( input_bitcode=bc_path, - output_bitcode=bc_path, - ignore_lists=args.ignore_lists) + output_bitcode=cflog_bc_path, + ignore_lists=args.ignore_lists, + ) + + _optimize_bitcode(input_bitcode=cflog_bc_path, output_bitcode=opt_bc) + else: + _optimize_bitcode(input_bitcode=bc_path, output_bitcode=opt_bc) - _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") _instrument_bitcode( input_bitcode=opt_bc, output_bitcode=inst_bc_path, - ignore_lists=args.ignore_lists) + ignore_lists=args.ignore_lists, + ) _lower_bitcode(inst_bc_path, Path(inst_bc_path.stem), target_cmd) From e4773514d79226ed760251fcd69b9999cb205e66 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 17:30:02 +0000 Subject: [PATCH 053/112] Revert "trunk lint" This reverts commit 794ad8bb11bd84b6c8d080a241d195f78f88926c. 
--- docs/tdag.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tdag.md b/docs/tdag.md index 4a5105d0..a5433b17 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -39,7 +39,7 @@ Some specifics: - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) - [Strings](../polytracker/include/taintdag/string_table.h) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. -- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. ## TDAG Contents From 05c5b846dae1bd890ebeb65b77323ff3c3be3d3e Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 17:30:31 +0000 Subject: [PATCH 054/112] Reapply "trunk lint" This reverts commit e4773514d79226ed760251fcd69b9999cb205e66. 
--- docs/tdag.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tdag.md b/docs/tdag.md index a5433b17..4a5105d0 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -39,7 +39,7 @@ Some specifics: - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) - [Strings](../polytracker/include/taintdag/string_table.h) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. -- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. ## TDAG Contents From d22027e65ca032b26c3379a98b3558346c609312 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 17:30:53 +0000 Subject: [PATCH 055/112] Revert "trunk fmt go brrr" This reverts commit 5f67bb115392a0772b042b8a5413faefab668fad. 
--- examples/analysis/ubet/eval.py | 5 +- examples/analysis/ubet/eval_nitro.py | 17 +- polytracker/include/taintdag/fnmapping.h | 7 +- polytracker/include/taintdag/polytracker.h | 3 +- polytracker/include/taintdag/section.h | 3 +- polytracker/include/taintdag/string_table.h | 12 +- .../src/passes/tainted_control_flow.cpp | 261 +++++++++--------- polytracker/src/polytracker/polytracker.cpp | 6 +- polytracker/src/taintdag/fnmapping.cpp | 10 +- polytracker/src/taintdag/polytracker.cpp | 3 +- polytracker/taint_dag.py | 81 +++--- tests/conftest.py | 9 +- tests/test_cf_log.py | 35 +-- tests/test_program_trace.py | 11 +- tests/test_stdin.cpp | 4 +- tests/test_stdin.py | 41 +-- unittests/src/taintdag/tdag.cpp | 56 ++-- 17 files changed, 254 insertions(+), 310 deletions(-) diff --git a/examples/analysis/ubet/eval.py b/examples/analysis/ubet/eval.py index c8ef1328..0e219d18 100644 --- a/examples/analysis/ubet/eval.py +++ b/examples/analysis/ubet/eval.py @@ -1,13 +1,14 @@ # /usr/bin/python import os import random -import subprocess import sys -from pathlib import Path +import subprocess from typing import List, Tuple +from pathlib import Path from polytracker import PolyTrackerTrace + src_arg = Path(sys.argv[1]) no_build = "nobuild" == sys.argv[2] if len(sys.argv) > 2 else False src_dir = src_arg.parent diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index 062052f8..c171091a 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -1,20 +1,19 @@ import argparse -import os +from collections import defaultdict import subprocess +import os import sys -from collections import defaultdict -from functools import partialmethod +from typing import Optional, Set, Iterator, Tuple, Dict +from polytracker import PolyTrackerTrace, taint_dag +from polytracker.taint_dag import TDFile, TDNode, TDSourceNode, TDUnionNode, TDRangeNode +from polytracker.mapping import InputOutputMapping from pathlib import Path -from 
typing import Dict, Iterator, Optional, Set, Tuple - -import cxxfilt # To Silence TQDM! from tqdm import tqdm +from functools import partialmethod -from polytracker import PolyTrackerTrace, taint_dag -from polytracker.mapping import InputOutputMapping -from polytracker.taint_dag import TDFile, TDNode, TDRangeNode, TDSourceNode, TDUnionNode +import cxxfilt tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 09d50d5e..685db4fa 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -25,8 +25,8 @@ struct Function { offset_t name_offset; uint32_t function_id; - Function(offset_t name_ofs, uint32_t f_id) - : name_offset(name_ofs), function_id(f_id){}; + Function(offset_t name_ofs, uint32_t f_id) : + name_offset(name_ofs), function_id(f_id) {}; }; class Functions : public FixedSizeAlloc { @@ -42,8 +42,7 @@ class Functions : public FixedSizeAlloc { : FixedSizeAlloc{of.range}, string_table{of.output_file.template section()} {} - std::optional add_mapping(uint32_t function_id, - std::string_view function_name); + std::optional add_mapping(uint32_t function_id, std::string_view function_name); private: StringTable &string_table; diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 7e10f69b..a1afd679 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -65,8 +65,7 @@ class PolyTracker { void leave_function(uint32_t function_id); // Log function name - void record_function_name(uint32_t function_id, - std::string_view function_name); + void record_function_name(uint32_t function_id, std::string_view function_name); // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); diff --git a/polytracker/include/taintdag/section.h 
b/polytracker/include/taintdag/section.h index d4682026..ac2efe09 100644 --- a/polytracker/include/taintdag/section.h +++ b/polytracker/include/taintdag/section.h @@ -142,8 +142,7 @@ template struct FixedSizeAlloc : SectionBase { .t = *new (&*(write_context->mem.begin())) T{std::forward(args)...}}; } - spdlog::error("Failed to allocate memory in the section for the object, so " - "could not construct the object in the tdag section"); + spdlog::error("Failed to allocate memory in the section for the object, so could not construct the object in the tdag section"); return {}; } diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index 7b064a41..4368872d 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -30,7 +30,7 @@ struct StringTable : public SectionBase { // that can be expressed. static constexpr size_t max_entry_size = std::min(static_cast(std::numeric_limits::max()), - max_offset - sizeof(length_t)); + max_offset - sizeof(length_t)); static constexpr uint8_t tag{3}; static constexpr size_t allocation_size{0x100000}; @@ -45,23 +45,19 @@ struct StringTable : public SectionBase { // by using `from_offset`. 
std::optional add_string(std::string_view sv) { if ((sv.size() + sizeof(length_t)) > max_entry_size) { - spdlog::info("Tried to store a string of size {0:d} but max is {1:d} " - "(will truncate string)", - sv.size(), max_entry_size); + spdlog::info("Tried to store a string of size {0:d} but max is {1:d} (will truncate string)", sv.size(), max_entry_size); size_t to_truncate = max_entry_size - sizeof(length_t) - 1; sv = sv.substr(0, to_truncate); if ((sv.size() + sizeof(length_t)) > max_entry_size) { - error_exit("Truncated string was too big: ", - sv.size() + sizeof(length_t)); + error_exit("Truncated string was too big: ", sv.size() + sizeof(length_t)); } } auto len = allocated_len(sv.size()); if (auto write_context = write(len)) { - // todo(kaoudis) this is possibly a type confusion issue resulting in - // truncation since size_t is bigger than the current length_t + // todo(kaoudis) this is possibly a type confusion issue resulting in truncation since size_t is bigger than the current length_t *reinterpret_cast(&*(write_context->mem.begin())) = sv.size(); // copy string diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index e4336697..73f21552 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -22,163 +22,162 @@ static llvm::cl::list ignore_lists( "pt-ftrace-ignore-list", - llvm::cl::desc("File that specifies functions that pt-tcf should ignore")); + llvm::cl::desc( + "File that specifies functions that pt-tcf should ignore")); namespace polytracker { namespace { -uint32_t get_or_add_mapping(uintptr_t key, - std::unordered_map &mapping, - uint32_t &counter) { - if (auto it = mapping.find(key); it != mapping.end()) { - return it->second; - } else { - return mapping[key] = counter++; + uint32_t get_or_add_mapping(uintptr_t key, std::unordered_map &mapping, uint32_t &counter) { + if (auto it = mapping.find(key); it != mapping.end()) { + 
return it->second; + } else { + return mapping[key] = counter++; + } } -} } // namespace -void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, + llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + } + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); + } + + llvm::ConstantInt * + TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { + auto func_address = reinterpret_cast(&func); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); + return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); + } + + llvm::ConstantInt * + TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { + return get_function_id_const(*(i.getParent()->getParent())); } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); -} - -llvm::ConstantInt * -TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { - auto func_address = reinterpret_cast(&func); - auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); - return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); -} - -llvm::ConstantInt * -TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { - return get_function_id_const(*(i.getParent()->getParent())); -} - -void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; + + void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + llvm::IRBuilder<> ir(&gep); + 
for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; + } + + // we do not handle VectorTypes yet + if ((*(idx->getType())).isVectorTy()) { + continue; + } + + auto callret = ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), + get_function_id_const(gep)}); + + idx = ir.CreateSExtOrTrunc(callret, idx->getType()); } + } - // we do not handle VectorTypes yet - if ((*(idx->getType())).isVectorTy()) { - continue; + void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; } - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); + llvm::IRBuilder<> ir(&bi); + auto cond = bi.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); } -} -void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; + void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); } - llvm::IRBuilder<> ir(&bi); - auto cond = bi.getCondition(); + void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + // TODO(hbrodin): Can't handle atm. 
+ if (si.getType()->isVectorTy()) { + return; + } + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + } -void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); + void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + cond_br_log_fn = mod.getOrInsertFunction( + "__polytracker_log_tainted_control_flow", + llvm::AttributeList::get( + mod.getContext(), + {{llvm::AttributeList::FunctionIndex, + llvm::Attribute::get(mod.getContext(), + llvm::Attribute::ReadNone)}}), + ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} + enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. 
- if (si.getType()->isVectorTy()) { - return; - } - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} - -void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::LLVMContext *context = &mod.getContext(); - llvm::IRBuilder<> ir(*context); - - cond_br_log_fn = mod.getOrInsertFunction( - "__polytracker_log_tainted_control_flow", - llvm::AttributeList::get( - mod.getContext(), - {{llvm::AttributeList::FunctionIndex, - llvm::Attribute::get(mod.getContext(), - llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); - - enter_log_fn_type = llvm::FunctionType::get( - llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), - llvm::Type::getInt8PtrTy(*context)); - - fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", - enter_log_fn_type); - - fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", - ir.getVoidTy(), ir.getInt32Ty()); -} - -void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { - if (func.isDeclaration()) { - return; + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); + + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); } - llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - - ir.CreateCall(fn_enter_log_fn, {get_function_id_const(func), - ir.CreateGlobalStringPtr(func.getName())}); -} - -void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); -} - -llvm::PreservedAnalyses -TaintedControlFlowPass::run(llvm::Module &mod, - llvm::ModuleAnalysisManager &mam) { - label_ty = llvm::IntegerType::get(mod.getContext(), 
DFSAN_LABEL_BITS); - declareLoggingFunctions(mod); - auto fnsToIgnore{readIgnoreLists(ignore_lists)}; - - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fnsToIgnore.count(fname.str())) { - continue; - } else { - instrumentFunctionEnter(fn); - visit(fn); + + void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { + if (func.isDeclaration()) { + return; } + llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); + + ir.CreateCall(fn_enter_log_fn, + { + get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName()) + } + ); + } + + void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { + llvm::IRBuilder<> ir(&ri); + ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); } - return llvm::PreservedAnalyses::none(); -} + llvm::PreservedAnalyses + TaintedControlFlowPass::run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam) { + label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); + declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + + for (auto &fn : mod) { + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); + } + } + + return llvm::PreservedAnalyses::none(); + } } // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 3ced27b2..1dc3149d 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -55,11 +55,9 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id, - const char *function_name) { +extern "C" void __polytracker_enter_function(uint32_t function_id, const char* function_name) { get_polytracker_tdag().enter_function(function_id); - get_polytracker_tdag().record_function_name(function_id, - 
std::string_view(function_name)); + get_polytracker_tdag().record_function_name(function_id, std::string_view(function_name)); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index c57e4ed5..f1dbf831 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -19,16 +19,14 @@ using index_t = Functions::index_t; } // namespace -std::optional Functions::add_mapping(uint32_t function_id, - std::string_view function_name) { +std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { std::unique_lock mappings_lock(mappings_mutex); if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } - std::optional maybe_name_offset = - string_table.add_string(function_name); + std::optional maybe_name_offset = string_table.add_string(function_name); if (!maybe_name_offset.has_value()) { spdlog::error("Could not write function name to strings table"); return {}; @@ -36,9 +34,7 @@ std::optional Functions::add_mapping(uint32_t function_id, auto maybe_ctx = construct(Function(maybe_name_offset.value(), function_id)); if (!maybe_ctx.has_value()) { - spdlog::error("Could not write Function {0} with id {1:d}, string table " - "ofs {2:d} to the tdag functions section", - function_name, function_id, maybe_name_offset.value()); + spdlog::error("Could not write Function {0} with id {1:d}, string table ofs {2:d} to the tdag functions section", function_name, function_id, maybe_name_offset.value()); return {}; } diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index fa905187..ff540f9e 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -178,8 +178,7 @@ void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { output_file_.section().tainted_control_flow(lbl, 
function_id); } -void PolyTracker::record_function_name(uint32_t function_id, - std::string_view function_name) { +void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { output_file_.section().add_mapping(function_id, function_name); } diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 56d6a4bb..24aeedd3 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -1,47 +1,47 @@ -from ctypes import ( - Structure, - c_char, - c_int32, - c_int64, - c_uint8, - c_uint16, - c_uint32, - c_uint64, - sizeof, -) -from enum import Enum -from mmap import PROT_READ, mmap -from pathlib import Path from typing import ( BinaryIO, - Dict, + Union, Iterable, Iterator, - List, Optional, - Set, + Dict, Tuple, + List, + Set, Type, - Union, cast, ) from cxxfilt import demangle +from enum import Enum +from pathlib import Path +from mmap import mmap, PROT_READ +from ctypes import ( + Structure, + c_char, + c_int64, + c_uint64, + c_int32, + c_uint32, + c_uint8, + c_uint16, + sizeof, +) from typing_extensions import deprecated -from .inputs import Input from .plugins import Command -from .polytracker import ProgramTrace from .repl import PolyTrackerREPL +from .polytracker import ProgramTrace +from .inputs import Input from .taint_forest import TaintForest, TaintForestNode from .tracing import ( BasicBlock, ByteOffset, Function, TaintAccess, + TraceEvent, TaintOutput, Taints, - TraceEvent, ) @@ -94,11 +94,9 @@ def enumerate(self): for offset in range(0, len(self.mem), sizeof(TDFDHeader)): yield TDFDHeader.from_buffer_copy(self.mem[offset:]) - @deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") class TDEvent(Structure): """This is an old version of the ControlFlowEvent kept for backward compatibility only""" - _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] class Kind(Enum): @@ -108,11 +106,9 @@ class Kind(Enum): def __repr__(self) -> str: return f"kind: {self.Kind(self.kind).name} fnidx: 
{self.fnidx}" - @deprecated("Use TDControlFlowLog instead, TDEvents section is no longer written") class TDEventsSection: """This is an old version of the CFLog kept for backward compatibility only""" - def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -120,7 +116,6 @@ def __iter__(self): for offset in range(0, len(self.section), sizeof(TDEvent)): yield TDEvent.from_buffer_copy(self.section, offset) - class TDStringSection: """TDAG String Table section. @@ -360,7 +355,6 @@ class TDFunctionsSection: """This section holds the mapping between the function IDs stored in callstack form in the cflog section, and the function names stored in the string table. See fnmapping in the C++ part of the codebase for the "write" side part of Polytracker that pertains to this section. Each entry is an uint32_t as set in fnmapping.cpp, but a TDFnHeader will then contain *two* of these: the function_id and the name_offset. Structure in memory: |offset|function id|...""" - def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -388,8 +382,10 @@ def invalid_fd(self): class TDFnHeader(Structure): # This corresponds to the Function inline constructor in fnmapping.h. # Anything using Structure needs to be in sync with the corresponding C++. - _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] - + _fields_ = [ + ("name_offset", c_uint32), + ("function_id", c_uint32) + ] class TDNode: def __init__(self, affects_control_flow: bool = False): @@ -449,6 +445,7 @@ def __repr__(self) -> str: return f"TDSink fdidx: {self.fdidx} offset: {self.offset} label: {self.label}" + TDSection = Union[ TDLabelSection, TDSourceSection, @@ -539,24 +536,21 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: return lookup def _maybe_demangle(self, function_id: int) -> Union[str, int]: - """Depending on the age of the tdag, it may not contain a function mapping. 
If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary.""" + """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary. """ maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) if maybe_symbol is not None: return demangle(maybe_symbol) else: return function_id - def cflog(self, demangle_symbols: bool = False) -> Iterator[ControlFlowEvent]: + def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: """Presents the control flow log. Does not demangle symbols by default, for performance.""" cflog_section = self.sections_by_type[TDControlFlowLogSection] assert isinstance(cflog_section, TDControlFlowLogSection) if demangle_symbols: for cflog_entry in cflog_section: - cflog_entry.callstack[:] = [ - self._maybe_demangle(function_id) - for function_id in cflog_entry.callstack - ] + cflog_entry.callstack[:] = [self._maybe_demangle(function_id) for function_id in cflog_entry.callstack] yield cflog_entry else: @@ -869,13 +863,13 @@ def __init_arguments__(self, parser): "--print-taint-sinks", "-s", action="store_true", - help="print taint sinks (very slow for large traces)", + help="print taint sinks", ) parser.add_argument( "--print-taint-nodes", "-n", action="store_true", - help="print taint nodes (very slow for large traces)", + help="print taint nodes", ) parser.add_argument( @@ -889,7 +883,7 @@ def __init_arguments__(self, parser): "--print-control-flow-log", "-c", action="store_true", - help="print control flow log events (very slow for large traces)", + help="print control flow log events", ) def run(self, args): @@ -911,11 +905,8 @@ def run(self, args): print(f"Label {lbl}: 
{tdfile.decode_node(lbl)}") if args.print_function_trace: - if ( - TDFunctionsSection in tdfile.sections_by_type.keys() - and len(tdfile.mangled_fn_symbol_lookup) > 0 - ): - for k, v in tdfile.mangled_fn_symbol_lookup: + if TDFunctionsSection in tdfile.sections_by_type.keys() and len(tdfile.mangled_fn_symbol_lookup) > 0: + for k,v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: print("Error: no Functions section could be read from the tdag!") @@ -926,7 +917,5 @@ def run(self, args): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print( - "Error: no Control Flow Log section could be read from the tdag!" - ) + print("Error: no Control Flow Log section could be read from the tdag!") print(f"Sections that could be read: {tdfile.sections}") diff --git a/tests/conftest.py b/tests/conftest.py index 627da284..ee0e2eaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,11 @@ -import subprocess import sys -from pathlib import Path -from typing import List - import pytest - +import subprocess import polytracker +from pathlib import Path +from typing import List + def pytest_configure(config): config.addinivalue_line( diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index d67b6d22..ae6d80a2 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -1,21 +1,20 @@ -import subprocess -from pathlib import Path -from typing import List - import cxxfilt import pytest +import subprocess import polytracker -from polytracker import ProgramTrace +from pathlib import Path + from polytracker.taint_dag import ( + ControlFlowEvent, CFEnterFunctionEvent, CFLeaveFunctionEvent, - ControlFlowEvent, TaintedControlFlowEvent, TDControlFlowLogSection, TDNode, ) - +from polytracker import ProgramTrace +from typing import List @pytest.mark.program_trace("test_fntrace.cpp") def test_function_mapping(program_trace: ProgramTrace): @@ -26,12 +25,9 @@ def test_function_mapping(program_trace: 
ProgramTrace): for symbol in mangled_symbols: assert cxxfilt.demangle(symbol) in expected_names - @pytest.mark.program_trace("test_fntrace.cpp") def test_callstack_mapping(program_trace: ProgramTrace): - cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ - TDControlFlowLogSection - ] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] for cflog_entry in cflog: assert len(cflog_entry.callstack) > 0 @@ -40,20 +36,17 @@ def test_callstack_mapping(program_trace: ProgramTrace): # when we look up the function id it should map to a name we traced assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup - @pytest.mark.program_trace("test_fntrace.cpp") def test_label_mapping(program_trace: ProgramTrace): - cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ - TDControlFlowLogSection - ] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] for cflog_entry in cflog: if type(cflog_entry) == TaintedControlFlowEvent: - assert hasattr(cflog_entry, "label") + assert hasattr(cflog_entry, 'label') node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) assert node.affects_control_flow else: - assert not hasattr(cflog_entry, "label") + assert not hasattr(cflog_entry, 'label') @pytest.mark.program_trace("test_cf_log.cpp") @@ -88,7 +81,9 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): CFEnterFunctionEvent(["main", "f1(unsigned char)"]), TaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), CFEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TaintedControlFlowEvent(["main", "f1(unsigned char)", "f2(unsigned char)"], 7), + TaintedControlFlowEvent( + ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 + ), CFLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), CFLeaveFunctionEvent(["main", "f1(unsigned char)"]), CFLeaveFunctionEvent(["main"]), # This is artifical as 
there is a call to exit @@ -105,6 +100,4 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): for entry in cflog: for callstack_entry in entry.callstack: - assert callstack_entry in list( - program_trace.tdfile.mangled_fn_symbol_lookup.values() - ) + assert callstack_entry in list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) \ No newline at end of file diff --git a/tests/test_program_trace.py b/tests/test_program_trace.py index 2ec6906e..b6d765da 100644 --- a/tests/test_program_trace.py +++ b/tests/test_program_trace.py @@ -1,10 +1,14 @@ from collections import defaultdict +import pytest from subprocess import CalledProcessError from typing import Dict, Union -import pytest - -from polytracker import BasicBlockEntry, FunctionEntry, FunctionReturn, ProgramTrace +from polytracker import ( + BasicBlockEntry, + FunctionEntry, + FunctionReturn, + ProgramTrace, +) @pytest.mark.skip(reason="taint_dag does not support traces yet") @@ -178,7 +182,6 @@ def test_cxx_global_object(program_trace: ProgramTrace): assert taints[0].offset == 1 assert taints[0].length == 1 - @pytest.mark.skip(reason="the Taint Forest is currently not implemented") @pytest.mark.program_trace("test_simple_union.cpp", input="ABCDEFGH\n11235878\n") def test_taint_forest(program_trace: ProgramTrace): diff --git a/tests/test_stdin.cpp b/tests/test_stdin.cpp index 632f59b0..bfd1ff91 100644 --- a/tests/test_stdin.cpp +++ b/tests/test_stdin.cpp @@ -1,7 +1,7 @@ #include +#include #include #include -#include int stdin_read() { char inbyte; @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { if (argc != 2) { exit(EXIT_FAILURE); } - + if (std::strncmp(argv[1], "read", 4) == 0) { printf("got read\n"); stdin_read(); diff --git a/tests/test_stdin.py b/tests/test_stdin.py index f117055c..c8f37875 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -1,23 +1,20 @@ -import subprocess -from pathlib import Path -from random import choice -from string import printable - import pytest 
+import subprocess import polytracker from polytracker import taint_dag +from pathlib import Path +from random import choice +from string import printable + # Ensure stdin reads in multiple ways are verified # examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw -_stdin_data = "\n".join(choice(printable) for _ in range(40)).encode("utf-8") - +_stdin_data = '\n'.join(choice(printable) for _ in range(40)).encode("utf-8") -def _create_tdag_trace( - instrumented_binary: Path, trace_file: Path, method: str -) -> None: - """Rather than using pytest.mark.parametrize on this setup function, split +def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) -> None: + """Rather than using pytest.mark.parametrize on this setup function, split out DRY from the test framework so it's easy to see when an individual test fails.""" # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode @@ -29,16 +26,13 @@ def _create_tdag_trace( close_fds=False, ).check_returncode() - def _test_trace(trace_file: Path) -> None: - """Test the tdag output, checking its inputs to make sure we tainted and + """Test the tdag output, checking its inputs to make sure we tainted and tracked every byte of stdin. 
Offsets must be ordered as they were read.""" - - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load( - trace_file - ) + + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) assert "/dev/stdin" in [input.path for input in program_trace.inputs] - + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) @@ -46,46 +40,39 @@ def _test_trace(trace_file: Path) -> None: assert src_node.offset == expected_offset assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") expected_offset += 1 - + assert expected_offset == len(_stdin_data) - @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_read(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "read") _test_trace(trace_file) - @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fread(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fread") _test_trace(trace_file) - - + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getc") _test_trace(trace_file) - @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getc_unlocked") _test_trace(trace_file) - @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar") _test_trace(trace_file) - @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar_unlocked") _test_trace(trace_file) - @pytest.mark.program_trace("test_stdin.cpp") def 
test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fgetc") diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 8554355f..4793e5f1 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -1,11 +1,11 @@ #include -#include "taintdag/labels.h" #include "taintdag/outputfile.h" #include "taintdag/section.h" #include "taintdag/storage.h" #include "taintdag/string_table.h" #include "taintdag/taint_source.h" +#include "taintdag/labels.h" #include "utils.h" @@ -244,15 +244,12 @@ struct DummyOutputFile { StringTable &string_table; }; -TEST_CASE( - "The Sources and StringTable sections can be used to store source entries", - "[Sources, StringTable]") { +TEST_CASE("The Sources and StringTable sections can be used to store source entries", "[Sources, StringTable]") { OutputFile of{std::tmpnam(nullptr)}; auto &sources_section{of.section()}; auto &string_table{of.section()}; - SECTION("Can add taint-source entries to the Sources section", - "[Sources, StringTable]") { + SECTION("Can add taint-source entries to the Sources section", "[Sources, StringTable]") { int fd = 3; REQUIRE(!sources_section.mapping_idx(fd)); @@ -270,8 +267,7 @@ TEST_CASE( REQUIRE(m1.size == 122); int fd2 = 99; - auto s2 = - sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); + auto s2 = sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); REQUIRE(s2.has_value()); auto idx2 = sources_section.mapping_idx(fd2); @@ -285,8 +281,7 @@ TEST_CASE( } WHEN("Adding taint-sources to the Sources section and the string table") { - THEN("Latest wins in terms in case output_file has multiple mappings for " - "the same fd") { + THEN("Latest wins in terms in case output_file has multiple mappings for the same fd") { int fd = 1; sources_section.add_source("first", fd); sources_section.add_source("second", fd); @@ -336,8 +331,7 @@ TEST_CASE("StringTable add/iterate", 
"[StringTable]") { string_table.add_string("d"); std::vector res; - std::copy(string_table.begin(), string_table.end(), - std::back_inserter(res)); + std::copy(string_table.begin(), string_table.end(), std::back_inserter(res)); REQUIRE(res.size() == 4); REQUIRE(res[0] == "a"); REQUIRE(res[1] == "b"); @@ -347,20 +341,18 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } WHEN("Adding to the string table") { - THEN("A string bigger than the maximum string size will be truncated and " - "stored") { + THEN("A string bigger than the maximum string size will be truncated and stored") { // display the info logging spdlog::set_level(spdlog::level::debug); auto len = StringTable::max_entry_size + 10; std::string too_big(len, 'A'); - REQUIRE_NOTHROW([&]() { + REQUIRE_NOTHROW([&](){ auto offset = string_table.add_string(too_big); REQUIRE(offset.has_value()); std::string_view result = string_table.from_offset(offset.value()); - REQUIRE(result.size() + sizeof(StringTable::length_t) == - StringTable::max_entry_size - 1); + REQUIRE(result.size() + sizeof(StringTable::length_t) == StringTable::max_entry_size - 1); }()); } @@ -375,11 +367,10 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } } - THEN("Add a maximumly big string and will still be able to add other " - "strings") { + THEN("Add a maximumly big string and will still be able to add other strings") { auto size = StringTable::max_entry_size - sizeof(StringTable::length_t); std::string s(size, 'A'); - REQUIRE_NOTHROW([&]() { + REQUIRE_NOTHROW([&](){ auto offset = string_table.add_string(s); REQUIRE(offset.has_value()); auto result = string_table.from_offset(offset.value()); @@ -399,18 +390,15 @@ TEST_CASE("StringTable add/iterate", "[StringTable]") { } } -TEST_CASE( - "An allocation that is larger than can be represented in the string table " - "will result in truncation and does not prevent adding more strings", - "[StringTable]") { - auto alloc_size = - static_cast(std::numeric_limits::max()) + - 1; - 
alignas(StringTable::offset_t) uint8_t backing[64]; - int dummy = 1; - StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - auto span = StringTable::span_t{&backing[0], alloc_size}; - std::string_view tinystring{"eep"}; - REQUIRE_NOTHROW(st.add_string(tinystring)); -} + TEST_CASE("An allocation that is larger than can be represented in the string table will result in truncation and does not prevent adding more strings", "[StringTable]") { + auto alloc_size = + static_cast(std::numeric_limits::max()) + + 1; + alignas(StringTable::offset_t) uint8_t backing[64]; + int dummy = 1; + StringTable st{SectionArg{.output_file = dummy, .range = backing}}; + auto span = StringTable::span_t{&backing[0], alloc_size}; + std::string_view tinystring{"eep"}; + REQUIRE_NOTHROW(st.add_string(tinystring)); + } } // namespace taintdag \ No newline at end of file From 0b21c5137b0315872ac9c0ec915eca1cad469e8a Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Feb 2025 18:52:20 +0000 Subject: [PATCH 056/112] whilst making more sense of the tdag tests fix the string table comments --- polytracker/include/taintdag/string_table.h | 23 ++++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index 4368872d..5c25318b 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -30,7 +30,7 @@ struct StringTable : public SectionBase { // that can be expressed. static constexpr size_t max_entry_size = std::min(static_cast(std::numeric_limits::max()), - max_offset - sizeof(length_t)); + max_offset - sizeof(length_t)); static constexpr uint8_t tag{3}; static constexpr size_t allocation_size{0x100000}; @@ -39,25 +39,32 @@ struct StringTable : public SectionBase { template StringTable(SectionArg output_file) : SectionBase{output_file.range} {} - // Appends the string `sv` to the string table. 
- // Returns the offset of the string entry. Note that this is not the - // string, but the offset to the size of it. Recover the string - // by using `from_offset`. + // Adds the string `sv` to the string table. + // Returns the offset in bytes from the beginning of the section of the string + // entry. Note that this is not the string, but the offset to the size of it. + // Recover the string by using `from_offset`. If a string is bigger than the + // maximum size allowed for an entry it will be truncated. If the string table + // allocation is full, the string will not be stored and no offset will be + // returned. std::optional add_string(std::string_view sv) { if ((sv.size() + sizeof(length_t)) > max_entry_size) { - spdlog::info("Tried to store a string of size {0:d} but max is {1:d} (will truncate string)", sv.size(), max_entry_size); + spdlog::info("Tried to store a string of size {0:d} but max is {1:d} " + "(will truncate string)", + sv.size(), max_entry_size); size_t to_truncate = max_entry_size - sizeof(length_t) - 1; sv = sv.substr(0, to_truncate); if ((sv.size() + sizeof(length_t)) > max_entry_size) { - error_exit("Truncated string was too big: ", sv.size() + sizeof(length_t)); + error_exit("Truncated string was too big: ", + sv.size() + sizeof(length_t)); } } auto len = allocated_len(sv.size()); if (auto write_context = write(len)) { - // todo(kaoudis) this is possibly a type confusion issue resulting in truncation since size_t is bigger than the current length_t + // todo(kaoudis) this is possibly a type confusion issue resulting in + // truncation since size_t is bigger than the current length_t *reinterpret_cast(&*(write_context->mem.begin())) = sv.size(); // copy string From 55d074dd3ce8a51fd77db7658fad1e7ceb4f501e Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Wed, 5 Mar 2025 20:48:14 +0000 Subject: [PATCH 057/112] fix Dockerfile casing warning --- Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/Dockerfile b/Dockerfile index f7f7f7eb..3b8c8a80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build base image -FROM ubuntu:jammy as base +FROM ubuntu:jammy AS base LABEL org.opencontainers.image.authors="evan.sultanik@trailofbits.com" @@ -41,7 +41,7 @@ RUN GO111MODULE=off go get github.com/SRI-CSL/gllvm/cmd/... ENV PATH=$PATH:/root/go/bin # Clone llvm to build `libc++` from source -FROM base as llvm-sources +FROM base AS llvm-sources RUN git clone --depth 1 --branch llvmorg-13.0.0 https://github.com/llvm/llvm-project.git /llvm-project @@ -54,7 +54,7 @@ RUN git clone --depth 1 --branch llvmorg-13.0.0 https://github.com/llvm/llvm-pro # Build "clean" `libc++` with `gclang`. Used to link the uninstrumented # target of the user project. Installed into `/cxx_lib/clean_build`. -FROM llvm-sources as clean-libcxx +FROM llvm-sources AS clean-libcxx ENV WLLVM_BC_STORE=/cxx_clean_bitcode RUN mkdir -p $WLLVM_BC_STORE @@ -78,7 +78,7 @@ RUN cmake --build $LIBCXX_BUILD_DIR --target install-cxx install-cxxabi -j$((`np # Build "poly" `libc++` with `gclang`. Used to link the instrumented # target of the user project. Installed into `/cxx_lib/poly_build`. 
-FROM clean-libcxx as poly-libcxx +FROM clean-libcxx AS poly-libcxx ENV WLLVM_BC_STORE=/cxx_poly_bitcode RUN mkdir -p $WLLVM_BC_STORE @@ -104,7 +104,7 @@ RUN cmake -GNinja \ RUN cmake --build $LIBCXX_BUILD_DIR --target install-cxx install-cxxabi -j$((`nproc`+1)) # Build and install the polytracker -FROM poly-libcxx as polytracker +FROM poly-libcxx AS polytracker ARG DFSAN_FILENAME_ARCH=x86_64 From c306dc2928954c30555df3670bb93c8ed14a7b92 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 6 Mar 2025 16:21:15 +0000 Subject: [PATCH 058/112] cleans up unit tests and fleshes out the integration test with all current sections used in this diff --- CMakeLists.txt | 4 +- .../include/taintdag/control_flow_log.h | 24 +- .../taintdag/control_flow_log_encoding.h | 21 + polytracker/src/CMakeLists.txt | 3 +- polytracker/src/polytracker/polytracker.cpp | 6 +- .../taintdag/control_flow_log_encoding.cpp | 26 + unittests/src/taintdag/CMakeLists.txt | 8 +- ..._log.cpp => control_flow_log_encoding.cpp} | 7 +- unittests/src/taintdag/section.cpp | 205 ++++++++ unittests/src/taintdag/storage.cpp | 40 ++ unittests/src/taintdag/string_table.cpp | 164 ++++++ ...{encoding.cpp => taint_label_encoding.cpp} | 0 unittests/src/taintdag/tdag.cpp | 466 ++++-------------- 13 files changed, 564 insertions(+), 410 deletions(-) create mode 100644 polytracker/include/taintdag/control_flow_log_encoding.h create mode 100644 polytracker/src/taintdag/control_flow_log_encoding.cpp rename unittests/src/taintdag/{control_flow_log.cpp => control_flow_log_encoding.cpp} (90%) create mode 100644 unittests/src/taintdag/section.cpp create mode 100644 unittests/src/taintdag/storage.cpp create mode 100644 unittests/src/taintdag/string_table.cpp rename unittests/src/taintdag/{encoding.cpp => taint_label_encoding.cpp} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8178ad5f..b5f920ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,8 +22,6 @@ endif() set(CMAKE_CXX_STANDARD 20) 
-add_subdirectory(third_party/Catch2) -list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/Catch2/contrib") add_subdirectory(third_party/indicators) set(SPDLOG_NO_EXCEPTIONS TRUE) add_subdirectory(third_party/spdlog) @@ -31,4 +29,6 @@ add_subdirectory(third_party/spdlog) add_subdirectory(polytracker) enable_testing() +add_subdirectory(third_party/Catch2) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/Catch2/contrib") add_subdirectory(unittests/src/taintdag) diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h index 1bcff380..95884353 100644 --- a/polytracker/include/taintdag/control_flow_log.h +++ b/polytracker/include/taintdag/control_flow_log.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2022-present, Trail of Bits, Inc. * All rights reserved. @@ -9,6 +8,7 @@ #pragma once +#include "taintdag/control_flow_log_encoding.h" #include "taintdag/outputfile.h" #include "taintdag/section.h" #include "taintdag/taint.h" @@ -16,22 +16,6 @@ namespace taintdag { -namespace detail { -// A uint32_t varint encoded by setting highest bit for all but the final byte. -// Requires up to 5 bytes of storage as each output byte uses 7 input bits. -// Total maximum need is floor(32/7) = 5. Returns number of bytes required. 
-size_t varint_encode(uint32_t val, uint8_t *buffer) { - auto orig_buffer = buffer; - while (val >= 0x80) { - *buffer++ = 0x80 | (val & 0x7f); - val >>= 7; - } - *buffer++ = val & 0x7f; - return buffer - orig_buffer; -} -// TODO (hbrodin): Should probably used std::span -} // namespace detail - struct ControlFlowLog : public SectionBase { enum EventType { EnterFunction = 0, @@ -49,7 +33,7 @@ struct ControlFlowLog : public SectionBase { void function_event(EventType evt, uint32_t function_id) { uint8_t buffer[6]; buffer[0] = static_cast(evt); - auto used = detail::varint_encode(function_id, &buffer[1]); + auto used = varint_encode(function_id, &buffer[1]); auto total = used + 1; if (auto wctx = write(total)) { @@ -71,9 +55,9 @@ struct ControlFlowLog : public SectionBase { // 1 byte event, <= 5 bytes function id, <= 5 bytes label uint8_t buffer[11]; buffer[0] = static_cast(TaintedControlFlow); - auto used = detail::varint_encode(function_id, &buffer[1]); + auto used = varint_encode(function_id, &buffer[1]); auto total = used + 1; - used = detail::varint_encode(label, &buffer[total]); + used = varint_encode(label, &buffer[total]); total += used; if (auto wctx = write(total)) { diff --git a/polytracker/include/taintdag/control_flow_log_encoding.h b/polytracker/include/taintdag/control_flow_log_encoding.h new file mode 100644 index 00000000..4339c1c4 --- /dev/null +++ b/polytracker/include/taintdag/control_flow_log_encoding.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +// Separate from control_flow_log.h to avoid duplicate symbol inclusion in +// testing +namespace taintdag { +// For inclusion in the control flow log, we use varint_encode to bit-pack +// each entry. 
Returns number of bytes required, which is also included in +// the section so that we know entry boundaries. +size_t varint_encode(uint32_t val, uint8_t *buffer); +} // namespace taintdag \ No newline at end of file diff --git a/polytracker/src/CMakeLists.txt b/polytracker/src/CMakeLists.txt index 29d01c50..38a1193c 100644 --- a/polytracker/src/CMakeLists.txt +++ b/polytracker/src/CMakeLists.txt @@ -42,7 +42,8 @@ set(TAINTDAG_SOURCES ${TAINTDAG_DIR}/fnmapping.cpp ${TAINTDAG_DIR}/polytracker.cpp ${TAINTDAG_DIR}/print.cpp - ${TAINTDAG_DIR}/util.cpp) + ${TAINTDAG_DIR}/util.cpp + ${TAINTDAG_DIR}/control_flow_log_encoding.cpp) add_library(Polytracker STATIC ${POLYTRACKER_SOURCES} ${TAINT_SOURCES} ${TAINTDAG_SOURCES}) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 1dc3149d..a00a724b 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -55,9 +55,11 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id, const char* function_name) { +extern "C" void __polytracker_enter_function(uint32_t function_id, + const char *function_name) { + get_polytracker_tdag().record_function_name(function_id, + std::string_view(function_name)); get_polytracker_tdag().enter_function(function_id); - get_polytracker_tdag().record_function_name(function_id, std::string_view(function_name)); } extern "C" void __polytracker_leave_function(uint32_t function_id) { diff --git a/polytracker/src/taintdag/control_flow_log_encoding.cpp b/polytracker/src/taintdag/control_flow_log_encoding.cpp new file mode 100644 index 00000000..cb6f383d --- /dev/null +++ b/polytracker/src/taintdag/control_flow_log_encoding.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. 
+ * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include "taintdag/control_flow_log_encoding.h" + +// Separate from control_flow_log.h to avoid duplicate symbol inclusion in +// testing +namespace taintdag { +// A uint32_t varint encoded by setting highest bit for all but the final byte. +// Requires up to 5 bytes of storage as each output byte uses 7 input bits. +// Total maximum need is ceil(32/7) = 5. +size_t varint_encode(uint32_t val, uint8_t *buffer) { + auto orig_buffer = buffer; + while (val >= 0x80) { + *buffer++ = 0x80 | (val & 0x7f); + val >>= 7; + } + *buffer++ = val & 0x7f; + return buffer - orig_buffer; +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/CMakeLists.txt b/unittests/src/taintdag/CMakeLists.txt index ec91e1c8..fd2daf02 100644 --- a/unittests/src/taintdag/CMakeLists.txt +++ b/unittests/src/taintdag/CMakeLists.txt @@ -6,12 +6,16 @@ add_executable( tdag.cpp taintdag.cpp bitmap_section.cpp - encoding.cpp + control_flow_log_encoding.cpp + taint_label_encoding.cpp fnmapping.cpp union.cpp labeldeq.cpp + section.cpp stream_offset.cpp - control_flow_log.cpp) + string_table.cpp + storage.cpp + ) target_include_directories(${TAINTDAG_UNITTEST} PRIVATE ${CMAKE_SOURCE_DIR}/polytracker/include) diff --git a/unittests/src/taintdag/control_flow_log.cpp b/unittests/src/taintdag/control_flow_log_encoding.cpp similarity index 90% rename from unittests/src/taintdag/control_flow_log.cpp rename to unittests/src/taintdag/control_flow_log_encoding.cpp index fcafe61b..97ce2fb2 100644 --- a/unittests/src/taintdag/control_flow_log.cpp +++ b/unittests/src/taintdag/control_flow_log_encoding.cpp @@ -7,12 +7,13 @@ * the LICENSE file found in the root directory of this source tree.
*/ -#include "taintdag/control_flow_log.h" +#include "taintdag/control_flow_log_encoding.h" #include "taintdag/section.h" + #include -TEST_CASE("Simple varint encoding") { - using namespace taintdag::detail; +TEST_CASE("Simple control flow log varint encoding") { + using namespace taintdag; uint8_t buffer[5]; SECTION("Encode 0") { diff --git a/unittests/src/taintdag/section.cpp b/unittests/src/taintdag/section.cpp new file mode 100644 index 00000000..d012ea99 --- /dev/null +++ b/unittests/src/taintdag/section.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + + #include + #include + + #include "taintdag/section.h" + + #include "utils.h" + + namespace taintdag { + TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + // Exposing the members of SectionBase + struct TestSectionBase : public SectionBase { + TestSectionBase(span_t t) : SectionBase{t} {} + + auto write(size_t s) { return SectionBase::write(s); } + + auto offset(SectionBase::span_t::iterator o) { + return SectionBase::offset(o); + } + + auto offset(uint8_t const *p) { return SectionBase::offset(p); } + }; + + std::uint8_t backing[64]; + TestSectionBase sb{backing}; + SectionBase::span_t last; + + REQUIRE(sb.size() == 0); + + // Allocate 1 byte + { + auto ctx = sb.write(1); + REQUIRE(ctx); + last = ctx->mem; + } + REQUIRE(sb.size() == 1); + REQUIRE(sb.offset(last.begin()) == 0); + REQUIRE(sb.offset(&*last.begin()) == 0); + REQUIRE(last.size() == 1); + + // Allocate remainder but 1 byte + auto n = sizeof(backing) - 2; + { + auto ctx = sb.write(n); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1); + 
REQUIRE(sb.offset(last.begin()) == 1); + REQUIRE(sb.offset(&*last.begin()) == 1); + REQUIRE(last.size() == n); + + // Allocate last byte + { + auto ctx = sb.write(1); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1 + 1); + REQUIRE(sb.offset(last.begin()) == n + 1); + REQUIRE(sb.offset(&*last.begin()) == n + 1); + REQUIRE(last.size() == 1); + + // Attempt additional allocation, should fail. + auto ctx = sb.write(1); + REQUIRE(!ctx); + + // If offset is requested for out of bounds memory, just abort. Something + // is seriously wrong. + REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), + test::ErrorExit); + REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); + + REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), + test::ErrorExit); + REQUIRE_THROWS_AS( + sb.offset(reinterpret_cast(&backing + sizeof(backing))), + test::ErrorExit); + } + + TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + struct Dummy { + int32_t i; + char c; + + Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} + }; + + // Assumptions for the test case. 
+ REQUIRE(alignof(Dummy) == 4); + REQUIRE(sizeof(Dummy) == 8); + + using Section = FixedSizeAlloc; + + const size_t backing_count = 3; + const size_t backing_bytes = backing_count * sizeof(Dummy); + + // To ensure we get correct alignment of the backing + alignas(Dummy) std::uint8_t backing[backing_bytes]; + Section s{backing}; + + REQUIRE(s.entry_size() == sizeof(Dummy)); + REQUIRE(s.align_of == alignof(Dummy)); + REQUIRE(s.size() == 0); + REQUIRE(s.count() == 0); + REQUIRE(s.begin() == s.end()); + + SECTION("Adding instances affect size, count and constructed instance is " + "available") { + // Can add first entry + { + auto ctx = s.construct(999, 'A'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 999); + REQUIRE(ctx->t.c == 'A'); + REQUIRE(s.index(ctx->t) == 0); + } + REQUIRE(s.count() == 1); + REQUIRE(s.size() == sizeof(Dummy)); + + // Can add when there is already an entry but not full. + { + auto ctx = s.construct(33, 'B'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 33); + REQUIRE(ctx->t.c == 'B'); + REQUIRE(s.index(ctx->t) == 1); + } + REQUIRE(s.count() == 2); + REQUIRE(s.size() == 2 * sizeof(Dummy)); + + // Can fill the backing store with entries + { + auto ctx = s.construct(-1, 'C'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == -1); + REQUIRE(ctx->t.c == 'C'); + REQUIRE(s.index(ctx->t) == 2); + } + REQUIRE(s.count() == 3); + REQUIRE(s.size() == 3 * sizeof(Dummy)); + + // Can't insert beyond capacity + auto ctx = s.construct(-5, 'D'); + REQUIRE(!ctx); + } + + SECTION("Require aligned construction") { + SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; + REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Require size to be a multiple of align_of") { + SectionBase::span_t b1{&backing[0], sizeof(backing) - 1}; +
REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Iteration") { + s.construct(-1, 'a'); + REQUIRE(std::distance(s.begin(), s.end()) == 1); + s.construct(-2, 'b'); + REQUIRE(std::distance(s.begin(), s.end()) == 2); + s.construct(-3, 'c'); + REQUIRE(std::distance(s.begin(), s.end()) == 3); + + // Know that begin is valid due to above + auto &first = *s.begin(); + REQUIRE(first.i == -1); + REQUIRE(first.c == 'a'); + } + } + } // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/storage.cpp b/unittests/src/taintdag/storage.cpp new file mode 100644 index 00000000..aec583c2 --- /dev/null +++ b/unittests/src/taintdag/storage.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include +#include + +#include "taintdag/storage.h" + +#include "utils.h" + +namespace taintdag { + TEST_CASE("Type properties of FixedSizeFile", "[FixedSizeFile]") { + // Don't want multiple copies referring to the same file + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The FixedSizeFile is currently not move + // constructible/assignable. There is nothing preventing such an + // implementation. Currently there is no need so leave this as is. 
+ REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); + } + + TEST_CASE("Type properties of MMapFile", "[MMapFile]") { + // Don't want multiple copies referring to the same regions + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. + // Behavior is currently inherited from FixedSizeFile. Should that change, + // the MMapFile would change as well. + REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); + } +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/string_table.cpp b/unittests/src/taintdag/string_table.cpp new file mode 100644 index 00000000..32b2c48c --- /dev/null +++ b/unittests/src/taintdag/string_table.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. 
+ */ + +#include + +#include "taintdag/outputfile.h" +#include "taintdag/string_table.h" +#include "taintdag/taint_source.h" + +#include "utils.h" + +namespace taintdag { +TEST_CASE( + "The Sources and StringTable sections can store source entries", + "[Sources, StringTable]") { + OutputFile of{std::tmpnam(nullptr)}; + auto &sources_section{of.section()}; + auto &string_table{of.section()}; + + SECTION("Can add taint-source entries to the Sources section", + "[Sources, StringTable]") { + int fd = 3; + REQUIRE(!sources_section.mapping_idx(fd)); + + auto s1 = sources_section.add_source("test", fd, 122); + REQUIRE(s1.has_value()); + + auto m = sources_section.mapping_idx(fd); + REQUIRE(m.has_value()); + REQUIRE(*s1 == *m); + + auto m1 = sources_section.get(*m); + REQUIRE(m1.fd == fd); + + REQUIRE(m1.name(string_table) == "test"); + REQUIRE(m1.size == 122); + + int fd2 = 99; + auto s2 = + sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); + REQUIRE(s2.has_value()); + + auto idx2 = sources_section.mapping_idx(fd2); + REQUIRE(idx2.has_value()); + + auto m2 = sources_section.get(*idx2); + REQUIRE(m2.fd == fd2); + REQUIRE(m2.name(string_table) == "test2"); + + REQUIRE(m2.size == SourceEntry::InvalidSize); + } + + WHEN("Adding taint-sources to the Sources section and the string table") { + THEN("Latest wins in terms in case output_file has multiple mappings for " + "the same fd") { + int fd = 1; + sources_section.add_source("first", fd); + sources_section.add_source("second", fd); + + auto mm = sources_section.mapping_idx(fd); + REQUIRE(mm); + + auto m = sources_section.get(*mm); + REQUIRE(m.fd == fd); + REQUIRE(m.name(string_table) == "second"); + } + } +} + +TEST_CASE("StringTable add/iterate", "[StringTable]") { + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + OutputFile of{std::tmpnam(nullptr)}; + auto &string_table{of.section()}; + + SECTION("StringTable properties") { + // squish everything together as close as we can + 
REQUIRE(StringTable::align_of == 2UL); + // no elements in the string table to start + REQUIRE(string_table.size() == 0); + REQUIRE(string_table.begin() == string_table.end()); + } + + WHEN("A string is added") { + THEN("It should also be retrievable from the offset of its length") { + auto ofs = string_table.add_string("Hello"); + REQUIRE(ofs); + REQUIRE(string_table.from_offset(*ofs) == "Hello"); + + auto ofs2 = string_table.add_string("World"); + REQUIRE(ofs2); + REQUIRE(string_table.from_offset(*ofs2) == "World"); + } + } + + WHEN("Multiple strings are added") { + THEN("They should be iterable using begin() and end()") { + string_table.add_string("a"); + string_table.add_string("b"); + string_table.add_string("c"); + string_table.add_string("d"); + + std::vector res; + std::copy(string_table.begin(), string_table.end(), + std::back_inserter(res)); + REQUIRE(res.size() == 4); + REQUIRE(res[0] == "a"); + REQUIRE(res[1] == "b"); + REQUIRE(res[2] == "c"); + REQUIRE(res[3] == "d"); + } + } + + WHEN("Adding to the string table") { + THEN("A string bigger than the maximum string size will be truncated and " + "stored") { + // display the info logging + spdlog::set_level(spdlog::level::debug); + + auto len = StringTable::max_entry_size + 10; + std::string too_big(len, 'A'); + REQUIRE_NOTHROW([&]() { + auto offset = string_table.add_string(too_big); + REQUIRE(offset.has_value()); + + std::string_view result = string_table.from_offset(offset.value()); + REQUIRE(result.size() + sizeof(StringTable::length_t) == + StringTable::max_entry_size - 1); + }()); + } + + THEN("Can fill the remainder of the string table to capacity with many " + "short strings") { + std::string s{"a"}; + while (auto os = string_table.add_string(s)) { + if (!os.has_value()) { + break; + } + + auto offset = os.value(); + REQUIRE(string_table.size() > offset); + REQUIRE(offset <= string_table.max_offset); + + auto result = string_table.from_offset(offset); + REQUIRE(s.compare(result.data()) == 0); + } 
+ } + + THEN("Cannot add more strings if the table is full") { + std::string onemore{"excuse me may I have another\n"}; + REQUIRE_NOTHROW([&]() { + auto should_be_empty = string_table.add_string(onemore); + REQUIRE(!should_be_empty.has_value()); + }); + } + } +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/encoding.cpp b/unittests/src/taintdag/taint_label_encoding.cpp similarity index 100% rename from unittests/src/taintdag/encoding.cpp rename to unittests/src/taintdag/taint_label_encoding.cpp diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 4793e5f1..eb7175ef 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -1,404 +1,110 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + #include +#include -#include "taintdag/outputfile.h" -#include "taintdag/section.h" -#include "taintdag/storage.h" +#include "taintdag/bitmap_section.h" +#include "taintdag/control_flow_log.h" +#include "taintdag/fnmapping.h" +#include "taintdag/labels.h" +#include "taintdag/sink.h" +#include "taintdag/stream_offset.h" #include "taintdag/string_table.h" +#include "taintdag/taint.h" #include "taintdag/taint_source.h" -#include "taintdag/labels.h" +#include "taintdag/util.h" #include "utils.h" namespace taintdag { - -TEST_CASE("Test TDAG", "[Integration]") { - OutputFile tdg("filename.bin"); - auto offset1 = tdg.section().add_string("Hello"); - auto offset2 = tdg.section().add_string("World!"); - REQUIRE(offset1 != offset2); - - auto idx = tdg.section().add_source("sourcename", -1); - REQUIRE(idx); - REQUIRE(*idx == 0); - auto idx2 = tdg.section().add_source("next-source", 2); - REQUIRE(*idx2 == 1); -} - -TEST_CASE("Type properties FixedSizeFile", "[FixedSizeFile]") { - // Don't want multiple copies 
referring to the same file - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The FixedSizeFile is currently not move - // constructible/assignable. There is nothing preventing such an - // implementation. Currently there is no need so leave this as is. - REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); -} - -TEST_CASE("Type properties MMapFile", "[MMapFile]") { - // Don't want multiple copies referring to the same regions - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. - // Behavior is currently inherited from FixedSizeFile. Should that change, - // the MMapFile would change as well. - REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); -} - -TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - // Exposing the members of SectionBase - struct TestSectionBase : public SectionBase { - TestSectionBase(span_t t) : SectionBase{t} {} - - auto write(size_t s) { return SectionBase::write(s); } - - auto offset(SectionBase::span_t::iterator o) { - return SectionBase::offset(o); + TEST_CASE("Test basic TDAG construction", "[Integration]") { + using SourceLabelIndexSection = BitmapSectionBase<5, BitCount{max_label} + 1>; + using ConcreteOutputFile = + OutputFile; + ConcreteOutputFile tdg("test.tdag"); + + SECTION("Sources") { + auto idx = tdg.section().add_source("sourcename", -1); + REQUIRE(idx); + REQUIRE(*idx == 0); + REQUIRE(tdg.section().count() == 1); + auto idx2 = tdg.section().add_source("next-source", 2); + REQUIRE(*idx2 == 1); + REQUIRE(tdg.section().count() == 2); } - auto offset(uint8_t const *p) { return SectionBase::offset(p); } - }; + SECTION("Labels") { + // 25 is randomly chosen; ranges can be bigger + unsigned long length = rand() % 25 + 1; - 
std::uint8_t backing[64]; - TestSectionBase sb{backing}; - SectionBase::span_t last; - - REQUIRE(sb.size() == 0); - - // Allocate 1 byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - last = ctx->mem; - } - REQUIRE(sb.size() == 1); - REQUIRE(sb.offset(last.begin()) == 0); - REQUIRE(sb.offset(&*last.begin()) == 0); - REQUIRE(last.size() == 1); - - // Allocate remainder but 1 byte - auto n = sizeof(backing) - 2; - { - auto ctx = sb.write(n); - REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } - - REQUIRE(sb.size() == n + 1); - REQUIRE(sb.offset(last.begin()) == 1); - REQUIRE(sb.offset(&*last.begin()) == 1); - REQUIRE(last.size() == n); - - // Allocate last byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } + // label range represents a data structure like an array + auto test_range = + tdg.section().create_source_labels(-1, -1, length); + REQUIRE(test_range.first != test_range.second); - REQUIRE(sb.size() == n + 1 + 1); - REQUIRE(sb.offset(last.begin()) == n + 1); - REQUIRE(sb.offset(&*last.begin()) == n + 1); - REQUIRE(last.size() == 1); + // todo(kaoudis) this seems like it should be specific, on the order of the + // number of items in the range. why isn't it? + auto size_with_range = tdg.section().count(); + REQUIRE(size_with_range > 0); - // Attempt additional allocation, should fail. - auto ctx = sb.write(1); - REQUIRE(!ctx); + tdg.section().set_range(BitIndex{test_range.first}, BitCount{length}); + REQUIRE(tdg.section().size() > 0); - // If offset is requested for out of bounds memory, just abort. Something - // is seriously wrong. 
- REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), - test::ErrorExit); - REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); + // label union represents a step in the progression of taint + auto test_union = tdg.section().union_taint(test_range.first, test_range.second); + REQUIRE(test_union != test_range.first); + REQUIRE(test_union != test_range.second); - REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), - test::ErrorExit); - REQUIRE_THROWS_AS( - sb.offset(reinterpret_cast(&backing + sizeof(backing))), - test::ErrorExit); -} - -TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - struct Dummy { - int32_t i; - char c; - - Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} - }; - - // Assumptions for the test case. - REQUIRE(alignof(Dummy) == 4); - REQUIRE(sizeof(Dummy) == 8); - - using Section = FixedSizeAlloc; - - const size_t backing_count = 3; - const size_t backing_bytes = backing_count * sizeof(Dummy); - - // To ensure we get correct alignment of the backing - alignas(Dummy) std::uint8_t backing[backing_bytes]; - Section s{backing}; - - REQUIRE(s.entry_size() == sizeof(Dummy)); - REQUIRE(s.align_of == alignof(Dummy)); - REQUIRE(s.size() == 0); - REQUIRE(s.count() == 0); - REQUIRE(s.begin() == s.end()); - - SECTION("Adding instances affect size, count and constructed instance is " - "available") { - // Can add first entry - { - auto ctx = s.construct(999, 'A'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 999); - REQUIRE(ctx->t.c == 'A'); - REQUIRE(s.index(ctx->t) == 0); + // added just one new label - the union + REQUIRE(tdg.section().count() == size_with_range + 1); } - REQUIRE(s.count() == 1); - REQUIRE(s.size() == sizeof(Dummy)); - // Can add when there is already an entry but not full. 
- { - auto ctx = s.construct(33, 'B'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 33); - REQUIRE(ctx->t.c == 'B'); - REQUIRE(s.index(ctx->t) == 1); + SECTION("String Table") { + auto offset1 = tdg.section().add_string("Hello"); + auto offset2 = tdg.section().add_string("World!"); + REQUIRE(offset1 != offset2); + // for the string table, size() yields the size of all included entries, + // plus the size of the offsets to them + REQUIRE(tdg.section().size() == 16); } - REQUIRE(s.count() == 2); - REQUIRE(s.size() == 2 * sizeof(Dummy)); - // Can fill the backing store with entries - { - auto ctx = s.construct(-1, 'C'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == -1); - REQUIRE(ctx->t.c == 'C'); - REQUIRE(s.index(ctx->t) == 2); + SECTION("Sinks") { + tdg.section().log_single(-1, -1, 0); + REQUIRE(tdg.section().count() == 1); } - REQUIRE(s.count() == 3); - REQUIRE(s.size() == 3 * sizeof(Dummy)); - - // Can't insert beyound capacity - auto ctx = s.construct(-5, 'D'); - REQUIRE(!ctx); - } - - SECTION("Require aligned construction") { - SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; - REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); - - SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); - - SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); - } - - SECTION("Require size to be a multiple of align_of") { - SectionBase::span_t b1{&backing[0], sizeof(backing) - 1}; - REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); - - SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); - - SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); - } - - SECTION("Iteration") { - s.construct(-1, 'a'); - REQUIRE(std::distance(s.begin(), s.end()) == 1); - s.construct(-2, 'b'); - REQUIRE(std::distance(s.begin(), s.end()) == 2); - s.construct(-3, 'c'); - 
REQUIRE(std::distance(s.begin(), s.end()) == 3); - // Know that begin is valid due to above - auto &first = *s.begin(); - REQUIRE(first.i == -1); - REQUIRE(first.c == 'a'); - } -} - -// Dummy OutputFile, to allow retrieving the StringTable -struct DummyOutputFile { - template T §ion() { return string_table; } - - StringTable &string_table; -}; - -TEST_CASE("The Sources and StringTable sections can be used to store source entries", "[Sources, StringTable]") { - OutputFile of{std::tmpnam(nullptr)}; - auto &sources_section{of.section()}; - auto &string_table{of.section()}; - - SECTION("Can add taint-source entries to the Sources section", "[Sources, StringTable]") { - int fd = 3; - REQUIRE(!sources_section.mapping_idx(fd)); - - auto s1 = sources_section.add_source("test", fd, 122); - REQUIRE(s1.has_value()); - - auto m = sources_section.mapping_idx(fd); - REQUIRE(m.has_value()); - REQUIRE(*s1 == *m); - - auto m1 = sources_section.get(*m); - REQUIRE(m1.fd == fd); - - REQUIRE(m1.name(string_table) == "test"); - REQUIRE(m1.size == 122); - - int fd2 = 99; - auto s2 = sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); - REQUIRE(s2.has_value()); - - auto idx2 = sources_section.mapping_idx(fd2); - REQUIRE(idx2.has_value()); - - auto m2 = sources_section.get(*idx2); - REQUIRE(m2.fd == fd2); - REQUIRE(m2.name(string_table) == "test2"); - - REQUIRE(m2.size == SourceEntry::InvalidSize); - } - - WHEN("Adding taint-sources to the Sources section and the string table") { - THEN("Latest wins in terms in case output_file has multiple mappings for the same fd") { - int fd = 1; - sources_section.add_source("first", fd); - sources_section.add_source("second", fd); - - auto mm = sources_section.mapping_idx(fd); - REQUIRE(mm); - - auto m = sources_section.get(*mm); - REQUIRE(m.fd == fd); - REQUIRE(m.name(string_table) == "second"); - } - } -} - -TEST_CASE("StringTable add/iterate", "[StringTable]") { - // To be able to capture error_exits - test::ErrorExitReplace 
errthrow; - - OutputFile of{std::tmpnam(nullptr)}; - auto &string_table{of.section()}; - - SECTION("StringTable properties") { - // squish everything together as close as we can - REQUIRE(StringTable::align_of == 2UL); - // no elements in the string table to start - REQUIRE(string_table.size() == 0); - REQUIRE(string_table.begin() == string_table.end()); - } - - WHEN("A string is added") { - THEN("It should also be retrievable from the offset of its length") { - auto ofs = string_table.add_string("Hello"); - REQUIRE(ofs); - REQUIRE(string_table.from_offset(*ofs) == "Hello"); - - auto ofs2 = string_table.add_string("World"); - REQUIRE(ofs2); - REQUIRE(string_table.from_offset(*ofs2) == "World"); + SECTION("Tainted Control Flow (includes String Table and Functions)") { + int function_id = 1; + + // just before enter_function, cf __polytracker_enter_function + // (we pair these always - function trace should only contain fns with + // enter and leave events!) + tdg.section().add_mapping(function_id, "hello_world"); + REQUIRE(tdg.section().count() == 1); + + // adds a new entry. entry size is dependent on varint_encoding, which + // uses up to 5 bytes packed into a size_t to represent a buffer that + // was originally filled with uint8_t's. 
+ tdg.section().enter_function(function_id); + auto size_with_one_entry = tdg.section().size(); + REQUIRE(size_with_one_entry > 0); + + // adds a new entry + tdg.section().tainted_control_flow(-1, function_id); + auto size_with_two_entries = tdg.section().size(); + REQUIRE((size_with_two_entries / 2) >= size_with_one_entry); + + // adds a new entry + tdg.section().leave_function(function_id); + auto size_with_three_entries = tdg.section().size(); + REQUIRE((size_with_three_entries / 3) >= size_with_one_entry); } } - - WHEN("Multiple strings are added") { - THEN("They should be iterable using begin() and end()") { - string_table.add_string("a"); - string_table.add_string("b"); - string_table.add_string("c"); - string_table.add_string("d"); - - std::vector res; - std::copy(string_table.begin(), string_table.end(), std::back_inserter(res)); - REQUIRE(res.size() == 4); - REQUIRE(res[0] == "a"); - REQUIRE(res[1] == "b"); - REQUIRE(res[2] == "c"); - REQUIRE(res[3] == "d"); - } - } - - WHEN("Adding to the string table") { - THEN("A string bigger than the maximum string size will be truncated and stored") { - // display the info logging - spdlog::set_level(spdlog::level::debug); - - auto len = StringTable::max_entry_size + 10; - std::string too_big(len, 'A'); - REQUIRE_NOTHROW([&](){ - auto offset = string_table.add_string(too_big); - REQUIRE(offset.has_value()); - - std::string_view result = string_table.from_offset(offset.value()); - REQUIRE(result.size() + sizeof(StringTable::length_t) == StringTable::max_entry_size - 1); - }()); - } - - THEN("Can fill the string table with many short strings") { - std::string s{"a"}; - while (auto os = string_table.add_string(s)) { - if (!os.has_value()) { - break; - } - auto result = string_table.from_offset(os.value()); - REQUIRE(s.compare(result.data()) == 0); - } - } - - THEN("Add a maximumly big string and will still be able to add other strings") { - auto size = StringTable::max_entry_size - sizeof(StringTable::length_t); - 
std::string s(size, 'A'); - REQUIRE_NOTHROW([&](){ - auto offset = string_table.add_string(s); - REQUIRE(offset.has_value()); - auto result = string_table.from_offset(offset.value()); - // no truncation happened this time - REQUIRE(result.size() == size); - REQUIRE(s.compare(result.data()) == 0); - }()); - - std::string s2{1, 'B'}; - auto os2 = string_table.add_string(s2); - REQUIRE(os2.has_value()); - - std::string s3("hello"); - auto os3 = string_table.add_string(s3); - REQUIRE(os3.has_value()); - } - } -} - - TEST_CASE("An allocation that is larger than can be represented in the string table will result in truncation and does not prevent adding more strings", "[StringTable]") { - auto alloc_size = - static_cast(std::numeric_limits::max()) + - 1; - alignas(StringTable::offset_t) uint8_t backing[64]; - int dummy = 1; - StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - auto span = StringTable::span_t{&backing[0], alloc_size}; - std::string_view tinystring{"eep"}; - REQUIRE_NOTHROW(st.add_string(tinystring)); - } } // namespace taintdag \ No newline at end of file From 47b0121bbe8e86c71aab3375fb2e807ed16467c3 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 6 Mar 2025 16:28:29 +0000 Subject: [PATCH 059/112] trunk fmt --- examples/analysis/ubet/eval.py | 5 +- examples/analysis/ubet/eval_nitro.py | 17 +- polytracker/include/taintdag/fnmapping.h | 7 +- polytracker/include/taintdag/polytracker.h | 3 +- polytracker/include/taintdag/section.h | 3 +- .../src/passes/tainted_control_flow.cpp | 261 ++++++------ polytracker/src/taintdag/fnmapping.cpp | 10 +- polytracker/src/taintdag/polytracker.cpp | 3 +- polytracker/taint_dag.py | 75 ++-- tests/conftest.py | 9 +- tests/test_cf_log.py | 35 +- tests/test_program_trace.py | 11 +- tests/test_stdin.cpp | 4 +- tests/test_stdin.py | 41 +- unittests/src/taintdag/section.cpp | 392 +++++++++--------- unittests/src/taintdag/storage.cpp | 46 +- unittests/src/taintdag/string_table.cpp | 5 +- 
.../src/taintdag/taint_label_encoding.cpp | 11 +- unittests/src/taintdag/tdag.cpp | 168 ++++---- 19 files changed, 571 insertions(+), 535 deletions(-) diff --git a/examples/analysis/ubet/eval.py b/examples/analysis/ubet/eval.py index 0e219d18..c8ef1328 100644 --- a/examples/analysis/ubet/eval.py +++ b/examples/analysis/ubet/eval.py @@ -1,14 +1,13 @@ # /usr/bin/python import os import random -import sys import subprocess -from typing import List, Tuple +import sys from pathlib import Path +from typing import List, Tuple from polytracker import PolyTrackerTrace - src_arg = Path(sys.argv[1]) no_build = "nobuild" == sys.argv[2] if len(sys.argv) > 2 else False src_dir = src_arg.parent diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index c171091a..062052f8 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -1,19 +1,20 @@ import argparse -from collections import defaultdict -import subprocess import os +import subprocess import sys -from typing import Optional, Set, Iterator, Tuple, Dict -from polytracker import PolyTrackerTrace, taint_dag -from polytracker.taint_dag import TDFile, TDNode, TDSourceNode, TDUnionNode, TDRangeNode -from polytracker.mapping import InputOutputMapping +from collections import defaultdict +from functools import partialmethod from pathlib import Path +from typing import Dict, Iterator, Optional, Set, Tuple + +import cxxfilt # To Silence TQDM! 
from tqdm import tqdm -from functools import partialmethod -import cxxfilt +from polytracker import PolyTrackerTrace, taint_dag +from polytracker.mapping import InputOutputMapping +from polytracker.taint_dag import TDFile, TDNode, TDRangeNode, TDSourceNode, TDUnionNode tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 685db4fa..09d50d5e 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -25,8 +25,8 @@ struct Function { offset_t name_offset; uint32_t function_id; - Function(offset_t name_ofs, uint32_t f_id) : - name_offset(name_ofs), function_id(f_id) {}; + Function(offset_t name_ofs, uint32_t f_id) + : name_offset(name_ofs), function_id(f_id){}; }; class Functions : public FixedSizeAlloc { @@ -42,7 +42,8 @@ class Functions : public FixedSizeAlloc { : FixedSizeAlloc{of.range}, string_table{of.output_file.template section()} {} - std::optional add_mapping(uint32_t function_id, std::string_view function_name); + std::optional add_mapping(uint32_t function_id, + std::string_view function_name); private: StringTable &string_table; diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index a1afd679..7e10f69b 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -65,7 +65,8 @@ class PolyTracker { void leave_function(uint32_t function_id); // Log function name - void record_function_name(uint32_t function_id, std::string_view function_name); + void record_function_name(uint32_t function_id, + std::string_view function_name); // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); diff --git a/polytracker/include/taintdag/section.h b/polytracker/include/taintdag/section.h index ac2efe09..d4682026 100644 --- a/polytracker/include/taintdag/section.h 
+++ b/polytracker/include/taintdag/section.h @@ -142,7 +142,8 @@ template struct FixedSizeAlloc : SectionBase { .t = *new (&*(write_context->mem.begin())) T{std::forward(args)...}}; } - spdlog::error("Failed to allocate memory in the section for the object, so could not construct the object in the tdag section"); + spdlog::error("Failed to allocate memory in the section for the object, so " + "could not construct the object in the tdag section"); return {}; } diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 73f21552..e4336697 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -22,162 +22,163 @@ static llvm::cl::list ignore_lists( "pt-ftrace-ignore-list", - llvm::cl::desc( - "File that specifies functions that pt-tcf should ignore")); + llvm::cl::desc("File that specifies functions that pt-tcf should ignore")); namespace polytracker { namespace { - uint32_t get_or_add_mapping(uintptr_t key, std::unordered_map &mapping, uint32_t &counter) { - if (auto it = mapping.find(key); it != mapping.end()) { - return it->second; - } else { - return mapping[key] = counter++; - } +uint32_t get_or_add_mapping(uintptr_t key, + std::unordered_map &mapping, + uint32_t &counter) { + if (auto it = mapping.find(key); it != mapping.end()) { + return it->second; + } else { + return mapping[key] = counter++; } +} } // namespace - void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); - } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); - } - - llvm::ConstantInt * - TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { - auto func_address = reinterpret_cast(&func); - auto fid = get_or_add_mapping(func_address, function_ids_, 
function_counter_); - return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); - } - - llvm::ConstantInt * - TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { - return get_function_id_const(*(i.getParent()->getParent())); +void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, + llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } - - void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; - } - - // we do not handle VectorTypes yet - if ((*(idx->getType())).isVectorTy()) { - continue; - } - - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); - - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); +} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { + auto func_address = reinterpret_cast(&func); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); + return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); +} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { + return get_function_id_const(*(i.getParent()->getParent())); +} + +void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + llvm::IRBuilder<> ir(&gep); + for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; } - } - void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; + // we do not handle VectorTypes yet + if ((*(idx->getType())).isVectorTy()) { + continue; } - llvm::IRBuilder<> ir(&bi); - auto cond = 
bi.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + auto callret = ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), + get_function_id_const(gep)}); - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + idx = ir.CreateSExtOrTrunc(callret, idx->getType()); } +} - void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; } - void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. - if (si.getType()->isVectorTy()) { - return; - } - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); + llvm::IRBuilder<> ir(&bi); + auto cond = bi.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); - } + bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} - void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::LLVMContext *context = &mod.getContext(); - llvm::IRBuilder<> ir(*context); +void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); - cond_br_log_fn = mod.getOrInsertFunction( - "__polytracker_log_tainted_control_flow", - llvm::AttributeList::get( - mod.getContext(), - {{llvm::AttributeList::FunctionIndex, - 
llvm::Attribute::get(mod.getContext(), - llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - enter_log_fn_type = llvm::FunctionType::get(llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), llvm::Type::getInt8PtrTy(*context)); + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} - fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", enter_log_fn_type); - - fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); +void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + // TODO(hbrodin): Can't handle atm. + if (si.getType()->isVectorTy()) { + return; } - - void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { - if (func.isDeclaration()) { - return; - } - llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - - ir.CreateCall(fn_enter_log_fn, - { - get_function_id_const(func), - ir.CreateGlobalStringPtr(func.getName()) - } - ); + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} + +void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); + + cond_br_log_fn = mod.getOrInsertFunction( + "__polytracker_log_tainted_control_flow", + llvm::AttributeList::get( + mod.getContext(), + {{llvm::AttributeList::FunctionIndex, + llvm::Attribute::get(mod.getContext(), + llvm::Attribute::ReadNone)}}), + ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + + enter_log_fn_type = llvm::FunctionType::get( + llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), + 
llvm::Type::getInt8PtrTy(*context)); + + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", + enter_log_fn_type); + + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", + ir.getVoidTy(), ir.getInt32Ty()); +} + +void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { + if (func.isDeclaration()) { + return; } - - void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); - } - - llvm::PreservedAnalyses - TaintedControlFlowPass::run(llvm::Module &mod, - llvm::ModuleAnalysisManager &mam) { - label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); - declareLoggingFunctions(mod); - auto fnsToIgnore{readIgnoreLists(ignore_lists)}; - - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fnsToIgnore.count(fname.str())) { - continue; - } else { - instrumentFunctionEnter(fn); - visit(fn); - } + llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); + + ir.CreateCall(fn_enter_log_fn, {get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName())}); +} + +void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { + llvm::IRBuilder<> ir(&ri); + ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); +} + +llvm::PreservedAnalyses +TaintedControlFlowPass::run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam) { + label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); + declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + + for (auto &fn : mod) { + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); } - - return llvm::PreservedAnalyses::none(); } + return llvm::PreservedAnalyses::none(); +} + } // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 
f1dbf831..c57e4ed5 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -19,14 +19,16 @@ using index_t = Functions::index_t; } // namespace -std::optional Functions::add_mapping(uint32_t function_id, std::string_view function_name) { +std::optional Functions::add_mapping(uint32_t function_id, + std::string_view function_name) { std::unique_lock mappings_lock(mappings_mutex); if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } - std::optional maybe_name_offset = string_table.add_string(function_name); + std::optional maybe_name_offset = + string_table.add_string(function_name); if (!maybe_name_offset.has_value()) { spdlog::error("Could not write function name to strings table"); return {}; @@ -34,7 +36,9 @@ std::optional Functions::add_mapping(uint32_t function_id, std::string_ auto maybe_ctx = construct(Function(maybe_name_offset.value(), function_id)); if (!maybe_ctx.has_value()) { - spdlog::error("Could not write Function {0} with id {1:d}, string table ofs {2:d} to the tdag functions section", function_name, function_id, maybe_name_offset.value()); + spdlog::error("Could not write Function {0} with id {1:d}, string table " + "ofs {2:d} to the tdag functions section", + function_name, function_id, maybe_name_offset.value()); return {}; } diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index ff540f9e..fa905187 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -178,7 +178,8 @@ void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { output_file_.section().tainted_control_flow(lbl, function_id); } -void PolyTracker::record_function_name(uint32_t function_id, std::string_view function_name) { +void PolyTracker::record_function_name(uint32_t function_id, + std::string_view function_name) { output_file_.section().add_mapping(function_id, function_name); } diff 
--git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 24aeedd3..f6589fae 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -1,47 +1,47 @@ +from ctypes import ( + Structure, + c_char, + c_int32, + c_int64, + c_uint8, + c_uint16, + c_uint32, + c_uint64, + sizeof, +) +from enum import Enum +from mmap import PROT_READ, mmap +from pathlib import Path from typing import ( BinaryIO, - Union, + Dict, Iterable, Iterator, - Optional, - Dict, - Tuple, List, + Optional, Set, + Tuple, Type, + Union, cast, ) from cxxfilt import demangle -from enum import Enum -from pathlib import Path -from mmap import mmap, PROT_READ -from ctypes import ( - Structure, - c_char, - c_int64, - c_uint64, - c_int32, - c_uint32, - c_uint8, - c_uint16, - sizeof, -) from typing_extensions import deprecated +from .inputs import Input from .plugins import Command -from .repl import PolyTrackerREPL from .polytracker import ProgramTrace -from .inputs import Input +from .repl import PolyTrackerREPL from .taint_forest import TaintForest, TaintForestNode from .tracing import ( BasicBlock, ByteOffset, Function, TaintAccess, - TraceEvent, TaintOutput, Taints, + TraceEvent, ) @@ -94,9 +94,11 @@ def enumerate(self): for offset in range(0, len(self.mem), sizeof(TDFDHeader)): yield TDFDHeader.from_buffer_copy(self.mem[offset:]) + @deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") class TDEvent(Structure): """This is an old version of the ControlFlowEvent kept for backward compatibility only""" + _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] class Kind(Enum): @@ -106,9 +108,11 @@ class Kind(Enum): def __repr__(self) -> str: return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" + @deprecated("Use TDControlFlowLog instead, TDEvents section is no longer written") class TDEventsSection: """This is an old version of the CFLog kept for backward compatibility only""" + def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + 
hdr.size] @@ -116,6 +120,7 @@ def __iter__(self): for offset in range(0, len(self.section), sizeof(TDEvent)): yield TDEvent.from_buffer_copy(self.section, offset) + class TDStringSection: """TDAG String Table section. @@ -355,6 +360,7 @@ class TDFunctionsSection: """This section holds the mapping between the function IDs stored in callstack form in the cflog section, and the function names stored in the string table. See fnmapping in the C++ part of the codebase for the "write" side part of Polytracker that pertains to this section. Each entry is an uint32_t as set in fnmapping.cpp, but a TDFnHeader will then contain *two* of these: the function_id and the name_offset. Structure in memory: |offset|function id|...""" + def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -382,10 +388,8 @@ def invalid_fd(self): class TDFnHeader(Structure): # This corresponds to the Function inline constructor in fnmapping.h. # Anything using Structure needs to be in sync with the corresponding C++. - _fields_ = [ - ("name_offset", c_uint32), - ("function_id", c_uint32) - ] + _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] + class TDNode: def __init__(self, affects_control_flow: bool = False): @@ -445,7 +449,6 @@ def __repr__(self) -> str: return f"TDSink fdidx: {self.fdidx} offset: {self.offset} label: {self.label}" - TDSection = Union[ TDLabelSection, TDSourceSection, @@ -536,21 +539,24 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: return lookup def _maybe_demangle(self, function_id: int) -> Union[str, int]: - """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary. """ + """Depending on the age of the tdag, it may not contain a function mapping. 
If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary.""" maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) if maybe_symbol is not None: return demangle(maybe_symbol) else: return function_id - def cflog(self, demangle_symbols: bool=False) -> Iterator[ControlFlowEvent]: + def cflog(self, demangle_symbols: bool = False) -> Iterator[ControlFlowEvent]: """Presents the control flow log. Does not demangle symbols by default, for performance.""" cflog_section = self.sections_by_type[TDControlFlowLogSection] assert isinstance(cflog_section, TDControlFlowLogSection) if demangle_symbols: for cflog_entry in cflog_section: - cflog_entry.callstack[:] = [self._maybe_demangle(function_id) for function_id in cflog_entry.callstack] + cflog_entry.callstack[:] = [ + self._maybe_demangle(function_id) + for function_id in cflog_entry.callstack + ] yield cflog_entry else: @@ -905,8 +911,11 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - if TDFunctionsSection in tdfile.sections_by_type.keys() and len(tdfile.mangled_fn_symbol_lookup) > 0: - for k,v in tdfile.mangled_fn_symbol_lookup: + if ( + TDFunctionsSection in tdfile.sections_by_type.keys() + and len(tdfile.mangled_fn_symbol_lookup) > 0 + ): + for k, v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: print("Error: no Functions section could be read from the tdag!") @@ -917,5 +926,7 @@ def run(self, args): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print("Error: no Control Flow Log section could be read from the tdag!") + print( + "Error: no Control Flow Log section could be read from the tdag!" 
+ ) print(f"Sections that could be read: {tdfile.sections}") diff --git a/tests/conftest.py b/tests/conftest.py index ee0e2eaa..627da284 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,12 @@ -import sys -import pytest import subprocess -import polytracker - +import sys from pathlib import Path from typing import List +import pytest + +import polytracker + def pytest_configure(config): config.addinivalue_line( diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index ae6d80a2..d67b6d22 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -1,20 +1,21 @@ +import subprocess +from pathlib import Path +from typing import List + import cxxfilt import pytest -import subprocess import polytracker -from pathlib import Path - +from polytracker import ProgramTrace from polytracker.taint_dag import ( - ControlFlowEvent, CFEnterFunctionEvent, CFLeaveFunctionEvent, + ControlFlowEvent, TaintedControlFlowEvent, TDControlFlowLogSection, TDNode, ) -from polytracker import ProgramTrace -from typing import List + @pytest.mark.program_trace("test_fntrace.cpp") def test_function_mapping(program_trace: ProgramTrace): @@ -25,9 +26,12 @@ def test_function_mapping(program_trace: ProgramTrace): for symbol in mangled_symbols: assert cxxfilt.demangle(symbol) in expected_names + @pytest.mark.program_trace("test_fntrace.cpp") def test_callstack_mapping(program_trace: ProgramTrace): - cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] for cflog_entry in cflog: assert len(cflog_entry.callstack) > 0 @@ -36,17 +40,20 @@ def test_callstack_mapping(program_trace: ProgramTrace): # when we look up the function id it should map to a name we traced assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup + @pytest.mark.program_trace("test_fntrace.cpp") def test_label_mapping(program_trace: ProgramTrace): - 
cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[TDControlFlowLogSection] + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] for cflog_entry in cflog: if type(cflog_entry) == TaintedControlFlowEvent: - assert hasattr(cflog_entry, 'label') + assert hasattr(cflog_entry, "label") node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) assert node.affects_control_flow else: - assert not hasattr(cflog_entry, 'label') + assert not hasattr(cflog_entry, "label") @pytest.mark.program_trace("test_cf_log.cpp") @@ -81,9 +88,7 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): CFEnterFunctionEvent(["main", "f1(unsigned char)"]), TaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), CFEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TaintedControlFlowEvent( - ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 - ), + TaintedControlFlowEvent(["main", "f1(unsigned char)", "f2(unsigned char)"], 7), CFLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), CFLeaveFunctionEvent(["main", "f1(unsigned char)"]), CFLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit @@ -100,4 +105,6 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): for entry in cflog: for callstack_entry in entry.callstack: - assert callstack_entry in list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) \ No newline at end of file + assert callstack_entry in list( + program_trace.tdfile.mangled_fn_symbol_lookup.values() + ) diff --git a/tests/test_program_trace.py b/tests/test_program_trace.py index b6d765da..2ec6906e 100644 --- a/tests/test_program_trace.py +++ b/tests/test_program_trace.py @@ -1,14 +1,10 @@ from collections import defaultdict -import pytest from subprocess import CalledProcessError from typing import Dict, Union -from polytracker import ( - BasicBlockEntry, - FunctionEntry, - FunctionReturn, - ProgramTrace, -) 
+import pytest + +from polytracker import BasicBlockEntry, FunctionEntry, FunctionReturn, ProgramTrace @pytest.mark.skip(reason="taint_dag does not support traces yet") @@ -182,6 +178,7 @@ def test_cxx_global_object(program_trace: ProgramTrace): assert taints[0].offset == 1 assert taints[0].length == 1 + @pytest.mark.skip(reason="the Taint Forest is currently not implemented") @pytest.mark.program_trace("test_simple_union.cpp", input="ABCDEFGH\n11235878\n") def test_taint_forest(program_trace: ProgramTrace): diff --git a/tests/test_stdin.cpp b/tests/test_stdin.cpp index bfd1ff91..632f59b0 100644 --- a/tests/test_stdin.cpp +++ b/tests/test_stdin.cpp @@ -1,7 +1,7 @@ #include -#include #include #include +#include int stdin_read() { char inbyte; @@ -101,7 +101,7 @@ int main(int argc, char *argv[]) { if (argc != 2) { exit(EXIT_FAILURE); } - + if (std::strncmp(argv[1], "read", 4) == 0) { printf("got read\n"); stdin_read(); diff --git a/tests/test_stdin.py b/tests/test_stdin.py index c8f37875..f117055c 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -1,20 +1,23 @@ -import pytest import subprocess - -import polytracker -from polytracker import taint_dag - from pathlib import Path from random import choice from string import printable +import pytest + +import polytracker +from polytracker import taint_dag + # Ensure stdin reads in multiple ways are verified # examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw -_stdin_data = '\n'.join(choice(printable) for _ in range(40)).encode("utf-8") +_stdin_data = "\n".join(choice(printable) for _ in range(40)).encode("utf-8") + -def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) -> None: - """Rather than using pytest.mark.parametrize on this setup function, split +def _create_tdag_trace( + instrumented_binary: Path, trace_file: Path, method: str +) -> None: + """Rather than using pytest.mark.parametrize on this setup function, split out DRY 
from the test framework so it's easy to see when an individual test fails.""" # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode @@ -26,13 +29,16 @@ def _create_tdag_trace(instrumented_binary: Path, trace_file: Path, method: str) close_fds=False, ).check_returncode() + def _test_trace(trace_file: Path) -> None: - """Test the tdag output, checking its inputs to make sure we tainted and + """Test the tdag output, checking its inputs to make sure we tainted and tracked every byte of stdin. Offsets must be ordered as they were read.""" - - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load(trace_file) + + program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load( + trace_file + ) assert "/dev/stdin" in [input.path for input in program_trace.inputs] - + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) @@ -40,39 +46,46 @@ def _test_trace(trace_file: Path) -> None: assert src_node.offset == expected_offset assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") expected_offset += 1 - + assert expected_offset == len(_stdin_data) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_read(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "read") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fread(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fread") _test_trace(trace_file) - + + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getc") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, 
"getc_unlocked") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "getchar_unlocked") _test_trace(trace_file) + @pytest.mark.program_trace("test_stdin.cpp") def test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): _create_tdag_trace(instrumented_binary, trace_file, "fgetc") diff --git a/unittests/src/taintdag/section.cpp b/unittests/src/taintdag/section.cpp index d012ea99..892590bf 100644 --- a/unittests/src/taintdag/section.cpp +++ b/unittests/src/taintdag/section.cpp @@ -6,200 +6,200 @@ * the LICENSE file found in the root directory of this source tree. */ - #include - #include - - #include "taintdag/section.h" - - #include "utils.h" - - namespace taintdag { - TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - // Exposing the members of SectionBase - struct TestSectionBase : public SectionBase { - TestSectionBase(span_t t) : SectionBase{t} {} - - auto write(size_t s) { return SectionBase::write(s); } - - auto offset(SectionBase::span_t::iterator o) { - return SectionBase::offset(o); - } - - auto offset(uint8_t const *p) { return SectionBase::offset(p); } - }; - - std::uint8_t backing[64]; - TestSectionBase sb{backing}; - SectionBase::span_t last; - - REQUIRE(sb.size() == 0); - - // Allocate 1 byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - last = ctx->mem; - } - REQUIRE(sb.size() == 1); - REQUIRE(sb.offset(last.begin()) == 0); - REQUIRE(sb.offset(&*last.begin()) == 0); - REQUIRE(last.size() == 1); - - // Allocate remainder but 1 byte - auto n = sizeof(backing) - 2; - { - auto ctx = sb.write(n); - 
REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } - - REQUIRE(sb.size() == n + 1); - REQUIRE(sb.offset(last.begin()) == 1); - REQUIRE(sb.offset(&*last.begin()) == 1); - REQUIRE(last.size() == n); - - // Allocate last byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } - - REQUIRE(sb.size() == n + 1 + 1); - REQUIRE(sb.offset(last.begin()) == n + 1); - REQUIRE(sb.offset(&*last.begin()) == n + 1); - REQUIRE(last.size() == 1); - - // Attempt additional allocation, should fail. - auto ctx = sb.write(1); - REQUIRE(!ctx); - - // If offset is requested for out of bounds memory, just abort. Something - // is seriously wrong. - REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), - test::ErrorExit); - REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); - - REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), - test::ErrorExit); - REQUIRE_THROWS_AS( - sb.offset(reinterpret_cast(&backing + sizeof(backing))), - test::ErrorExit); - } - - TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - struct Dummy { - int32_t i; - char c; - - Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} - }; - - // Assumptions for the test case. 
- REQUIRE(alignof(Dummy) == 4); - REQUIRE(sizeof(Dummy) == 8); - - using Section = FixedSizeAlloc; - - const size_t backing_count = 3; - const size_t backing_bytes = backing_count * sizeof(Dummy); - - // To ensure we get correct alignment of the backing - alignas(Dummy) std::uint8_t backing[backing_bytes]; - Section s{backing}; - - REQUIRE(s.entry_size() == sizeof(Dummy)); - REQUIRE(s.align_of == alignof(Dummy)); - REQUIRE(s.size() == 0); - REQUIRE(s.count() == 0); - REQUIRE(s.begin() == s.end()); - - SECTION("Adding instances affect size, count and constructed instance is " - "available") { - // Can add first entry - { - auto ctx = s.construct(999, 'A'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 999); - REQUIRE(ctx->t.c == 'A'); - REQUIRE(s.index(ctx->t) == 0); - } - REQUIRE(s.count() == 1); - REQUIRE(s.size() == sizeof(Dummy)); - - // Can add when there is already an entry but not full. - { - auto ctx = s.construct(33, 'B'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 33); - REQUIRE(ctx->t.c == 'B'); - REQUIRE(s.index(ctx->t) == 1); - } - REQUIRE(s.count() == 2); - REQUIRE(s.size() == 2 * sizeof(Dummy)); - - // Can fill the backing store with entries - { - auto ctx = s.construct(-1, 'C'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == -1); - REQUIRE(ctx->t.c == 'C'); - REQUIRE(s.index(ctx->t) == 2); - } - REQUIRE(s.count() == 3); - REQUIRE(s.size() == 3 * sizeof(Dummy)); - - // Can't insert beyound capacity - auto ctx = s.construct(-5, 'D'); - REQUIRE(!ctx); - } - - SECTION("Require aligned construction") { - SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; - REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); - - SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); - - SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); - } - - SECTION("Require size to be a multiple of align_of") { - SectionBase::span_t b1{&backing[0], sizeof(backing) - 1}; - 
REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); - - SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); - - SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); - } - - SECTION("Iteration") { - s.construct(-1, 'a'); - REQUIRE(std::distance(s.begin(), s.end()) == 1); - s.construct(-2, 'b'); - REQUIRE(std::distance(s.begin(), s.end()) == 2); - s.construct(-3, 'c'); - REQUIRE(std::distance(s.begin(), s.end()) == 3); - - // Know that begin is valid due to above - auto &first = *s.begin(); - REQUIRE(first.i == -1); - REQUIRE(first.c == 'a'); - } +#include +#include + +#include "taintdag/section.h" + +#include "utils.h" + +namespace taintdag { +TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + // Exposing the members of SectionBase + struct TestSectionBase : public SectionBase { + TestSectionBase(span_t t) : SectionBase{t} {} + + auto write(size_t s) { return SectionBase::write(s); } + + auto offset(SectionBase::span_t::iterator o) { + return SectionBase::offset(o); + } + + auto offset(uint8_t const *p) { return SectionBase::offset(p); } + }; + + std::uint8_t backing[64]; + TestSectionBase sb{backing}; + SectionBase::span_t last; + + REQUIRE(sb.size() == 0); + + // Allocate 1 byte + { + auto ctx = sb.write(1); + REQUIRE(ctx); + last = ctx->mem; + } + REQUIRE(sb.size() == 1); + REQUIRE(sb.offset(last.begin()) == 0); + REQUIRE(sb.offset(&*last.begin()) == 0); + REQUIRE(last.size() == 1); + + // Allocate remainder but 1 byte + auto n = sizeof(backing) - 2; + { + auto ctx = sb.write(n); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1); + REQUIRE(sb.offset(last.begin()) == 1); + REQUIRE(sb.offset(&*last.begin()) == 1); + REQUIRE(last.size() == n); + + // Allocate last byte + 
{ + auto ctx = sb.write(1); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1 + 1); + REQUIRE(sb.offset(last.begin()) == n + 1); + REQUIRE(sb.offset(&*last.begin()) == n + 1); + REQUIRE(last.size() == 1); + + // Attempt additional allocation, should fail. + auto ctx = sb.write(1); + REQUIRE(!ctx); + + // If offset is requested for out of bounds memory, just abort. Something + // is seriously wrong. + REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), + test::ErrorExit); + REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); + + REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), + test::ErrorExit); + REQUIRE_THROWS_AS( + sb.offset(reinterpret_cast(&backing + sizeof(backing))), + test::ErrorExit); +} + +TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + struct Dummy { + int32_t i; + char c; + + Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} + }; + + // Assumptions for the test case. 
+ REQUIRE(alignof(Dummy) == 4); + REQUIRE(sizeof(Dummy) == 8); + + using Section = FixedSizeAlloc; + + const size_t backing_count = 3; + const size_t backing_bytes = backing_count * sizeof(Dummy); + + // To ensure we get correct alignment of the backing + alignas(Dummy) std::uint8_t backing[backing_bytes]; + Section s{backing}; + + REQUIRE(s.entry_size() == sizeof(Dummy)); + REQUIRE(s.align_of == alignof(Dummy)); + REQUIRE(s.size() == 0); + REQUIRE(s.count() == 0); + REQUIRE(s.begin() == s.end()); + + SECTION("Adding instances affect size, count and constructed instance is " + "available") { + // Can add first entry + { + auto ctx = s.construct(999, 'A'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 999); + REQUIRE(ctx->t.c == 'A'); + REQUIRE(s.index(ctx->t) == 0); } - } // namespace taintdag \ No newline at end of file + REQUIRE(s.count() == 1); + REQUIRE(s.size() == sizeof(Dummy)); + + // Can add when there is already an entry but not full. + { + auto ctx = s.construct(33, 'B'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 33); + REQUIRE(ctx->t.c == 'B'); + REQUIRE(s.index(ctx->t) == 1); + } + REQUIRE(s.count() == 2); + REQUIRE(s.size() == 2 * sizeof(Dummy)); + + // Can fill the backing store with entries + { + auto ctx = s.construct(-1, 'C'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == -1); + REQUIRE(ctx->t.c == 'C'); + REQUIRE(s.index(ctx->t) == 2); + } + REQUIRE(s.count() == 3); + REQUIRE(s.size() == 3 * sizeof(Dummy)); + + // Can't insert beyound capacity + auto ctx = s.construct(-5, 'D'); + REQUIRE(!ctx); + } + + SECTION("Require aligned construction") { + SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; + REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Require size to be a multiple of align_of") { + SectionBase::span_t 
b1{&backing[0], sizeof(backing) - 1}; + REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Iteration") { + s.construct(-1, 'a'); + REQUIRE(std::distance(s.begin(), s.end()) == 1); + s.construct(-2, 'b'); + REQUIRE(std::distance(s.begin(), s.end()) == 2); + s.construct(-3, 'c'); + REQUIRE(std::distance(s.begin(), s.end()) == 3); + + // Know that begin is valid due to above + auto &first = *s.begin(); + REQUIRE(first.i == -1); + REQUIRE(first.c == 'a'); + } +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/storage.cpp b/unittests/src/taintdag/storage.cpp index aec583c2..a94bf001 100644 --- a/unittests/src/taintdag/storage.cpp +++ b/unittests/src/taintdag/storage.cpp @@ -14,27 +14,27 @@ #include "utils.h" namespace taintdag { - TEST_CASE("Type properties of FixedSizeFile", "[FixedSizeFile]") { - // Don't want multiple copies referring to the same file - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The FixedSizeFile is currently not move - // constructible/assignable. There is nothing preventing such an - // implementation. Currently there is no need so leave this as is. - REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); - } - - TEST_CASE("Type properties of MMapFile", "[MMapFile]") { - // Don't want multiple copies referring to the same regions - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. - // Behavior is currently inherited from FixedSizeFile. Should that change, - // the MMapFile would change as well. 
- REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); - } +TEST_CASE("Type properties of FixedSizeFile", "[FixedSizeFile]") { + // Don't want multiple copies referring to the same file + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The FixedSizeFile is currently not move + // constructible/assignable. There is nothing preventing such an + // implementation. Currently there is no need so leave this as is. + REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); +} + +TEST_CASE("Type properties of MMapFile", "[MMapFile]") { + // Don't want multiple copies referring to the same regions + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. + // Behavior is currently inherited from FixedSizeFile. Should that change, + // the MMapFile would change as well. + REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); +} } // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/string_table.cpp b/unittests/src/taintdag/string_table.cpp index 32b2c48c..64c26890 100644 --- a/unittests/src/taintdag/string_table.cpp +++ b/unittests/src/taintdag/string_table.cpp @@ -15,9 +15,8 @@ #include "utils.h" namespace taintdag { -TEST_CASE( - "The Sources and StringTable sections can store source entries", - "[Sources, StringTable]") { +TEST_CASE("The Sources and StringTable sections can store source entries", + "[Sources, StringTable]") { OutputFile of{std::tmpnam(nullptr)}; auto &sources_section{of.section()}; auto &string_table{of.section()}; diff --git a/unittests/src/taintdag/taint_label_encoding.cpp b/unittests/src/taintdag/taint_label_encoding.cpp index 9085394b..749854aa 100644 --- a/unittests/src/taintdag/taint_label_encoding.cpp +++ b/unittests/src/taintdag/taint_label_encoding.cpp @@ -7,8 +7,8 @@ using namespace 
taintdag; TEST_CASE("Encoding decoding") { - for (auto i=0;i<100000;i++) { - auto [t,_] = test::rand_taint(); + for (auto i = 0; i < 100000; i++) { + auto [t, _] = test::rand_taint(); auto encoded = taintdag::encode(t); Taint decoded = taintdag::decode(encoded); REQUIRE(decoded == t); @@ -25,7 +25,7 @@ TEST_CASE("Affects control flow") { } TEST_CASE("Basic sanity checks") { - for (size_t i=0;i<100000;i++) { + for (size_t i = 0; i < 100000; i++) { auto [st, _] = test::random_source_taint(); auto encoded = encode(st); REQUIRE((encoded >> source_taint_bit_shift)); @@ -34,15 +34,13 @@ TEST_CASE("Basic sanity checks") { } } - TEST_CASE("Compare equal ignore cf") { - for (size_t i=0;i<1000;i++) { + for (size_t i = 0; i < 1000; i++) { auto [t1, _1] = test::rand_taint(); auto [t2, _2] = test::rand_taint(); if (t1 == t2) continue; - auto e1 = encode(t1); // Affects control flow auto e1cf = add_affects_control_flow(e1); @@ -54,6 +52,5 @@ TEST_CASE("Compare equal ignore cf") { REQUIRE(equal_ignore_cf(e1, e1cf)); REQUIRE(!equal_ignore_cf(e1, e2)); REQUIRE(!equal_ignore_cf(e1cf, e2)); - } } \ No newline at end of file diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index eb7175ef..b6527d52 100644 --- a/unittests/src/taintdag/tdag.cpp +++ b/unittests/src/taintdag/tdag.cpp @@ -23,88 +23,90 @@ #include "utils.h" namespace taintdag { - TEST_CASE("Test basic TDAG construction", "[Integration]") { - using SourceLabelIndexSection = BitmapSectionBase<5, BitCount{max_label} + 1>; - using ConcreteOutputFile = - OutputFile; - ConcreteOutputFile tdg("test.tdag"); - - SECTION("Sources") { - auto idx = tdg.section().add_source("sourcename", -1); - REQUIRE(idx); - REQUIRE(*idx == 0); - REQUIRE(tdg.section().count() == 1); - auto idx2 = tdg.section().add_source("next-source", 2); - REQUIRE(*idx2 == 1); - REQUIRE(tdg.section().count() == 2); - } - - SECTION("Labels") { - // 25 is randomly chosen; ranges can be bigger - unsigned long length = rand() % 25 + 1; - - 
// label range represents a data structure like an array - auto test_range = - tdg.section().create_source_labels(-1, -1, length); - REQUIRE(test_range.first != test_range.second); - - // todo(kaoudis) this seems like it should be specific, on the order of the - // number of items in the range. why isn't it? - auto size_with_range = tdg.section().count(); - REQUIRE(size_with_range > 0); - - tdg.section().set_range(BitIndex{test_range.first}, BitCount{length}); - REQUIRE(tdg.section().size() > 0); - - // label union represents a step in the progression of taint - auto test_union = tdg.section().union_taint(test_range.first, test_range.second); - REQUIRE(test_union != test_range.first); - REQUIRE(test_union != test_range.second); - - // added just one new label - the union - REQUIRE(tdg.section().count() == size_with_range + 1); - } - - SECTION("String Table") { - auto offset1 = tdg.section().add_string("Hello"); - auto offset2 = tdg.section().add_string("World!"); - REQUIRE(offset1 != offset2); - // for the string table, size() yields the size of all included entries, - // plus the size of the offsets to them - REQUIRE(tdg.section().size() == 16); - } - - SECTION("Sinks") { - tdg.section().log_single(-1, -1, 0); - REQUIRE(tdg.section().count() == 1); - } - - SECTION("Tainted Control Flow (includes String Table and Functions)") { - int function_id = 1; - - // just before enter_function, cf __polytracker_enter_function - // (we pair these always - function trace should only contain fns with - // enter and leave events!) - tdg.section().add_mapping(function_id, "hello_world"); - REQUIRE(tdg.section().count() == 1); - - // adds a new entry. entry size is dependent on varint_encoding, which - // uses up to 5 bytes packed into a size_t to represent a buffer that - // was originally filled with uint8_t's. 
- tdg.section().enter_function(function_id); - auto size_with_one_entry = tdg.section().size(); - REQUIRE(size_with_one_entry > 0); - - // adds a new entry - tdg.section().tainted_control_flow(-1, function_id); - auto size_with_two_entries = tdg.section().size(); - REQUIRE((size_with_two_entries / 2) >= size_with_one_entry); - - // adds a new entry - tdg.section().leave_function(function_id); - auto size_with_three_entries = tdg.section().size(); - REQUIRE((size_with_three_entries / 3) >= size_with_one_entry); - } +TEST_CASE("Test basic TDAG construction", "[Integration]") { + using SourceLabelIndexSection = BitmapSectionBase<5, BitCount{max_label} + 1>; + using ConcreteOutputFile = + OutputFile; + ConcreteOutputFile tdg("test.tdag"); + + SECTION("Sources") { + auto idx = tdg.section().add_source("sourcename", -1); + REQUIRE(idx); + REQUIRE(*idx == 0); + REQUIRE(tdg.section().count() == 1); + auto idx2 = tdg.section().add_source("next-source", 2); + REQUIRE(*idx2 == 1); + REQUIRE(tdg.section().count() == 2); } + + SECTION("Labels") { + // 25 is randomly chosen; ranges can be bigger + unsigned long length = rand() % 25 + 1; + + // label range represents a data structure like an array + auto test_range = + tdg.section().create_source_labels(-1, -1, length); + REQUIRE(test_range.first != test_range.second); + + // todo(kaoudis) this seems like it should be specific, on the order of the + // number of items in the range. why isn't it? 
+ auto size_with_range = tdg.section().count(); + REQUIRE(size_with_range > 0); + + tdg.section().set_range(BitIndex{test_range.first}, + BitCount{length}); + REQUIRE(tdg.section().size() > 0); + + // label union represents a step in the progression of taint + auto test_union = + tdg.section().union_taint(test_range.first, test_range.second); + REQUIRE(test_union != test_range.first); + REQUIRE(test_union != test_range.second); + + // added just one new label - the union + REQUIRE(tdg.section().count() == size_with_range + 1); + } + + SECTION("String Table") { + auto offset1 = tdg.section().add_string("Hello"); + auto offset2 = tdg.section().add_string("World!"); + REQUIRE(offset1 != offset2); + // for the string table, size() yields the size of all included entries, + // plus the size of the offsets to them + REQUIRE(tdg.section().size() == 16); + } + + SECTION("Sinks") { + tdg.section().log_single(-1, -1, 0); + REQUIRE(tdg.section().count() == 1); + } + + SECTION("Tainted Control Flow (includes String Table and Functions)") { + int function_id = 1; + + // just before enter_function, cf __polytracker_enter_function + // (we pair these always - function trace should only contain fns with + // enter and leave events!) + tdg.section().add_mapping(function_id, "hello_world"); + REQUIRE(tdg.section().count() == 1); + + // adds a new entry. entry size is dependent on varint_encoding, which + // uses up to 5 bytes packed into a size_t to represent a buffer that + // was originally filled with uint8_t's. 
+ tdg.section().enter_function(function_id); + auto size_with_one_entry = tdg.section().size(); + REQUIRE(size_with_one_entry > 0); + + // adds a new entry + tdg.section().tainted_control_flow(-1, function_id); + auto size_with_two_entries = tdg.section().size(); + REQUIRE((size_with_two_entries / 2) >= size_with_one_entry); + + // adds a new entry + tdg.section().leave_function(function_id); + auto size_with_three_entries = tdg.section().size(); + REQUIRE((size_with_three_entries / 3) >= size_with_one_entry); + } +} } // namespace taintdag \ No newline at end of file From 8350eb208711684c3d241d28004f29b414ba8a2b Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 6 Mar 2025 16:58:44 +0000 Subject: [PATCH 060/112] document tests --- docs/tdag.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tdag.md b/docs/tdag.md index 4a5105d0..9ba2c0a2 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -38,8 +38,8 @@ Some specifics: - The Source Label Index is a bitmap that defines how to index the sources section. - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) - [Strings](../polytracker/include/taintdag/string_table.h) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. -- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see test_cf_log.py for details of how this looks/works. 
-- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. see test_cf_log.py for details of how this looks/works. +- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see tests/test_cf_log.py for how this layout looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow ("tainted control flow" or "control affecting data flow" are ways that we have referred to this subset of data flow) passed through, plus recorded function ids that can be mapped back through the Functions section to the String Table to get mangled symbols. see test_cf_log.py and unittests/src/taintdag/tdag.cpp for details of how this looks/works. ## TDAG Contents From b960e16f26f8c37b465d79d6a25be0a8142f6dec Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Mar 2025 16:03:57 +0000 Subject: [PATCH 061/112] completed numbers --- examples/analysis/timings.md | 107 +++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 examples/analysis/timings.md diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md new file mode 100644 index 00000000..baad9eaf --- /dev/null +++ b/examples/analysis/timings.md @@ -0,0 +1,107 @@ +# What did this change break? +Hopefully nothing? + +# What is this change doing? 
+My goal is for taint tracking to work exactly as before, but to clean up the ftrace/cflog/events side of the house, unifying `--cflog` and `--ftrace` options (cleaning up / simplifying how we are writing to the Functions, Events, Control Flow Log, and String Table sections overall) so we don't add duplicate instrumentation to software or write duplicate data to the TDAG and/or separate files (i.e., functionid.json) anymore. + +Everything that I could build got run on example inputs to make sure it worked as expected. As a part of these changes we don't write to functionid.json anymore and instead use the space in the tdag that we were already allocating but not filling in, since it's a humongous region we don't use all of anyway. TDAG size is fixed, but our usage of it is slightly more efficient currently. A future goal could be to only mmap the space we need so file size can be smaller. + +# Instrumentation Time and Resulting Bitcode Sizes +These experiments reproduce the measurements from the +[PolyTracker paper](https://github.com/trailofbits/publications/blob/master/papers/issta24-polytracker.pdf), +but on different hardware. For uniformity, experiments were all conducted in an Ubuntu 24.04 cloud VM with + +- 500 GB disk +- 64 GiB RAM +- 8 vCPUs + +I'm comparing the before-and-after of the TDAG condensation changes on `kaoudis/merge-function-sections` with `master` at `e618c4d6d7481326d0ea76073d663d2b867e0e9d`, the hash of the work included in the camera-ready version of the prior paper. The question I'm answering here is "what is the net result of these changes in terms of how the software works". + +All the current example Dockerfiles on `master` that work right now (we/I need to clean up the others a bit; they're a bit bitrotted) are included here for completeness. The following measurements aren't terribly scientific: they are from one run of the Dockerfile each (whereas for the paper I averaged ten runs apiece). 
+ +## Bitcode sizes +The "in" .bc file is the whole-program .bc file that gets the first layer of instrumentation applied to it. The CFlog .bc is the "in" .bc with CFlog instrumentation, pre-optimization (if optimization occurs in the PolyTracker build). the final .bc file is the instrumented .bc file ending in `.instrumented.bc` that we lower to an executable. bc size may have changed because what instrumentation we use changed: I removed the separate function name recording / events pass-level code, and added function name recording to the tdag into the cflog pass. I also removed the separate `--ftrace` and `--taint` options: we do `--taint` by default, and `--ftrace` is part of `--cflog` now. + +Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive bc size for them on `master`. + +As measured by `ls -lb` in the container, and normalized into MiB: + +| Dockerfile | In .bc size | Final .bc BEFORE (taint, ftrace, events) | Final .bc BEFORE (cflog, taint, ftrace, events) | CFlog-_only_ .bc |Final .bc AFTER (cflog, taint) | Final .bc AFTER (taint only) | +| -- | -- | -- | -- | -- | -- | -- | +| Dockerfile-acropalypse.demo | 1.65 MiB | 1.89 MiB | | 1.89 MiB | 4.4 MiB | 3.94 MiB | +| Dockerfile-daedalus-pdf.demo | 4.15 MiB | 4.76 MiB | 17.83 MiB | 4.95 MiB | 17.62 MiB | 16.39 MiB | +| Dockerfile-ffmpeg.demo | 30.52 MiB | 33.80 MiB | | 33.64 MiB | 84.3 MiB | 84.72 MiB | +| Dockerfile-file.demo | 0.85 MiB | 0.95 MiB | | 0.96 MiB | 1.98 MiB | 1.99 MiB | +| Dockerfile-libjpeg.demo | 1.25 MiB | 1.36 MiB | | 1.36 MiB | 3.33 MiB | 3.62 MiB | +| Dockerfile-mupdf.demo | 14.56 MiB | 18.19 MiB | | 18.19 MiB | 66 MiB | 82.72 MiB | +| Dockerfile-nitro-nitf.demo | 5.79 MiB | 8.23 MiB | 20.64 MiB | 6.57 MiB | 20.62 MiB | 18 MiB | +| Dockerfile-openjpeg.demo | 0.89 MiB | 1.15 MiB | | 1.13 MiB | 4.29 MiB | 3.71 MiB | +| Dockerfile-poppler.demo 
`pdftops` | 8.82 MiB | 10.25 MiB | 35.58 MiB | 10.17 MiB | 35.77 MiB | 35.99 MiB | +| Dockerfile-poppler.demo `pdftotext` | 8.04 MiB | 9.29 MiB | 31.82 MiB | 9.26 MiB | 32.01 MiB | 32.09 MiB | +| Dockerfile-qpdf.demo | 10.92 MiB | 13.14 MiB | | 13.14 MiB | 49.21 MiB | 47.65 MiB | +| Dockerfile-xpdf.demo `pdfinfo` | 3.78 MiB | 4.56 MiB | 17.14 MiB | 4.37 MiB | 16.88 MiB | 17.80 MiB | +| Dockerfile-xpdf.demo `pdftops` | 4.75 MiB | 5.78 MiB | 22.52 MiB | 5.55 MiB | 22.25 MiB | 23.85 MiB | +| Dockerfile-xpdf.demo `pdftotext` | 3.98 MiB | 4.85 MiB | 18.67 MiB | 4.64 MiB | 18.41 MiB | 19.37 MiB | + +## TDAG sizes +TDAG size is fixed because of how we write TDAGs right now; it didn't change. + +## Total instrumentation time + "Instrumentation time" here refers either to the time Docker takes to run `polytracker instrument-targets`, which includes how long it takes to do both cflog and taint label instrumentation placement as well as executable creation, or the time to do equivalent steps. + + Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive instrumentation time for them on `master`. 
+ + As measured by Docker: + +| Dockerfile | Instrumentation time (taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint) AFTER | Instrumentation time (taint only) AFTER | +| -- | -- | -- | -- | -- | +| Dockerfile-acropalypse.demo | 26.7\* s | | 30.3\* s | 27.3\* s | +| Dockerfile-daedalus-pdf.demo | 34.2 s | 39.1 s | 37.5 s | 35.2 s | +| Dockerfile-ffmpeg.demo | 150.7 s | | 156.5 s | 158.3 s | +| Dockerfile-file.demo | 12.1 s | | 12.4 s | 12.6 s | +| Dockerfile-libjpeg.demo | 22.7 s | | 21.2 s | 23.6 s | +| Dockerfile-mupdf.demo | 152.4 s | | 129.2 s | 154.8 s | +| Dockerfile-nitro-nitf.demo | 30 s | 33.7 s | 33.8 s | 29.5 s | +| Dockerfile-openjpeg.demo | 45.3\* s | | 51.3\* s | 49.6\* s | +| Dockerfile-poppler.demo `pdftops` | 291.2 s | 279.1 s | 290 s | 305.9 s | +| Dockerfile-poppler.demo `pdftotext` | 255.5 s | 249 s | 255.3 s | 268.5 s | +| Dockerfile-qpdf.demo | 382.9 s | | 393.8 s | 391.9 s | +| Dockerfile-xpdf.demo `pdfinfo` | 154.5 s | 141.9 s | 143.3 s | 164.2 s | +| Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | +| Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | + +## Notes + +### \* + +I combined the times recorded by Docker for extraction, linking, instrumentation, optimization (if included), and lowering to get this figure since that's everything `instrument-targets` would do. + +NB Dockerfile-acropalypse.demo does not run the typical bitcode optimization step as part of instrumentation and lowering. + +### N/As + +The following Dockerfiles did not build on master or on the new branch. Here's minimal notes on why. These should be investigated later if we care to keep them up to date. + +#### DaeDaLus NITF + +DaeDaLus NITF parser fails on the Cabal build of DaeDaLus, and also did at the time I did the prior eval work for the paper. 
I think this is because the DaeDaLus repository main branch is broken, and we need to pin a prior working commit in that Dockerfile. + +#### jq + +Linking failed for build defined in Dockerfile. Also to investigate later; was not included in paper. + +#### libgen + +`go get` is no longer a supported command outside a module, and the Go setup in this Dockerfile would need to be updated. + +#### listgen + +After solving a couple minor errors due to zlib URL changing etc, building the libxml2-2.9.10 codebase failed with Python macro errors. + +#### pdfium + +pdfium's build halted and prompted me for the country of my keyboard. This will need to be fixed so the build is completely automated. I think I recall doing some work here that I didn't save to make the build more automated - I think a particular commit might need to be pinned in the source repo. + +#### png + +The png Dockerfile seems unfinished and doesn't instrument anything. I have a different version in a volume saved from a different cloud provider that I can pull out and use later.
From 72d789d3cb6129d968ef0cc942f0a974fda36e30 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Mar 2025 16:04:37 +0000 Subject: [PATCH 062/112] pin daedalus for the pdf example --- examples/Dockerfile-daedalus-pdf.demo | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/Dockerfile-daedalus-pdf.demo b/examples/Dockerfile-daedalus-pdf.demo index 816ee44e..d709270c 100644 --- a/examples/Dockerfile-daedalus-pdf.demo +++ b/examples/Dockerfile-daedalus-pdf.demo @@ -9,6 +9,7 @@ RUN apt-get update && apt-get -y upgrade && apt-get install -y git pkg-config RUN git clone --recursive https://github.com/GaloisInc/daedalus.git WORKDIR /polytracker/the_klondike/daedalus +RUN git checkout 46a2502a8c68b17534079356a71af90e87801fdc FROM trailofbits/polytracker:latest LABEL org.opencontainers.image.authors="evan.sultanik@trailofbits.com" From 046769bf7a486fc900ed215d3bcb7952832d2216 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Mar 2025 16:07:50 +0000 Subject: [PATCH 063/112] note about weird sizing --- examples/analysis/timings.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md index baad9eaf..bab57bc5 100644 --- a/examples/analysis/timings.md +++ b/examples/analysis/timings.md @@ -1,5 +1,5 @@ # What did this change break? -Hopefully nothing? +Hopefully nothing? :D # What is this change doing? My goal is for taint tracking to work exactly as before, but to clean up the ftrace/cflog/events side of the house, unifying `--cflog` and `--ftrace` options (cleaning up / simplifying how we are writing to the Functions, Events, Control Flow Log, and String Table sections overall) so we don't add duplicate instrumentation to software or write duplicate data to the TDAG and/or separate files (i.e., functionid.json) anymore. @@ -70,6 +70,9 @@ TDAG size is fixed because of how we write TDAGs right now; it didn't change. 
| Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | | Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | +# What's weird here +The sizes of bitcode when instrumented with all our passes before AND after these changes seem like they could be indicative of extra instrumentation (perhaps the labels pass instrumenting the cflog and/or functions pass?), though I haven't dug into whether this is truly happening yet. It doesn't _seem like_ this is exactly hurting anything at the moment, but I would be curious if others notice the same. + ## Notes ### \* From 50cfd0e2b33ca112a49b2ec753a9aecad5f0395f Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Mar 2025 16:08:50 +0000 Subject: [PATCH 064/112] note about daedalus --- examples/analysis/timings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md index bab57bc5..1d318b63 100644 --- a/examples/analysis/timings.md +++ b/examples/analysis/timings.md @@ -87,7 +87,7 @@ The following Dockerfiles did not build on master or on the new branch. Here's m #### DaeDaLus NITF -DaeDaLus NITF parser fails on the Cabal build of DaeDaLus, and also did at the time I did the prior eval work for the paper. I think this is because the DaeDaLus repository main branch is broken, and we need to pin a prior working commit in that Dockerfile. +DaeDaLus NITF parser fails on the Cabal build of DaeDaLus, and also did at the time I did the prior eval work for the paper. I think this is because the DaeDaLus repository main branch is broken, and we need to pin a prior working commit in that Dockerfile. I don't know what this commit would be - the one mentioned in the Dockerfile doesn't build, either. 
#### jq From 1128eb657a4cf43d7409225453e76fb8690c0e5c Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Mon, 24 Mar 2025 16:11:22 +0000 Subject: [PATCH 065/112] trunk fmt'd --- examples/analysis/timings.md | 83 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md index 1d318b63..6218405e 100644 --- a/examples/analysis/timings.md +++ b/examples/analysis/timings.md @@ -1,12 +1,15 @@ # What did this change break? + Hopefully nothing? :D # What is this change doing? + My goal is for taint tracking to work exactly as before, but to clean up the ftrace/cflog/events side of the house, unifying `--cflog` and `--ftrace` options (cleaning up / simplifying how we are writing to the Functions, Events, Control Flow Log, and String Table sections overall) so we don't add duplicate instrumentation to software or write duplicate data to the TDAG and/or separate files (i.e., functionid.json) anymore. Everything that I could build got run on example inputs to make sure it worked as expected. As a part of these changes we don't write to functionid.json anymore and just use the space we were allocating and not filling in in the tdag, since it's a humongous region we don't use all of anyway. TDAG size is fixed, but our usage of it is slightly more efficient currently. A future goal could be to only mmap the space we need so file size can be smaller. # Instrumentation Time and Resulting Bitcode Sizes + These experiments reproduce the measurements from the [PolyTracker paper](https://github.com/trailofbits/publications/blob/master/papers/issta24-polytracker.pdf), but on different hardware. 
For uniformity, experiments were all conducted in an Ubuntu 24.04 cloud VM with @@ -20,57 +23,61 @@ I'm comparing the before-and-after of the TDAG condensation changes on `kaoudis/ All the current example Dockerfiles on `master` that work right now (we/I need to clean up the others a bit; they're a bit bitrotted) are included here for completeness. The following measurements aren't terribly scientific, they are from one run of the Dockerfile each (whereas for the paper I averaged ten runs apiece). ## Bitcode sizes + The "in" .bc file is the whole-program .bc file that gets the first layer of instrumentation applied to it. The CFlog .bc is the "in" .bc with CFlog instrumentation, pre-optimization (if optimization occurs in the PolyTracker build). the final .bc file is the instrumented .bc file ending in `.instrumented.bc` that we lower to an executable. bc size may have changed because what instrumentation we use changed: I removed the separate function name recording / events pass-level code, and added function name recording to the tdag into the cflog pass. I also removed the separate `--ftrace` and `--taint` options: we do `--taint` by default, and `--ftrace` is part of `--cflog` now. Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive bc size for them on `master`. 
As measured by `ls -lb` in the container, and normalized into MiB: -| Dockerfile | In .bc size | Final .bc BEFORE (taint, ftrace, events) | Final .bc BEFORE (cflog, taint, ftrace, events) | CFlog-_only_ .bc |Final .bc AFTER (cflog, taint) | Final .bc AFTER (taint only) | -| -- | -- | -- | -- | -- | -- | -- | -| Dockerfile-acropalypse.demo | 1.65 MiB | 1.89 MiB | | 1.89 MiB | 4.4 MiB | 3.94 MiB | -| Dockerfile-daedalus-pdf.demo | 4.15 MiB | 4.76 MiB | 17.83 MiB | 4.95 MiB | 17.62 MiB | 16.39 MiB | -| Dockerfile-ffmpeg.demo | 30.52 MiB | 33.80 MiB | | 33.64 MiB | 84.3 MiB | 84.72 MiB | -| Dockerfile-file.demo | 0.85 MiB | 0.95 MiB | | 0.96 MiB | 1.98 MiB | 1.99 MiB | -| Dockerfile-libjpeg.demo | 1.25 MiB | 1.36 MiB | | 1.36 MiB | 3.33 MiB | 3.62 MiB | -| Dockerfile-mupdf.demo | 14.56 MiB | 18.19 MiB | | 18.19 MiB | 66 MiB | 82.72 MiB | -| Dockerfile-nitro-nitf.demo | 5.79 MiB | 8.23 MiB | 20.64 MiB | 6.57 MiB | 20.62 MiB | 18 MiB | -| Dockerfile-openjpeg.demo | 0.89 MiB | 1.15 MiB | | 1.13 MiB | 4.29 MiB | 3.71 MiB | -| Dockerfile-poppler.demo `pdftops` | 8.82 MiB | 10.25 MiB | 35.58 MiB | 10.17 MiB | 35.77 MiB | 35.99 MiB | -| Dockerfile-poppler.demo `pdftotext` | 8.04 MiB | 9.29 MiB | 31.82 MiB | 9.26 MiB | 32.01 MiB | 32.09 MiB | -| Dockerfile-qpdf.demo | 10.92 MiB | 13.14 MiB | | 13.14 MiB | 49.21 MiB | 47.65 MiB | -| Dockerfile-xpdf.demo `pdfinfo` | 3.78 MiB | 4.56 MiB | 17.14 MiB | 4.37 MiB | 16.88 MiB | 17.80 MiB | -| Dockerfile-xpdf.demo `pdftops` | 4.75 MiB | 5.78 MiB | 22.52 MiB | 5.55 MiB | 22.25 MiB | 23.85 MiB | -| Dockerfile-xpdf.demo `pdftotext` | 3.98 MiB | 4.85 MiB | 18.67 MiB | 4.64 MiB | 18.41 MiB | 19.37 MiB | +| Dockerfile | In .bc size | Final .bc BEFORE (taint, ftrace, events) | Final .bc BEFORE (cflog, taint, ftrace, events) | CFlog-_only_ .bc | Final .bc AFTER (cflog, taint) | Final .bc AFTER (taint only) | +| ----------------------------------- | ----------- | ---------------------------------------- | 
----------------------------------------------- | ---------------- | ------------------------------ | ---------------------------- | +| Dockerfile-acropalypse.demo | 1.65 MiB | 1.89 MiB | | 1.89 MiB | 4.4 MiB | 3.94 MiB | +| Dockerfile-daedalus-pdf.demo | 4.15 MiB | 4.76 MiB | 17.83 MiB | 4.95 MiB | 17.62 MiB | 16.39 MiB | +| Dockerfile-ffmpeg.demo | 30.52 MiB | 33.80 MiB | | 33.64 MiB | 84.3 MiB | 84.72 MiB | +| Dockerfile-file.demo | 0.85 MiB | 0.95 MiB | | 0.96 MiB | 1.98 MiB | 1.99 MiB | +| Dockerfile-libjpeg.demo | 1.25 MiB | 1.36 MiB | | 1.36 MiB | 3.33 MiB | 3.62 MiB | +| Dockerfile-mupdf.demo | 14.56 MiB | 18.19 MiB | | 18.19 MiB | 66 MiB | 82.72 MiB | +| Dockerfile-nitro-nitf.demo | 5.79 MiB | 8.23 MiB | 20.64 MiB | 6.57 MiB | 20.62 MiB | 18 MiB | +| Dockerfile-openjpeg.demo | 0.89 MiB | 1.15 MiB | | 1.13 MiB | 4.29 MiB | 3.71 MiB | +| Dockerfile-poppler.demo `pdftops` | 8.82 MiB | 10.25 MiB | 35.58 MiB | 10.17 MiB | 35.77 MiB | 35.99 MiB | +| Dockerfile-poppler.demo `pdftotext` | 8.04 MiB | 9.29 MiB | 31.82 MiB | 9.26 MiB | 32.01 MiB | 32.09 MiB | +| Dockerfile-qpdf.demo | 10.92 MiB | 13.14 MiB | | 13.14 MiB | 49.21 MiB | 47.65 MiB | +| Dockerfile-xpdf.demo `pdfinfo` | 3.78 MiB | 4.56 MiB | 17.14 MiB | 4.37 MiB | 16.88 MiB | 17.80 MiB | +| Dockerfile-xpdf.demo `pdftops` | 4.75 MiB | 5.78 MiB | 22.52 MiB | 5.55 MiB | 22.25 MiB | 23.85 MiB | +| Dockerfile-xpdf.demo `pdftotext` | 3.98 MiB | 4.85 MiB | 18.67 MiB | 4.64 MiB | 18.41 MiB | 19.37 MiB | ## TDAG sizes + TDAG size is fixed because of how we write TDAGs right now; it didn't change. ## Total instrumentation time - "Instrumentation time" here refers either to the time Docker takes to run `polytracker instrument-targets`, which includes how long it takes to do both cflog and taint label instrumentation placement as well as executable creation, or the time to do equivalent steps. 
- - Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive instrumentation time for them on `master`. - - As measured by Docker: - -| Dockerfile | Instrumentation time (taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint) AFTER | Instrumentation time (taint only) AFTER | -| -- | -- | -- | -- | -- | -| Dockerfile-acropalypse.demo | 26.7\* s | | 30.3\* s | 27.3\* s | -| Dockerfile-daedalus-pdf.demo | 34.2 s | 39.1 s | 37.5 s | 35.2 s | -| Dockerfile-ffmpeg.demo | 150.7 s | | 156.5 s | 158.3 s | -| Dockerfile-file.demo | 12.1 s | | 12.4 s | 12.6 s | -| Dockerfile-libjpeg.demo | 22.7 s | | 21.2 s | 23.6 s | -| Dockerfile-mupdf.demo | 152.4 s | | 129.2 s | 154.8 s | -| Dockerfile-nitro-nitf.demo | 30 s | 33.7 s | 33.8 s | 29.5 s | -| Dockerfile-openjpeg.demo | 45.3\* s | | 51.3\* s | 49.6\* s | -| Dockerfile-poppler.demo `pdftops` | 291.2 s | 279.1 s | 290 s | 305.9 s | -| Dockerfile-poppler.demo `pdftotext` | 255.5 s | 249 s | 255.3 s | 268.5 s | -| Dockerfile-qpdf.demo | 382.9 s | | 393.8 s | 391.9 s | -| Dockerfile-xpdf.demo `pdfinfo` | 154.5 s | 141.9 s | 143.3 s | 164.2 s | -| Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | -| Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | + +"Instrumentation time" here refers either to the time Docker takes to run `polytracker instrument-targets`, which includes how long it takes to do both cflog and taint label instrumentation placement as well as executable creation, or the time to do equivalent steps. + +Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive instrumentation time for them on `master`. 
+ +As measured by Docker: + +| Dockerfile | Instrumentation time (taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint) AFTER | Instrumentation time (taint only) AFTER | +| ----------------------------------- | --------------------------------------------------- | ---------------------------------------------------------- | ----------------------------------------- | --------------------------------------- | +| Dockerfile-acropalypse.demo | 26.7\* s | | 30.3\* s | 27.3\* s | +| Dockerfile-daedalus-pdf.demo | 34.2 s | 39.1 s | 37.5 s | 35.2 s | +| Dockerfile-ffmpeg.demo | 150.7 s | | 156.5 s | 158.3 s | +| Dockerfile-file.demo | 12.1 s | | 12.4 s | 12.6 s | +| Dockerfile-libjpeg.demo | 22.7 s | | 21.2 s | 23.6 s | +| Dockerfile-mupdf.demo | 152.4 s | | 129.2 s | 154.8 s | +| Dockerfile-nitro-nitf.demo | 30 s | 33.7 s | 33.8 s | 29.5 s | +| Dockerfile-openjpeg.demo | 45.3\* s | | 51.3\* s | 49.6\* s | +| Dockerfile-poppler.demo `pdftops` | 291.2 s | 279.1 s | 290 s | 305.9 s | +| Dockerfile-poppler.demo `pdftotext` | 255.5 s | 249 s | 255.3 s | 268.5 s | +| Dockerfile-qpdf.demo | 382.9 s | | 393.8 s | 391.9 s | +| Dockerfile-xpdf.demo `pdfinfo` | 154.5 s | 141.9 s | 143.3 s | 164.2 s | +| Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | +| Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | # What's weird here + The sizes of bitcode when instrumented with all our passes before AND after these changes seem like they could be indicative of extra instrumentation (perhaps the labels pass instrumenting the cflog and/or functions pass?), though I haven't dug into whether this is truly happening yet. It doesn't _seem like_ this is exactly hurting anything at the moment, but I would be curious if others notice the same. 
## Notes From 39b593e132f868a8d25ad04f206d72bfd063ae6f Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 07:33:29 +0000 Subject: [PATCH 066/112] fixes 'Multiple top-level headings in the same document markdownlint/MD025' --- examples/analysis/timings.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md index 6218405e..bd69b16d 100644 --- a/examples/analysis/timings.md +++ b/examples/analysis/timings.md @@ -1,14 +1,14 @@ -# What did this change break? - -Hopefully nothing? :D - # What is this change doing? My goal is for taint tracking to work exactly as before, but to clean up the ftrace/cflog/events side of the house, unifying `--cflog` and `--ftrace` options (cleaning up / simplifying how we are writing to the Functions, Events, Control Flow Log, and String Table sections overall) so we don't add duplicate instrumentation to software or write duplicate data to the TDAG and/or separate files (i.e., functionid.json) anymore. Everything that I could build got run on example inputs to make sure it worked as expected. As a part of these changes we don't write to functionid.json anymore and just use the space we were allocating and not filling in in the tdag, since it's a humongous region we don't use all of anyway. TDAG size is fixed, but our usage of it is slightly more efficient currently. A future goal could be to only mmap the space we need so file size can be smaller. -# Instrumentation Time and Resulting Bitcode Sizes +## What did this change break? + +Hopefully nothing? 
:D + +## Instrumentation Time and Resulting Bitcode Sizes These experiments reproduce the measurements from the [PolyTracker paper](https://github.com/trailofbits/publications/blob/master/papers/issta24-polytracker.pdf), @@ -22,7 +22,7 @@ I'm comparing the before-and-after of the TDAG condensation changes on `kaoudis/ All the current example Dockerfiles on `master` that work right now (we/I need to clean up the others a bit; they're a bit bitrotted) are included here for completeness. The following measurements aren't terribly scientific, they are from one run of the Dockerfile each (whereas for the paper I averaged ten runs apiece). -## Bitcode sizes +### Bitcode sizes The "in" .bc file is the whole-program .bc file that gets the first layer of instrumentation applied to it. The CFlog .bc is the "in" .bc with CFlog instrumentation, pre-optimization (if optimization occurs in the PolyTracker build). the final .bc file is the instrumented .bc file ending in `.instrumented.bc` that we lower to an executable. bc size may have changed because what instrumentation we use changed: I removed the separate function name recording / events pass-level code, and added function name recording to the tdag into the cflog pass. I also removed the separate `--ftrace` and `--taint` options: we do `--taint` by default, and `--ftrace` is part of `--cflog` now. @@ -47,11 +47,11 @@ As measured by `ls -lb` in the container, and normalized into MiB: | Dockerfile-xpdf.demo `pdftops` | 4.75 MiB | 5.78 MiB | 22.52 MiB | 5.55 MiB | 22.25 MiB | 23.85 MiB | | Dockerfile-xpdf.demo `pdftotext` | 3.98 MiB | 4.85 MiB | 18.67 MiB | 4.64 MiB | 18.41 MiB | 19.37 MiB | -## TDAG sizes +### TDAG sizes TDAG size is fixed because of how we write TDAGs right now; it didn't change. 
-## Total instrumentation time +### Total instrumentation time "Instrumentation time" here refers either to the time Docker takes to run `polytracker instrument-targets`, which includes how long it takes to do both cflog and taint label instrumentation placement as well as executable creation, or the time to do equivalent steps. @@ -76,7 +76,7 @@ As measured by Docker: | Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | | Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | -# What's weird here +## What's weird here The sizes of bitcode when instrumented with all our passes before AND after these changes seem like they could be indicative of extra instrumentation (perhaps the labels pass instrumenting the cflog and/or functions pass?), though I haven't dug into whether this is truly happening yet. It doesn't _seem like_ this is exactly hurting anything at the moment, but I would be curious if others notice the same. From 683083ab4a28865a5b05c397d939ab62b88835ca Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 09:35:05 +0000 Subject: [PATCH 067/112] turn off some of the linter crud --- .flake8 | 3 +++ .trunk/.gitignore | 1 + .trunk/trunk.yaml | 58 +++++++++++++++++++++++++++++++---------------- 3 files changed, 42 insertions(+), 20 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..0037e615 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +# Autoformatter friendly flake8 config (all formatting rules disabled) +[flake8] +extend-ignore = D1, D2, E1, E2, E3, E501, W1, W2, W3, W5 \ No newline at end of file diff --git a/.trunk/.gitignore b/.trunk/.gitignore index 1e246529..15966d08 100644 --- a/.trunk/.gitignore +++ b/.trunk/.gitignore @@ -6,3 +6,4 @@ plugins user_trunk.yaml user.yaml +tmp diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index ca85e5b9..ef030898 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -1,6 +1,6 @@ version: 0.1 cli: - 
version: 1.14.1 + version: 1.22.11 plugins: sources: - id: trunk @@ -16,29 +16,47 @@ lint: - linters: [ALL] paths: - polytracker/src/compiler-rt/** + - third_party/** + - linters: [bandit] + paths: + - tests/** + definitions: + - name: flake8 + direct_configs: + - .flake8 + - name: ruff + direct_configs: + - .ruff.toml + - name: mypy + direct_configs: + - .mypy.ini + - name: clang-format + direct_configs: + - .clang-format + enabled: - - taplo@0.8.1 - - actionlint@1.6.25 - - bandit@1.7.5 - - black@23.7.0 - - checkov@2.4.5 + - taplo@0.9.3 + - actionlint@1.7.7 + - bandit@1.8.3 + - black@25.1.0 + - checkov@3.2.390 - clang-format@16.0.3 - - flake8@6.1.0 + - flake8@7.1.2 - git-diff-check - - hadolint@2.12.0 - - isort@5.12.0 - - markdownlint@0.35.0 - - mypy@1.5.1 - - oxipng@8.0.0 - - prettier@3.0.2 - - ruff@0.0.285 - - shellcheck@0.9.0 + - hadolint@2.12.1-beta + - isort@6.0.1 + - markdownlint@0.44.0 + - mypy@1.15.0 + - oxipng@9.1.4 + - prettier@3.5.3 + - ruff@0.11.1 + - shellcheck@0.10.0 - shfmt@3.6.0 - - svgo@3.0.2 - - terrascan@1.18.3 - - trivy@0.44.1 - - trufflehog@3.48.0 - - yamllint@1.32.0 + - svgo@3.3.2 + - terrascan@1.19.1 + - trivy@0.60.0 + - trufflehog@3.88.18 + - yamllint@1.36.2 actions: disabled: - trunk-announce From 1d0268613fb0b2f634cdce2cd23e0350aa92404f Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:05:32 +0000 Subject: [PATCH 068/112] apparently ruff.toml wasn't being found --- .ruff.toml => ruff.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .ruff.toml => ruff.toml (100%) diff --git a/.ruff.toml b/ruff.toml similarity index 100% rename from .ruff.toml rename to ruff.toml From 2cffa2db21aee800e69a354f76d486eea228630c Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:06:03 +0000 Subject: [PATCH 069/112] fix trunk linter issues with the build workflow, mainly bumps dependency versions and makes permissions read-all --- .github/workflows/build.yml | 16 ++++++++-------- 1 file changed, 8 
insertions(+), 8 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f3c729da..70f5a7b0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ name: Build - +permissions: read-all on: workflow_dispatch: {} push: branches: - - "master" + - master tags: - "*" @@ -21,16 +21,16 @@ jobs: steps: - name: Clone polytracker repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Build and export - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . tags: trailofbits/polytracker:latest @@ -79,7 +79,7 @@ jobs: steps: - name: Clone polytracker repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -94,10 +94,10 @@ jobs: run: docker load --input /tmp/polytracker.tar - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Build ${{ matrix.example }} - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: examples file: examples/Dockerfile-${{ matrix.example }}.demo From f191df911ccbf2f02cb80c8ed8dbc8f90a749359 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:06:40 +0000 Subject: [PATCH 070/112] shushes some more linter crud and bumps the go and node versions, removes my attempt at setting the config files manually --- .trunk/trunk.yaml | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index ef030898..a962cf27 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -4,14 +4,17 @@ cli: plugins: sources: - id: trunk - ref: v1.2.1 + ref: v1.6.7 uri: https://github.com/trunk-io/plugins runtimes: enabled: - - go@1.19.5 - - node@18.12.1 + - go@1.21.0 + - node@18.20.5 - python@3.10.8 
lint: + threshold: + - linters: [ALL] + level: high ignore: - linters: [ALL] paths: @@ -24,15 +27,6 @@ lint: - name: flake8 direct_configs: - .flake8 - - name: ruff - direct_configs: - - .ruff.toml - - name: mypy - direct_configs: - - .mypy.ini - - name: clang-format - direct_configs: - - .clang-format enabled: - taplo@0.9.3 From d20b845bab4e91a01fddef1284715cba9a7a02b0 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:07:00 +0000 Subject: [PATCH 071/112] shush the linter --- examples/analysis/ubet/eval_nitro.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index 062052f8..8652e745 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -1,6 +1,6 @@ import argparse import os -import subprocess +import subprocess # nosec B404 import sys from collections import defaultdict from functools import partialmethod @@ -312,9 +312,11 @@ def get_cflog_entires(tdfile, is_debug): print_cols( str(dbg_entry[0]), str(rel_entry[0]), - f" !!! DBG: {dbg_callstack} != REL: {rel_callstack}" - if dbg_callstack != rel_callstack - else "", + ( + f" !!! 
DBG: {dbg_callstack} != REL: {rel_callstack}" + if dbg_callstack != rel_callstack + else "" + ), ) dbgidx += 1 relidx += 1 From a9cafd0ae0906cff6b0cd41f5d717e54b25a9ae7 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:07:17 +0000 Subject: [PATCH 072/112] shush the linter --- polytracker/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/build.py b/polytracker/build.py index 72cdb9f4..af5b2771 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -1,7 +1,7 @@ import argparse import json import os -import subprocess +import subprocess # nosec B404 from pathlib import Path from typing import Dict, List, Tuple From 5bf05deb8e80d82184be9c114e81d6af968be885 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:09:35 +0000 Subject: [PATCH 073/112] use squashing lint bugs as an excuse to throw descriptive errors instead of the asserts which bandit as run by trunk gets mad about --- polytracker/taint_dag.py | 121 +++++++++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 38 deletions(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index f6589fae..970ba3b3 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -97,7 +97,9 @@ def enumerate(self): @deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") class TDEvent(Structure): - """This is an old version of the ControlFlowEvent kept for backward compatibility only""" + """TDEvent is an old version of the ControlFlowEvent kept for backward + compatibility only. 
+ """ _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] @@ -109,12 +111,14 @@ def __repr__(self) -> str: return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" -@deprecated("Use TDControlFlowLog instead, TDEvents section is no longer written") +@deprecated("Use TDControlFlowLog; the TDEvents section is no longer written") class TDEventsSection: - """This is an old version of the CFLog kept for backward compatibility only""" + """TDEventsSection is an old version of the CFLog kept for backward + compatibility only. + """ def __init__(self, mem, hdr): - self.section = mem[hdr.offset : hdr.offset + hdr.size] + self.section = mem[hdr.offset : hdr.offset + hdr.size] # nosec E203 def __iter__(self): for offset in range(0, len(self.section), sizeof(TDEvent)): @@ -131,7 +135,8 @@ class TDStringSection: - source names - function names - additional label metadata - Check usages of StringTableBase in the C++ ("write side") part of the codebase. + Check usages of StringTableBase in the C++ ("write side") part of the + codebase. 
""" def __init__(self, mem, hdr): @@ -140,7 +145,11 @@ def __init__(self, mem, hdr): def read_string(self, offset): n = c_uint16.from_buffer_copy(self.section[offset:]).value - assert len(self.section) >= offset + sizeof(c_uint16) + n + if not (len(self.section) >= offset + sizeof(c_uint16) + n): + raise AssertionError( + """Section out of alignment with c_uint16 + so string could not be read""" + ) return str( self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + n], "utf-8", @@ -206,16 +215,18 @@ class TaintedControlFlowEvent: Current callstack (including the function the control flow happened in) is available in the `callstack` member.""" - def __init__(self, callstack: List, label: int = None): + def __init__(self, callstack: List, label: Optional[int] = None): self.callstack = callstack self.label = label def __repr__(self) -> str: return f"TaintedControlFlowEvent: taint label {self.label} | {self.callstack}" - def __eq__(self, __o: object) -> bool: - if isinstance(__o, TaintedControlFlowEvent): - return self.label == __o.label and self.callstack == __o.callstack + def __eq__(self, other) -> bool: + if isinstance(other, TaintedControlFlowEvent) and self.label is not None: + return self.label == other.label and self.callstack == other.callstack + elif self.label is None and other.label is None: + return self.callstack == other.callstack return False @@ -262,7 +273,7 @@ def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] def __iter__(self) -> Iterator[ControlFlowEvent]: - """Produces the cflog entries in order from the mmapped buffer.""" + """Produce the cflog entries in order from the mmapped buffer.""" buffer = self.section callstack = [] while buffer: @@ -326,7 +337,12 @@ class TDBitmapSection: def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] - assert len(self.section) % 8 == 0 # Multiple of uint64_t + if len(self.section) % 8 != 0: + # Multiple of uint64_t + raise 
AssertionError( + """Bitmap Section out of alignment and + cannot be read""" + ) def enumerate_set_bits(self): """Enumerates all bits that are set @@ -357,9 +373,16 @@ def __init__(self, mem, hdr): class TDFunctionsSection: - """This section holds the mapping between the function IDs stored in callstack form in the cflog section, and the function names stored in the string table. See fnmapping in the C++ part of the codebase for the "write" side part of Polytracker that pertains to this section. Each entry is an uint32_t as set in fnmapping.cpp, but a TDFnHeader will then contain *two* of these: the function_id and the name_offset. - - Structure in memory: |offset|function id|...""" + """TDFunctionsSection holds the mapping between the function IDs + stored in callstack form in the cflog section, and the function + names stored in the string table. See fnmapping in the C++ part + of the codebase for the "write" side part of Polytracker that + pertains to this section. Each entry is an uint32_t as set in + fnmapping.cpp, but a TDFnHeader will then contain *two* of these: + the function_id and the name_offset. + + Structure in memory: |offset|function id|... + """ def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] @@ -517,21 +540,27 @@ def __init__(self, file: BinaryIO) -> None: def read_fd_headers(self) -> Iterator[Tuple[Path, TDFDHeader]]: sources = self.sections_by_type[TDSourceSection] + if not isinstance(sources, TDSourceSection): + raise AssertionError("Sources Section could not be read") strings = self.sections_by_type[TDStringSection] - assert isinstance(sources, TDSourceSection) - assert isinstance(strings, TDStringSection) + if not isinstance(strings, TDStringSection): + raise AssertionError("Strings Table could not be read") for source in sources.enumerate(): yield Path(strings.read_string(source.name_offset)), source @property def mangled_fn_symbol_lookup(self) -> Dict[int, str]: - """Unordered! 
map of dynamically observed function IDs to clang symbols. You can demangle the symbols with cxxfilt.demangle.""" + """Unordered! map of dynamically observed function IDs to clang + symbols. You can demangle the symbols with cxxfilt.demangle. + """ lookup = {} functions = self.sections_by_type[TDFunctionsSection] - assert isinstance(functions, TDFunctionsSection) + if not isinstance(functions, TDFunctionsSection): + raise AssertionError("Functions Section could not be read") strings = self.sections_by_type[TDStringSection] - assert isinstance(strings, TDStringSection) + if not isinstance(strings, TDStringSection): + raise AssertionError("String Table could not be read") for entry in functions: lookup[entry.function_id] = strings.read_string(entry.name_offset) @@ -539,17 +568,24 @@ def mangled_fn_symbol_lookup(self) -> Dict[int, str]: return lookup def _maybe_demangle(self, function_id: int) -> Union[str, int]: - """Depending on the age of the tdag, it may not contain a function mapping. If the tdag doesn't contain a function mapping, this will only return function ids and you'll need to manually map them against symbols gathered statically from the compiled instrumented binary.""" + """Depending on the age of the tdag, it may not contain a function + mapping. If the tdag doesn't contain a function mapping, this will + only return function ids and you'll need to manually map them against + symbols gathered statically from the compiled instrumented binary. + """ maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) if maybe_symbol is not None: return demangle(maybe_symbol) else: return function_id - def cflog(self, demangle_symbols: bool = False) -> Iterator[ControlFlowEvent]: - """Presents the control flow log. Does not demangle symbols by default, for performance.""" + def cflog(self, demangle_symbols=False) -> Iterator[ControlFlowEvent]: + """Presents the control flow log. Does not demangle symbols by default, + for performance. 
+ """ cflog_section = self.sections_by_type[TDControlFlowLogSection] - assert isinstance(cflog_section, TDControlFlowLogSection) + if not isinstance(cflog_section, TDControlFlowLogSection): + raise AssertionError("CFLog section not correctly read from TDAG?") if demangle_symbols: for cflog_entry in cflog_section: @@ -560,25 +596,28 @@ def cflog(self, demangle_symbols: bool = False) -> Iterator[ControlFlowEvent]: yield cflog_entry else: - cflog_section() + cflog_section.__iter__() def input_labels(self) -> Iterator[int]: """Enumerates all taint labels that are input labels (source taint)""" source_index_section = self.sections_by_type[TDSourceIndexSection] - assert isinstance(source_index_section, TDSourceIndexSection) + if not isinstance(source_index_section, TDSourceIndexSection): + raise AssertionError("Source Index not correctly read from TDAG?") return source_index_section.enumerate_set_bits() @property def label_count(self): label_section = self.sections_by_type[TDLabelSection] - assert isinstance(label_section, TDLabelSection) + if not isinstance(label_section, TDLabelSection): + raise AssertionError("Could not read Label Section from TDAG?") return label_section.count() def read_node(self, label: int) -> int: if label in self.raw_nodes: return self.raw_nodes[label] label_section = self.sections_by_type[TDLabelSection] - assert isinstance(label_section, TDLabelSection) + if not isinstance(label_section, TDLabelSection): + raise AssertionError("Could not read Label Section from TDAG?") result = label_section.read_raw(label) self.raw_nodes[label] = result @@ -614,7 +653,8 @@ def nodes(self) -> Iterator[TDNode]: @property def sinks(self) -> Iterator[TDSink]: sink_section = self.sections_by_type[TDSinkSection] - assert isinstance(sink_section, TDSinkSection) + if not isinstance(sink_section, TDSinkSection): + raise AssertionError("Could not read Sink Section from TDAG?") yield from sink_section.enumerate() @@ -652,9 +692,14 @@ def basic_blocks(self) -> 
Iterable[BasicBlock]: raise NotImplementedError() def file_offset(self, node: TaintForestNode) -> ByteOffset: - assert node.source is not None + if node.source is None: + raise AssertionError( + """ + No source could be found from which offset could be calculated""" + ) tdnode: TDNode = self.tdfile.decode_node(node.label) - assert isinstance(tdnode, TDSourceNode) + if not isinstance(tdnode, TDSourceNode): + raise AssertionError("Source Node could not be decoded") return ByteOffset(node.source, tdnode.offset) @property @@ -695,7 +740,8 @@ def inputs(self) -> Iterator[Input]: seen: Set[int] = set() for source_label in self.tdfile.input_labels(): source_node = self.tdfile.decode_node(source_label) - assert isinstance(source_node, TDSourceNode) + if not isinstance(source_node, TDSourceNode): + raise AssertionError("Source Node could not be decoded?") if source_node.idx not in seen: path, fd_header = self.tdfile.fd_headers[source_node.idx] yield Input(fd_header.fd, str(path), fd_header.size) @@ -832,10 +878,11 @@ def create_node(self, label: int) -> TDTaintForestNode: (curr, node.last), ) - assert False + raise AssertionError("TDTaintForestNode could not be created") def get_node(self, label: int, source: Optional[Input] = None) -> TDTaintForestNode: - assert source is None + if source is not None: + raise AssertionError("Node could not be retrieved from label") if self.node_cache[label] is not None: return cast(TDTaintForestNode, self.node_cache[label]) @@ -918,7 +965,7 @@ def run(self, args): for k, v in tdfile.mangled_fn_symbol_lookup: print(f"function_id '{k}': function '{demangle(v)}'") else: - print("Error: no Functions section could be read from the tdag!") + print("Error: no Functions section was read from tdag!") print(f"Sections that could be read: {tdfile.sections}") if args.print_control_flow_log: @@ -926,7 +973,5 @@ def run(self, args): for event in tdfile.cflog(demangle_symbols=True): print(event) else: - print( - "Error: no Control Flow Log section could 
be read from the tdag!" - ) + print("Error: no Control Flow Log section read from tdag!") print(f"Sections that could be read: {tdfile.sections}") From 21f4eef93faa2379fe8a0b094a2b5d3902ce68a9 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:10:07 +0000 Subject: [PATCH 074/112] use correct comparison for types and squash some function-level type lint errors --- tests/test_cf_log.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index d67b6d22..39b6e03f 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -6,7 +6,6 @@ import pytest import polytracker -from polytracker import ProgramTrace from polytracker.taint_dag import ( CFEnterFunctionEvent, CFLeaveFunctionEvent, @@ -18,7 +17,7 @@ @pytest.mark.program_trace("test_fntrace.cpp") -def test_function_mapping(program_trace: ProgramTrace): +def test_function_mapping(program_trace) -> None: mangled_symbols = list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) assert mangled_symbols == ["main", "_Z9factoriali"] @@ -28,7 +27,7 @@ def test_function_mapping(program_trace: ProgramTrace): @pytest.mark.program_trace("test_fntrace.cpp") -def test_callstack_mapping(program_trace: ProgramTrace): +def test_callstack_mapping(program_trace) -> None: cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ TDControlFlowLogSection ] @@ -42,13 +41,13 @@ def test_callstack_mapping(program_trace: ProgramTrace): @pytest.mark.program_trace("test_fntrace.cpp") -def test_label_mapping(program_trace: ProgramTrace): +def test_label_mapping(program_trace) -> None: cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ TDControlFlowLogSection ] for cflog_entry in cflog: - if type(cflog_entry) == TaintedControlFlowEvent: + if type(cflog_entry) is TaintedControlFlowEvent: assert hasattr(cflog_entry, "label") node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) assert 
node.affects_control_flow @@ -57,7 +56,7 @@ def test_label_mapping(program_trace: ProgramTrace): @pytest.mark.program_trace("test_cf_log.cpp") -def test_cf_log(instrumented_binary: Path, trace_file: Path): +def test_cf_log(instrumented_binary: Path, trace_file: Path) -> None: """Demonstrates how the cflog should work end to end, integrated with the fn mapping and the function symbols from the strings table.""" # Data to write to stdin, one byte at a time stdin_data = "abcdefgh" @@ -98,7 +97,7 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): for got, expected in zip(cflog, expected_seq): assert got == expected - if type(got) == TaintedControlFlowEvent: + if type(got) is TaintedControlFlowEvent: assert got.label is not None assert len(got.callstack) > 0 From fc691426c3453d8050bae0ab61fe58b5c6bfc481 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 11:10:27 +0000 Subject: [PATCH 075/112] linters, hush --- tests/test_stdin.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/test_stdin.py b/tests/test_stdin.py index f117055c..38c3b1d0 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -1,17 +1,17 @@ -import subprocess +import subprocess # nosec B404 from pathlib import Path from random import choice -from string import printable +from string import printable as chars import pytest import polytracker -from polytracker import taint_dag # Ensure stdin reads in multiple ways are verified -# examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw +# examples: getc, fgetc, fread, fread_unlocked, +# fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw -_stdin_data = "\n".join(choice(printable) for _ in range(40)).encode("utf-8") +_stdin_data = "\n".join(choice(chars) for _ in range(40)) # nosec B311 def _create_tdag_trace( @@ -21,11 +21,11 @@ def _create_tdag_trace( out DRY from the test framework so it's easy to see when an individual test 
fails.""" # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode - subprocess.run( + subprocess.run( # nosec B603 args=[str(instrumented_binary), method], env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1"}, stderr=subprocess.STDOUT, - input=_stdin_data, + input=_stdin_data.encode("utf-8"), close_fds=False, ).check_returncode() @@ -34,9 +34,7 @@ def _test_trace(trace_file: Path) -> None: """Test the tdag output, checking its inputs to make sure we tainted and tracked every byte of stdin. Offsets must be ordered as they were read.""" - program_trace: taint_dag.TDProgramTrace = polytracker.PolyTrackerTrace.load( - trace_file - ) + program_trace = polytracker.PolyTrackerTrace.load(trace_file) assert "/dev/stdin" in [input.path for input in program_trace.inputs] expected_offset = 0 From dbd12a71570d976c50d4b2cb3f91426455201cd6 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 14:50:12 +0000 Subject: [PATCH 076/112] fixes the running of the c++ tests at least locally in act, maybe fixes it on the runner too --- .github/workflows/build.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 70f5a7b0..b08d3b52 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -61,7 +61,11 @@ jobs: uses: addnab/docker-run-action@v3 with: image: trailofbits/polytracker:latest - run: ./polytracker-build/unittests/src/taintdag/tests-taintdag + run: | + cd /polytracker-build/unittests/src/taintdag && \ + pwd && \ + chmod +x tests-taintdag && \ + ./tests-taintdag - name: Python (integration) tests uses: addnab/docker-run-action@v3 From 47a03812b660bb66012a9b8aeb11eccd458a8e5b Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 15:01:45 +0000 Subject: [PATCH 077/112] don't need to grant top level read permissions --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/.github/workflows/build.yml b/.github/workflows/build.yml index b08d3b52..d5c47b07 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,5 +1,5 @@ name: Build -permissions: read-all +permissions: {} on: workflow_dispatch: {} From c828f9c3d60367505afe1f3aca8736e898d96a97 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 15:02:27 +0000 Subject: [PATCH 078/112] trunk fmt --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d5c47b07..8b2651cb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -62,10 +62,10 @@ jobs: with: image: trailofbits/polytracker:latest run: | - cd /polytracker-build/unittests/src/taintdag && \ - pwd && \ - chmod +x tests-taintdag && \ - ./tests-taintdag + cd /polytracker-build/unittests/src/taintdag && \ + pwd && \ + chmod +x tests-taintdag && \ + ./tests-taintdag - name: Python (integration) tests uses: addnab/docker-run-action@v3 From e485ef1d2f5c9e406a87c8c6dc26961b5d295545 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Tue, 25 Mar 2025 16:02:37 +0000 Subject: [PATCH 079/112] use working source loc for poppler --- examples/Dockerfile-poppler.demo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index 1f964559..3771c7ce 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -3,7 +3,7 @@ FROM ubuntu:focal AS poppler-sources WORKDIR /polytracker/the_klondike ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y git -RUN git clone --depth=1 --branch poppler-23.06.0 https://anongit.freedesktop.org/git/poppler/poppler.git +RUN git clone --depth=1 --branch poppler-23.06.0 https://gitlab.freedesktop.org/poppler/poppler.git # Now, build the qpdf image using previously downloaded source FROM 
trailofbits/polytracker:latest From f67d249f7598dad576c12be85867c5150dc8d1e6 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Wed, 2 Apr 2025 16:21:33 +0000 Subject: [PATCH 080/112] progress toward fixing the vector type errors - and making things a bit more comprehensive within the bounds of llvm 12 --- docs/tdag.md | 2 +- .../polytracker/passes/taint_tracking.h | 3 + .../polytracker/passes/tainted_control_flow.h | 2 + polytracker/src/passes/taint_tracking.cpp | 16 +++ .../src/passes/tainted_control_flow.cpp | 104 +++++++++++------- 5 files changed, 87 insertions(+), 40 deletions(-) diff --git a/docs/tdag.md b/docs/tdag.md index 9ba2c0a2..26705c2b 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -37,7 +37,7 @@ Some specifics: - [Sources](../polytracker/src/taint_sources/taint_sources.cpp) contains source labels (byte offsets into the input) - The Source Label Index is a bitmap that defines how to index the sources section. - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) -- [Strings](../polytracker/include/taintdag/string_table.h) the string table contains things like names of sources, the function names used with the functions section to map to the control flow log, etc. todo(kaoudis) in future this could probably be less of a catchall for 'all things stringly' and those things could have separate lookups or be stored inline if we're gonna mmap so much space anyway. +- [Strings](../polytracker/include/taintdag/string_table.h) the string table is a section with two current purposes: it contains names of sources, and also the function names used with the functions section to map to the control flow log. Its design is general purpose: it can store any type of stringly data we eventually decide to store. - [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. 
see tests/test_cf_log.py for how this layout looks/works. - [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow ("tainted control flow" or "control affecting data flow" are ways that we have referred to this subset of data flow) passed through, plus recorded function ids that can be mapped back through the Functions section to the String Table to get mangled symbols. see test_cf_log.py and unittests/src/taintdag/tdag.cpp for details of how this looks/works. diff --git a/polytracker/include/polytracker/passes/taint_tracking.h b/polytracker/include/polytracker/passes/taint_tracking.h index 6ff676d6..2d66f5cd 100644 --- a/polytracker/include/polytracker/passes/taint_tracking.h +++ b/polytracker/include/polytracker/passes/taint_tracking.h @@ -32,6 +32,9 @@ class TaintTrackingPass : public llvm::PassInfoMixin, void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); + void visitSelectInst(llvm::SelectInst &si); + void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + void visitInvokeInst(llvm::InvokeInst &ii); }; } // namespace polytracker \ No newline at end of file diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index ddbd4c10..791a4cc7 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -45,6 +45,8 @@ class TaintedControlFlowPass void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); void visitSelectInst(llvm::SelectInst &si); + void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + void visitInvokeInst(llvm::InvokeInst &ii); void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst &ri); diff --git 
a/polytracker/src/passes/taint_tracking.cpp b/polytracker/src/passes/taint_tracking.cpp index 266cfa71..5dbd3a34 100644 --- a/polytracker/src/passes/taint_tracking.cpp +++ b/polytracker/src/passes/taint_tracking.cpp @@ -104,6 +104,22 @@ void TaintTrackingPass::visitSwitchInst(llvm::SwitchInst &si) { insertCondBrLogCall(si, si.getCondition()); } +void TaintTrackingPass::visitSelectInst(llvm::SelectInst &si) { + insertCondBrLogCall(si, si.getCondition()); +} + +void TaintTrackingPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { + insertCondBrLogCall(ibi, ibi.getAddress()); +} + +void TaintTrackingPass::visitInvokeInst(llvm::InvokeInst &ii) { + // Track taint on the function pointer for indirect calls + auto func = ii.getCalledFunction(); + if (!func) { + insertCondBrLogCall(ii, ii.getCalledOperand()); + } +} + void TaintTrackingPass::declareLoggingFunctions(llvm::Module &mod) { llvm::IRBuilder<> ir(mod.getContext()); taint_start_fn = mod.getOrInsertFunction("__taint_start", ir.getVoidTy()); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index e4336697..33b57a17 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -38,16 +38,6 @@ uint32_t get_or_add_mapping(uintptr_t key, } } // namespace -void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); - } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); -} - llvm::ConstantInt * TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { auto func_address = reinterpret_cast(&func); @@ -61,23 +51,24 @@ TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { } void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { + 
llvm::GetElementPtrInst &gep) { llvm::IRBuilder<> ir(&gep); + // if an index is a constant, skip it for (auto &idx : gep.indices()) { if (llvm::isa(idx)) { continue; } - // we do not handle VectorTypes yet - if ((*(idx->getType())).isVectorTy()) { - continue; + // for now, taint only the first element of any vector index! + // this is an improvement over skipping vector types as we previously did + if (llvm::isa(idx->getType())) { + auto firstVectorElem = ir.CreateExtractElement(idx, ir.getInt32(0)); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(gep)}); + } else { + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), get_function_id_const(gep)}); } - - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); - - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); } } @@ -88,38 +79,73 @@ void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { llvm::IRBuilder<> ir(&bi); auto cond = bi.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + + // just taint the first element of any vector type for now. + // this is an improvement over skipping vector types as we previously did! 
+ if (llvm::isa(cond->getType())) { + auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(bi)}); + } else { + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + } } void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { llvm::IRBuilder<> ir(&si); auto cond = si.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + // just taint the first element of any vector type for now. + // this is an improvement over skipping vector types as we previously did! + if (llvm::isa(cond->getType())) { + auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(si)}); + } else { + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + } } void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. - if (si.getType()->isVectorTy()) { - return; - } llvm::IRBuilder<> ir(&si); auto cond = si.getCondition(); - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + // just taint the first element of any vector type for now. + // this is an improvement over skipping vector types as we previously did! 
+ if (llvm::isa(cond->getType())) { + auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(si)}); + } else { + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + } +} + +void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { + llvm::IRBuilder<> ir(&ibi); + auto addr = ibi.getAddress(); + + if (llvm::isa(addr->getType())) { + auto firstVectorElem = ir.CreateExtractElement(addr, ir.getInt32(0)); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(ibi)}); + } else { + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(addr, ir.getInt64Ty()), get_function_id_const(ibi)}); + } +} - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { + llvm::IRBuilder<> ir(&ii); + auto func = ii.getCalledFunction(); + + // Log the function entry with the function ID + ir.CreateCall(fn_enter_log_fn, + {get_function_id_const(ii), + ir.CreateGlobalStringPtr(func ? 
func->getName() : "indirect")}); } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { From 2406a5aec9f453cb0f2d694ac1f61c8e42d8d8d3 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 11:38:05 +0000 Subject: [PATCH 081/112] serves me right for letting cursor do stuff and not thinking about it --- polytracker/src/passes/taint_tracking.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/polytracker/src/passes/taint_tracking.cpp b/polytracker/src/passes/taint_tracking.cpp index 5dbd3a34..0ba76572 100644 --- a/polytracker/src/passes/taint_tracking.cpp +++ b/polytracker/src/passes/taint_tracking.cpp @@ -73,9 +73,11 @@ void TaintTrackingPass::insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val) { llvm::IRBuilder<> ir(&inst); auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); } @@ -113,11 +115,7 @@ void TaintTrackingPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { } void TaintTrackingPass::visitInvokeInst(llvm::InvokeInst &ii) { - // Track taint on the function pointer for indirect calls - auto func = ii.getCalledFunction(); - if (!func) { - insertCondBrLogCall(ii, ii.getCalledOperand()); - } + insertCondBrLogCall(ii, ii.getCalledOperand()); } void TaintTrackingPass::declareLoggingFunctions(llvm::Module &mod) { From 641a620e40933e42bcd2280311d917fcf5903205 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 11:38:47 +0000 Subject: [PATCH 082/112] try turning off qt6 tests too? --- examples/Dockerfile-poppler.demo | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index 3771c7ce..157b8da4 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -21,6 +21,7 @@ RUN polytracker build cmake -S . 
-B build \ -DBUILD_SHARED_LIBS=OFF \ -DBUILD_GTK_TESTS=OFF \ -DBUILD_QT5_TESTS=OFF \ + -DBUILD_QT6_TESTS=OFF \ -DBUILD_CPP_TESTS=OFF \ -DENABLE_BOOST=OFF \ -DENABLE_CPP=OFF \ From 4d64bde1c4b653660f1a7e9434b2ad8bf448ddc5 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 11:38:56 +0000 Subject: [PATCH 083/112] comment --- polytracker/include/polytracker/passes/taint_tracking.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/include/polytracker/passes/taint_tracking.h b/polytracker/include/polytracker/passes/taint_tracking.h index 2d66f5cd..e2f156cc 100644 --- a/polytracker/include/polytracker/passes/taint_tracking.h +++ b/polytracker/include/polytracker/passes/taint_tracking.h @@ -15,7 +15,7 @@ namespace polytracker { class TaintTrackingPass : public llvm::PassInfoMixin, public llvm::InstVisitor { - // + // represents the taint label type llvm::IntegerType *label_ty{nullptr}; // Taint tracking startup llvm::FunctionCallee taint_start_fn; From 8c6e5c717b40f8ec4321d28ca17aa48bafba025a Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 11:39:36 +0000 Subject: [PATCH 084/112] try refining the tainted control flow pass in the same fashion as the taint tracking pass, to see if changing stuff in only one place for simplicity of debugging is now possible... 
--- .../polytracker/passes/tainted_control_flow.h | 7 +- .../src/passes/tainted_control_flow.cpp | 91 +++++-------------- 2 files changed, 27 insertions(+), 71 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 791a4cc7..9369f246 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -17,10 +17,8 @@ namespace polytracker { class TaintedControlFlowPass : public llvm::PassInfoMixin, public llvm::InstVisitor { - // + // represents the taint label type llvm::IntegerType *label_ty{nullptr}; - // Taint tracking startup - llvm::FunctionCallee taint_start_fn; // Log taint label affecting control flow llvm::FunctionCallee cond_br_log_fn; // Log enter/leave functions @@ -29,8 +27,7 @@ class TaintedControlFlowPass llvm::FunctionCallee fn_leave_log_fn; // Helpers - void insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val); - void insertTaintStartupCall(llvm::Module &mod); + void insertInstrumentation(llvm::Instruction &inst, llvm::Value *val); void declareLoggingFunctions(llvm::Module &mod); llvm::ConstantInt *get_function_id_const(llvm::Function &f); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 33b57a17..3887e4b3 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -50,25 +50,32 @@ TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { return get_function_id_const(*(i.getParent()->getParent())); } +void +TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + + if (llvm::isa(val->getType())) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + } + + // Log the type of dummy_val before the call + spdlog::debug("HIIIIII 
dummy_val type: {}", dummy_val->getType()->getTypeID()); + + // logs the label and the function id at this point; + // data flow has affected control flow here. + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); +} + void TaintedControlFlowPass::visitGetElementPtrInst( llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); // if an index is a constant, skip it for (auto &idx : gep.indices()) { if (llvm::isa(idx)) { continue; } - - // for now, taint only the first element of any vector index! - // this is an improvement over skipping vector types as we previously did - if (llvm::isa(idx->getType())) { - auto firstVectorElem = ir.CreateExtractElement(idx, ir.getInt32(0)); - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(gep)}); - } else { - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), get_function_id_const(gep)}); - } + insertInstrumentation(gep, idx); } } @@ -76,76 +83,28 @@ void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { if (bi.isUnconditional()) { return; } - - llvm::IRBuilder<> ir(&bi); auto cond = bi.getCondition(); - - // just taint the first element of any vector type for now. - // this is an improvement over skipping vector types as we previously did! - if (llvm::isa(cond->getType())) { - auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(bi)}); - } else { - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - } + insertInstrumentation(bi, cond); } void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); auto cond = si.getCondition(); - - // just taint the first element of any vector type for now. 
- // this is an improvement over skipping vector types as we previously did! - if (llvm::isa(cond->getType())) { - auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(si)}); - } else { - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - } + insertInstrumentation(si, cond); } void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - llvm::IRBuilder<> ir(&si); auto cond = si.getCondition(); - - // just taint the first element of any vector type for now. - // this is an improvement over skipping vector types as we previously did! - if (llvm::isa(cond->getType())) { - auto firstVectorElem = ir.CreateExtractElement(cond, ir.getInt32(0)); - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(si)}); - } else { - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - } + insertInstrumentation(si, cond); } void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { - llvm::IRBuilder<> ir(&ibi); auto addr = ibi.getAddress(); - - if (llvm::isa(addr->getType())) { - auto firstVectorElem = ir.CreateExtractElement(addr, ir.getInt32(0)); - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(firstVectorElem, ir.getInt64Ty()), get_function_id_const(ibi)}); - } else { - ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(addr, ir.getInt64Ty()), get_function_id_const(ibi)}); - } + insertInstrumentation(ibi, addr); } void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { - llvm::IRBuilder<> ir(&ii); - auto func = ii.getCalledFunction(); - - // Log the function entry with the function ID - ir.CreateCall(fn_enter_log_fn, - {get_function_id_const(ii), - ir.CreateGlobalStringPtr(func ? 
func->getName() : "indirect")}); + auto func = ii.getCalledOperand(); + insertInstrumentation(ii, func); } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { From be949d6c30822e81a9b84d89a900affb8c7054dd Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 15:19:19 +0000 Subject: [PATCH 085/112] handle vector types and constants properly --- polytracker/src/passes/taint_tracking.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/polytracker/src/passes/taint_tracking.cpp b/polytracker/src/passes/taint_tracking.cpp index 0ba76572..391c4cdd 100644 --- a/polytracker/src/passes/taint_tracking.cpp +++ b/polytracker/src/passes/taint_tracking.cpp @@ -74,7 +74,7 @@ void TaintTrackingPass::insertCondBrLogCall(llvm::Instruction &inst, llvm::IRBuilder<> ir(&inst); auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { + if (llvm::isa(val->getType())) { dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } @@ -88,7 +88,7 @@ void TaintTrackingPass::insertTaintStartupCall(llvm::Module &mod) { void TaintTrackingPass::visitGetElementPtrInst(llvm::GetElementPtrInst &gep) { for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { + if (llvm::isa(idx)) { continue; } insertCondBrLogCall(gep, idx); @@ -107,15 +107,27 @@ void TaintTrackingPass::visitSwitchInst(llvm::SwitchInst &si) { } void TaintTrackingPass::visitSelectInst(llvm::SelectInst &si) { - insertCondBrLogCall(si, si.getCondition()); + auto cond = si.getCondition(); + if (llvm::isa(cond)) { + return; + } + insertCondBrLogCall(si, cond); } void TaintTrackingPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { - insertCondBrLogCall(ibi, ibi.getAddress()); + auto addr = ibi.getAddress(); + if (llvm::isa(addr)) { + return; + } + insertCondBrLogCall(ibi, addr); } void TaintTrackingPass::visitInvokeInst(llvm::InvokeInst &ii) { - insertCondBrLogCall(ii, ii.getCalledOperand()); + auto called = ii.getCalledOperand(); + if (llvm::isa(called)) { + 
return; + } + insertCondBrLogCall(ii, called); } void TaintTrackingPass::declareLoggingFunctions(llvm::Module &mod) { From 4d70bfa71f72abb1b267111eeef534e63ffdf4fb Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 16:38:23 +0000 Subject: [PATCH 086/112] got experiment with taint tracking working - try this --- examples/Dockerfile-poppler.demo | 4 +-- .../polytracker/passes/tainted_control_flow.h | 6 ++--- polytracker/src/passes/pass_plugin.cpp | 2 +- .../src/passes/tainted_control_flow.cpp | 27 +++++++++---------- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index 157b8da4..530d2fbb 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -45,7 +45,7 @@ RUN polytracker build cmake -S . -B build \ RUN polytracker build cmake --build build -j$(nproc) # pdftotext (separate for measurement purposes only) -RUN polytracker instrument-targets --cflog pdftotext --ignore-lists freetype fontconfig +RUN polytracker instrument-targets pdftotext --ignore-lists freetype fontconfig # pdftops (separate for measurement purposes only) -RUN polytracker instrument-targets --cflog pdftops --ignore-lists freetype fontconfig \ No newline at end of file +RUN polytracker instrument-targets pdftops --ignore-lists freetype fontconfig \ No newline at end of file diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 9369f246..b39245b8 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -41,9 +41,9 @@ class TaintedControlFlowPass void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); - void visitSelectInst(llvm::SelectInst &si); - void visitIndirectBrInst(llvm::IndirectBrInst &ibi); - void 
visitInvokeInst(llvm::InvokeInst &ii); + // void visitSelectInst(llvm::SelectInst &si); + // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + // void visitInvokeInst(llvm::InvokeInst &ii); void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst &ri); diff --git a/polytracker/src/passes/pass_plugin.cpp b/polytracker/src/passes/pass_plugin.cpp index 5de824cf..d910dfd4 100644 --- a/polytracker/src/passes/pass_plugin.cpp +++ b/polytracker/src/passes/pass_plugin.cpp @@ -33,7 +33,7 @@ llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { return true; } if (name == "pt-tcf") { - mpm.addPass(polytracker::TaintedControlFlowPass()); + // mpm.addPass(polytracker::TaintedControlFlowPass()); return true; } return false; diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 3887e4b3..d78ddce6 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -59,9 +59,6 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } - // Log the type of dummy_val before the call - spdlog::debug("HIIIIII dummy_val type: {}", dummy_val->getType()->getTypeID()); - // logs the label and the function id at this point; // data flow has affected control flow here. 
ir.CreateCall(cond_br_log_fn, @@ -92,20 +89,20 @@ void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { insertInstrumentation(si, cond); } -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - auto cond = si.getCondition(); - insertInstrumentation(si, cond); -} +// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { +// auto cond = si.getCondition(); +// insertInstrumentation(si, cond); +// } -void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { - auto addr = ibi.getAddress(); - insertInstrumentation(ibi, addr); -} +// void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { +// auto addr = ibi.getAddress(); +// insertInstrumentation(ibi, addr); +// } -void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { - auto func = ii.getCalledOperand(); - insertInstrumentation(ii, func); -} +// void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { +// auto func = ii.getCalledOperand(); +// insertInstrumentation(ii, func); +// } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { llvm::LLVMContext *context = &mod.getContext(); From 21ea2ddb7b0e5b9d726d5d92428e026f74b944d6 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 16:55:38 +0000 Subject: [PATCH 087/112] oops - checkout does mean we need to have everything committed --- examples/Dockerfile-mupdf.demo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index afab29d9..30e3f549 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -22,7 +22,7 @@ WORKDIR /polytracker/the_klondike/mupdf RUN git checkout d00de0e96a4a5ec90ffc30837d40cd624a6a89e0 # Instrument mutool RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr/local build=release install -RUN polytracker instrument-targets --cflog mutool +RUN polytracker instrument-targets 
mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is intended to be mounted at runtime # VOLUME ["/workdir"] From 2713aa8116eb5e28c65a6f707e9d9280b3a5cfb9 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Thu, 3 Apr 2025 16:56:34 +0000 Subject: [PATCH 088/112] adn qpdf --- examples/Dockerfile-qpdf.demo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Dockerfile-qpdf.demo b/examples/Dockerfile-qpdf.demo index c685579b..de10c39f 100644 --- a/examples/Dockerfile-qpdf.demo +++ b/examples/Dockerfile-qpdf.demo @@ -23,5 +23,5 @@ WORKDIR /polytracker/the_klondike/qpdf RUN polytracker build cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBUILD_STATIC_LIBS=ON #Instrument and build track target RUN polytracker build cmake --build build -j$((`nproc`+1)) -RUN polytracker instrument-targets --cflog qpdf --ignore-lists libz +RUN polytracker instrument-targets qpdf --ignore-lists libz RUN mv qpdf.instrumented qpdf_track From 2c260bd0dae6fe93999670614cc530415633f453 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 07:01:02 +0000 Subject: [PATCH 089/112] try it with the tcf pass, just without the extra instructions... which one will break? 
rouletteeee --- polytracker/src/passes/pass_plugin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/src/passes/pass_plugin.cpp b/polytracker/src/passes/pass_plugin.cpp index d910dfd4..5de824cf 100644 --- a/polytracker/src/passes/pass_plugin.cpp +++ b/polytracker/src/passes/pass_plugin.cpp @@ -33,7 +33,7 @@ llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { return true; } if (name == "pt-tcf") { - // mpm.addPass(polytracker::TaintedControlFlowPass()); + mpm.addPass(polytracker::TaintedControlFlowPass()); return true; } return false; From f59a0decb7aed03caa147458cf4abfbac7e61084 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 07:21:05 +0000 Subject: [PATCH 090/112] adds select back --- .../include/polytracker/passes/tainted_control_flow.h | 2 +- polytracker/src/passes/tainted_control_flow.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index b39245b8..2de556f1 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -41,7 +41,7 @@ class TaintedControlFlowPass void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); - // void visitSelectInst(llvm::SelectInst &si); + void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); // void visitInvokeInst(llvm::InvokeInst &ii); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index d78ddce6..fe8289df 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -89,10 +89,13 @@ void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { insertInstrumentation(si, cond); } 
-// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { -// auto cond = si.getCondition(); -// insertInstrumentation(si, cond); -// } +void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + auto cond = si.getCondition(); + if (llvm::isa(cond)) { + return; + } + insertInstrumentation(si, cond); +} // void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { // auto addr = ibi.getAddress(); From b9eb6d4dbbb223de9f189bfa3009fc49292eaa5f Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 07:39:42 +0000 Subject: [PATCH 091/112] adds ibi back --- polytracker/src/passes/tainted_control_flow.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index fe8289df..5ecabee8 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -97,10 +97,13 @@ void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { insertInstrumentation(si, cond); } -// void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { -// auto addr = ibi.getAddress(); -// insertInstrumentation(ibi, addr); -// } +void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { + auto addr = ibi.getAddress(); + if (llvm::isa(addr)) { + return; + } + insertInstrumentation(ibi, addr); +} // void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { // auto func = ii.getCalledOperand(); From 0a7a30a54f581d3a35bfc27bb9f00f3a1b025ebb Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 07:58:32 +0000 Subject: [PATCH 092/112] adds invoke back --- polytracker/src/passes/tainted_control_flow.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 5ecabee8..b8be4e42 100644 --- 
a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -105,10 +105,13 @@ void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { insertInstrumentation(ibi, addr); } -// void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { -// auto func = ii.getCalledOperand(); -// insertInstrumentation(ii, func); -// } +void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { + auto called = ii.getCalledOperand(); + if (llvm::isa(called)) { + return; + } + insertInstrumentation(ii, called); +} void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { llvm::LLVMContext *context = &mod.getContext(); From 2a414b8e76cafaba4474ec8bab70e8ef29784efa Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 08:19:37 +0000 Subject: [PATCH 093/112] oops, uncomment out the indirect and invoke --- polytracker/include/polytracker/passes/tainted_control_flow.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 2de556f1..9369f246 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -42,8 +42,8 @@ class TaintedControlFlowPass void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); void visitSelectInst(llvm::SelectInst &si); - // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); - // void visitInvokeInst(llvm::InvokeInst &ii); + void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + void visitInvokeInst(llvm::InvokeInst &ii); void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst &ri); From 121968b49ee0d0f3c3bd6327f6093be655725ec6 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 08:21:19 +0000 Subject: [PATCH 094/112] try adding cflog back to 
mupdf example --- examples/Dockerfile-mupdf.demo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index 30e3f549..afab29d9 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -22,7 +22,7 @@ WORKDIR /polytracker/the_klondike/mupdf RUN git checkout d00de0e96a4a5ec90ffc30837d40cd624a6a89e0 # Instrument mutool RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr/local build=release install -RUN polytracker instrument-targets mutool +RUN polytracker instrument-targets --cflog mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is intended to be mounted at runtime # VOLUME ["/workdir"] From 9f451dc4773fa524f668bfa75c9ba8541a8a6604 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 08:46:41 +0000 Subject: [PATCH 095/112] try without extra instrs --- .../polytracker/passes/tainted_control_flow.h | 6 +-- .../src/passes/tainted_control_flow.cpp | 46 +++++++++---------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 9369f246..b39245b8 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -41,9 +41,9 @@ class TaintedControlFlowPass void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); - void visitSelectInst(llvm::SelectInst &si); - void visitIndirectBrInst(llvm::IndirectBrInst &ibi); - void visitInvokeInst(llvm::InvokeInst &ii); + // void visitSelectInst(llvm::SelectInst &si); + // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + // void visitInvokeInst(llvm::InvokeInst &ii); void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst 
&ri); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index b8be4e42..594db861 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -89,29 +89,29 @@ void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { insertInstrumentation(si, cond); } -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - auto cond = si.getCondition(); - if (llvm::isa(cond)) { - return; - } - insertInstrumentation(si, cond); -} - -void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { - auto addr = ibi.getAddress(); - if (llvm::isa(addr)) { - return; - } - insertInstrumentation(ibi, addr); -} - -void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { - auto called = ii.getCalledOperand(); - if (llvm::isa(called)) { - return; - } - insertInstrumentation(ii, called); -} +// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { +// auto cond = si.getCondition(); +// if (llvm::isa(cond)) { +// return; +// } +// insertInstrumentation(si, cond); +// } + +// void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { +// auto addr = ibi.getAddress(); +// if (llvm::isa(addr)) { +// return; +// } +// insertInstrumentation(ibi, addr); +// } + +// void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { +// auto called = ii.getCalledOperand(); +// if (llvm::isa(called)) { +// return; +// } +// insertInstrumentation(ii, called); +// } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { llvm::LLVMContext *context = &mod.getContext(); From c1d0895103c6c3a8f15f8656e49ecd8b51725499 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:04:07 +0000 Subject: [PATCH 096/112] try without switch --- .../include/polytracker/passes/tainted_control_flow.h | 2 +- polytracker/src/passes/tainted_control_flow.cpp | 8 ++++---- 2 files changed, 5 
insertions(+), 5 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index b39245b8..58bb30c9 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -40,7 +40,7 @@ class TaintedControlFlowPass llvm::ModuleAnalysisManager &mam); void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); - void visitSwitchInst(llvm::SwitchInst &si); + // void visitSwitchInst(llvm::SwitchInst &si); // void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); // void visitInvokeInst(llvm::InvokeInst &ii); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 594db861..f4ff9628 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -84,10 +84,10 @@ void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { insertInstrumentation(bi, cond); } -void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - auto cond = si.getCondition(); - insertInstrumentation(si, cond); -} +// void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { +// auto cond = si.getCondition(); +// insertInstrumentation(si, cond); +// } // void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { // auto cond = si.getCondition(); From 4e2e77c52aaa9c6000fe4366321c47e696cdb082 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:19:39 +0000 Subject: [PATCH 097/112] try with just gep - and checking if the vector element type is constant --- .../polytracker/passes/tainted_control_flow.h | 2 +- .../src/passes/tainted_control_flow.cpp | 20 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git 
a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 58bb30c9..faa00fdf 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -39,7 +39,7 @@ class TaintedControlFlowPass llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); - void visitBranchInst(llvm::BranchInst &bi); + // void visitBranchInst(llvm::BranchInst &bi); // void visitSwitchInst(llvm::SwitchInst &si); // void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index f4ff9628..c4869fa4 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -56,6 +56,10 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val auto dummy_val{val}; if (llvm::isa(val->getType())) { + if (llvm::isa(val->getElementType())) { + return; + } + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); } @@ -69,20 +73,20 @@ void TaintedControlFlowPass::visitGetElementPtrInst( llvm::GetElementPtrInst &gep) { // if an index is a constant, skip it for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { + if (llvm::isa(idx)) { continue; } insertInstrumentation(gep, idx); } } -void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; - } - auto cond = bi.getCondition(); - insertInstrumentation(bi, cond); -} +// void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { +// if (bi.isUnconditional()) { +// return; +// } +// auto cond = bi.getCondition(); +// insertInstrumentation(bi, cond); +// } // void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { // auto cond = 
si.getCondition(); From 8fb2938523327751bdc354b0bab1541bbfff8870 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:27:27 +0000 Subject: [PATCH 098/112] cast to vector first before getting element --- polytracker/src/passes/tainted_control_flow.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index c4869fa4..d75e3154 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -56,7 +56,8 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val auto dummy_val{val}; if (llvm::isa(val->getType())) { - if (llvm::isa(val->getElementType())) { + auto vec = llvm::cast(val)); + if (llvm::isa(vec.getElementType())) { return; } From 3ac4fab80f9639735d89dbc40e47dfb90aecf3e0 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:27:55 +0000 Subject: [PATCH 099/112] cast to vector first before getting element --- polytracker/src/passes/tainted_control_flow.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index d75e3154..07a271e2 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -56,7 +56,7 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val auto dummy_val{val}; if (llvm::isa(val->getType())) { - auto vec = llvm::cast(val)); + auto vec = llvm::cast(val); if (llvm::isa(vec.getElementType())) { return; } From 7242c02b9ce6aa8dacf14764f233251f6e947319 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:36:39 +0000 Subject: [PATCH 100/112] try checking vector value type --- polytracker/src/passes/tainted_control_flow.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 07a271e2..18009233 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -56,8 +56,9 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val auto dummy_val{val}; if (llvm::isa(val->getType())) { - auto vec = llvm::cast(val); - if (llvm::isa(vec.getElementType())) { + // constants aren't derived from input, so we don't need to taint them + if (llvm::isa(val) || + llvm::isa(val)) { return; } From 2cedbd3e12772c389a01d6e6167d800763439319 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 09:48:28 +0000 Subject: [PATCH 101/112] try commenting out the actual insertion and the sign extension to see if its literally just element extraction for gep that is borked --- polytracker/src/passes/tainted_control_flow.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 18009233..fa2bc47c 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -67,8 +67,8 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val // logs the label and the function id at this point; // data flow has affected control flow here. 
- ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); + // ir.CreateCall(cond_br_log_fn, + // {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); } void TaintedControlFlowPass::visitGetElementPtrInst( From 8227b5e7548b79a25135cc33eac074a2f89598ea Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 10:15:27 +0000 Subject: [PATCH 102/112] oops, made the way we were visiting instrs transparent, but not the actual instrumentation --- polytracker/src/passes/tainted_control_flow.cpp | 8 ++++---- polytracker/src/polytracker/polytracker.cpp | 9 +++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index fa2bc47c..465230fe 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -62,13 +62,13 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val return; } - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); } // logs the label and the function id at this point; // data flow has affected control flow here. 
- // ir.CreateCall(cond_br_log_fn, - // {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); + ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); } void TaintedControlFlowPass::visitGetElementPtrInst( @@ -130,7 +130,7 @@ void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { {{llvm::AttributeList::FunctionIndex, llvm::Attribute::get(mod.getContext(), llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + ir.getVoidTy(), label_ty, ir.getInt32Ty()); enter_log_fn_type = llvm::FunctionType::get( llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index a00a724b..b18cbaf8 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -43,16 +43,13 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { polytracker::taint_argv(argc, argv); } -extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( +extern "C" void __dfsw___polytracker_log_tainted_control_flow( uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, - dfsan_label function_label, dfsan_label *ret_label) { + dfsan_label function_label) { if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, - functionid); + function_label); } - - *ret_label = conditional_label; - return conditional; } extern "C" void __polytracker_enter_function(uint32_t function_id, From 4a4681af887d3373e1f62f4786f41328e0508be0 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 11:09:06 +0000 Subject: [PATCH 103/112] align extraction index with the size of label_ty --- polytracker/src/passes/taint_tracking.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polytracker/src/passes/taint_tracking.cpp 
b/polytracker/src/passes/taint_tracking.cpp index 391c4cdd..565925a0 100644 --- a/polytracker/src/passes/taint_tracking.cpp +++ b/polytracker/src/passes/taint_tracking.cpp @@ -75,7 +75,7 @@ void TaintTrackingPass::insertCondBrLogCall(llvm::Instruction &inst, auto dummy_val{val}; if (llvm::isa(val->getType())) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); } ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); From 285753c818f4f287d6c41b15b9744ac92115ea00 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 11:09:24 +0000 Subject: [PATCH 104/112] turn select back on in tcf --- .../polytracker/passes/tainted_control_flow.h | 6 ++-- .../src/passes/tainted_control_flow.cpp | 36 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index faa00fdf..2de556f1 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -39,9 +39,9 @@ class TaintedControlFlowPass llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); - // void visitBranchInst(llvm::BranchInst &bi); - // void visitSwitchInst(llvm::SwitchInst &si); - // void visitSelectInst(llvm::SelectInst &si); + void visitBranchInst(llvm::BranchInst &bi); + void visitSwitchInst(llvm::SwitchInst &si); + void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); // void visitInvokeInst(llvm::InvokeInst &ii); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 465230fe..d6b7f860 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ 
-82,26 +82,26 @@ void TaintedControlFlowPass::visitGetElementPtrInst( } } -// void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { -// if (bi.isUnconditional()) { -// return; -// } -// auto cond = bi.getCondition(); -// insertInstrumentation(bi, cond); -// } +void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; + } + auto cond = bi.getCondition(); + insertInstrumentation(bi, cond); +} -// void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { -// auto cond = si.getCondition(); -// insertInstrumentation(si, cond); -// } +void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + auto cond = si.getCondition(); + insertInstrumentation(si, cond); +} -// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { -// auto cond = si.getCondition(); -// if (llvm::isa(cond)) { -// return; -// } -// insertInstrumentation(si, cond); -// } +void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + auto cond = si.getCondition(); + if (llvm::isa(cond)) { + return; + } + insertInstrumentation(si, cond); +} // void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { // auto addr = ibi.getAddress(); From 5bcc226c90de9c480e09f0820f59eaa8e6c6f08f Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 11:55:50 +0000 Subject: [PATCH 105/112] ughhhh try with all the instrs except gep commented out again --- .../polytracker/passes/tainted_control_flow.h | 6 ++-- .../src/passes/tainted_control_flow.cpp | 36 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index 2de556f1..faa00fdf 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -39,9 +39,9 @@ class TaintedControlFlowPass 
llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); - void visitBranchInst(llvm::BranchInst &bi); - void visitSwitchInst(llvm::SwitchInst &si); - void visitSelectInst(llvm::SelectInst &si); + // void visitBranchInst(llvm::BranchInst &bi); + // void visitSwitchInst(llvm::SwitchInst &si); + // void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); // void visitInvokeInst(llvm::InvokeInst &ii); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index d6b7f860..465230fe 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -82,26 +82,26 @@ void TaintedControlFlowPass::visitGetElementPtrInst( } } -void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { - if (bi.isUnconditional()) { - return; - } - auto cond = bi.getCondition(); - insertInstrumentation(bi, cond); -} +// void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { +// if (bi.isUnconditional()) { +// return; +// } +// auto cond = bi.getCondition(); +// insertInstrumentation(bi, cond); +// } -void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - auto cond = si.getCondition(); - insertInstrumentation(si, cond); -} +// void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { +// auto cond = si.getCondition(); +// insertInstrumentation(si, cond); +// } -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - auto cond = si.getCondition(); - if (llvm::isa(cond)) { - return; - } - insertInstrumentation(si, cond); -} +// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { +// auto cond = si.getCondition(); +// if (llvm::isa(cond)) { +// return; +// } +// insertInstrumentation(si, cond); +// } // void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { // auto 
addr = ibi.getAddress(); From bea7a67605b4663cd7b255d5db6b85633869113d Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 12:14:46 +0000 Subject: [PATCH 106/112] experiment with following the pattern used for the taint tracking instrumentation --- polytracker/src/polytracker/polytracker.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index b18cbaf8..c71994e4 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -43,15 +43,20 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { polytracker::taint_argv(argc, argv); } -extern "C" void __dfsw___polytracker_log_tainted_control_flow( - uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, - dfsan_label function_label) { +extern "C" void ___polytracker_log_tainted_control_flow( + dfsan_label conditional_label, dfsan_label function_label) { if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, function_label); } } +extern "C" void __dfsw___polytracker_log_tainted_control_flow( + uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, + dfsan_label function_label) { + ___polytracker_log_tainted_control_flow(conditional_label, function_label); +} + extern "C" void __polytracker_enter_function(uint32_t function_id, const char *function_name) { get_polytracker_tdag().record_function_name(function_id, From 0ef0523b61cd5ae45c5eeb9fbdc0bb279e8302b1 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 12:21:57 +0000 Subject: [PATCH 107/112] why do we explicitly declare the dsfw version? 
gonna fafo I suppose --- polytracker/src/polytracker/polytracker.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index c71994e4..e217cfd1 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -44,18 +44,18 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { } extern "C" void ___polytracker_log_tainted_control_flow( - dfsan_label conditional_label, dfsan_label function_label) { + dfsan_label conditional_label, uint32_t function_label) { if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, function_label); } } -extern "C" void __dfsw___polytracker_log_tainted_control_flow( - uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, - dfsan_label function_label) { - ___polytracker_log_tainted_control_flow(conditional_label, function_label); -} +// extern "C" void __dfsw___polytracker_log_tainted_control_flow( +// uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, +// uint32_t function_label) { +// ___polytracker_log_tainted_control_flow(conditional_label, function_label); +// } extern "C" void __polytracker_enter_function(uint32_t function_id, const char *function_name) { From 9d4f790b0144d31f9b9d605dd76d4bfea9b2ba04 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 12:39:21 +0000 Subject: [PATCH 108/112] okay, try being custom --- polytracker/src/polytracker/polytracker.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index e217cfd1..df0a874c 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -43,19 +43,19 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { 
polytracker::taint_argv(argc, argv); } -extern "C" void ___polytracker_log_tainted_control_flow( - dfsan_label conditional_label, uint32_t function_label) { +extern "C" void __polytracker_log_tainted_control_flow( + dfsan_label conditional_label, uint32_t function_id) { if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, - function_label); + function_id); } } -// extern "C" void __dfsw___polytracker_log_tainted_control_flow( -// uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, -// uint32_t function_label) { -// ___polytracker_log_tainted_control_flow(conditional_label, function_label); -// } +extern "C" void __dfsw___polytracker_log_tainted_control_flow( + uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, + uint32_t function_id_label) { + __polytracker_log_tainted_control_flow(conditional_label, function_id_label); +} extern "C" void __polytracker_enter_function(uint32_t function_id, const char *function_name) { From 1e6d9a4a925a89ca020ee2fc4a00f03f95f23cf0 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 13:13:27 +0000 Subject: [PATCH 109/112] try a static cast of what is passed from the dfsw to the main instr fn --- polytracker/src/polytracker/polytracker.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index df0a874c..8e2e6df8 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -53,8 +53,10 @@ extern "C" void __polytracker_log_tainted_control_flow( extern "C" void __dfsw___polytracker_log_tainted_control_flow( uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, - uint32_t function_id_label) { - __polytracker_log_tainted_control_flow(conditional_label, function_id_label); + dfsan_label function_id_label) { + // Convert function_id_label from dfsan_label to 
uint32_t + uint32_t fid_32 = static_cast(function_id_label); + __polytracker_log_tainted_control_flow(conditional_label, fid_32); } extern "C" void __polytracker_enter_function(uint32_t function_id, From 410ae10d7e499d54347484b206df3fb78e0bd72e Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 15:38:27 +0000 Subject: [PATCH 110/112] try with NO visitors to see if even that breaks --- .../polytracker/passes/tainted_control_flow.h | 2 +- .../src/passes/tainted_control_flow.cpp | 26 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index faa00fdf..d9a37784 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -38,7 +38,7 @@ class TaintedControlFlowPass llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); - void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); + // void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); // void visitBranchInst(llvm::BranchInst &bi); // void visitSwitchInst(llvm::SwitchInst &si); // void visitSelectInst(llvm::SelectInst &si); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 465230fe..a1668490 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -65,22 +65,24 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); } + auto label = ir.CreateSExtOrTrunc(dummy_val, label_ty); + auto function_id = get_function_id_const(inst); + // logs the label and the function id at this point; // data flow has affected control flow here. 
- ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(dummy_val, label_ty), get_function_id_const(inst)}); + ir.CreateCall(cond_br_log_fn, {label, function_id}); } -void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - // if an index is a constant, skip it - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; - } - insertInstrumentation(gep, idx); - } -} +// void TaintedControlFlowPass::visitGetElementPtrInst( +// llvm::GetElementPtrInst &gep) { +// // if an index is a constant, skip it +// for (auto &idx : gep.indices()) { +// if (llvm::isa(idx)) { +// continue; +// } +// insertInstrumentation(gep, idx); +// } +// } // void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { // if (bi.isUnconditional()) { From 00fab6df7ba6b22b6dbec8f2e39c61598da65b06 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 16:17:43 +0000 Subject: [PATCH 111/112] uncomment branch --- .../polytracker/passes/tainted_control_flow.h | 2 +- .../src/passes/tainted_control_flow.cpp | 28 +++++++++---------- polytracker/src/polytracker/polytracker.cpp | 1 - 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index d9a37784..faa00fdf 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -38,7 +38,7 @@ class TaintedControlFlowPass llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); - // void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); + void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); // void visitBranchInst(llvm::BranchInst &bi); // void visitSwitchInst(llvm::SwitchInst &si); // void visitSelectInst(llvm::SelectInst &si); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index 
a1668490..c7c945f8 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -56,13 +56,11 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val auto dummy_val{val}; if (llvm::isa(val->getType())) { - // constants aren't derived from input, so we don't need to taint them - if (llvm::isa(val) || - llvm::isa(val)) { + dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); + + if (llvm::isa(dummy_val->getType())) { return; } - - dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); } auto label = ir.CreateSExtOrTrunc(dummy_val, label_ty); @@ -73,16 +71,16 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val ir.CreateCall(cond_br_log_fn, {label, function_id}); } -// void TaintedControlFlowPass::visitGetElementPtrInst( -// llvm::GetElementPtrInst &gep) { -// // if an index is a constant, skip it -// for (auto &idx : gep.indices()) { -// if (llvm::isa(idx)) { -// continue; -// } -// insertInstrumentation(gep, idx); -// } -// } +void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + // if an index is a constant, skip it + for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; + } + insertInstrumentation(gep, idx); + } +} // void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { // if (bi.isUnconditional()) { diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 8e2e6df8..d24708fb 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -54,7 +54,6 @@ extern "C" void __polytracker_log_tainted_control_flow( extern "C" void __dfsw___polytracker_log_tainted_control_flow( uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, dfsan_label function_id_label) { - // Convert function_id_label from dfsan_label to uint32_t uint32_t fid_32 = 
static_cast(function_id_label); __polytracker_log_tainted_control_flow(conditional_label, fid_32); } From a5ebbb37fb8a19a39277c41942bb7860d31a4403 Mon Sep 17 00:00:00 2001 From: Kelly Kaoudis Date: Fri, 4 Apr 2025 16:17:48 +0000 Subject: [PATCH 112/112] uncomment branch --- .../polytracker/passes/tainted_control_flow.h | 4 +-- .../src/passes/tainted_control_flow.cpp | 32 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index faa00fdf..996a4810 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -38,8 +38,8 @@ class TaintedControlFlowPass llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); - void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); - // void visitBranchInst(llvm::BranchInst &bi); + // void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); + void visitBranchInst(llvm::BranchInst &bi); // void visitSwitchInst(llvm::SwitchInst &si); // void visitSelectInst(llvm::SelectInst &si); // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index c7c945f8..71eb9b7a 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -71,25 +71,25 @@ TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Val ir.CreateCall(cond_br_log_fn, {label, function_id}); } -void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - // if an index is a constant, skip it - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; - } - insertInstrumentation(gep, idx); - } -} - -// void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { -// if 
(bi.isUnconditional()) { -// return; +// void TaintedControlFlowPass::visitGetElementPtrInst( +// llvm::GetElementPtrInst &gep) { +// // if an index is a constant, skip it +// for (auto &idx : gep.indices()) { +// if (llvm::isa(idx)) { +// continue; +// } +// insertInstrumentation(gep, idx); // } -// auto cond = bi.getCondition(); -// insertInstrumentation(bi, cond); // } +void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; + } + auto cond = bi.getCondition(); + insertInstrumentation(bi, cond); +} + // void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { // auto cond = si.getCondition(); // insertInstrumentation(si, cond);