diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..0037e615 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +# Autoformatter friendly flake8 config (all formatting rules disabled) +[flake8] +extend-ignore = D1, D2, E1, E2, E3, E501, W1, W2, W3, W5 \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da74c34e..8b2651cb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,11 +1,11 @@ name: Build - +permissions: {} on: workflow_dispatch: {} push: branches: - - "master" + - master tags: - "*" @@ -21,16 +21,16 @@ jobs: steps: - name: Clone polytracker repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Build and export - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . tags: trailofbits/polytracker:latest @@ -57,7 +57,17 @@ jobs: - name: Load image run: docker load --input /tmp/polytracker.tar - - name: Run tests + - name: C++ (write side) tests + uses: addnab/docker-run-action@v3 + with: + image: trailofbits/polytracker:latest + run: | + cd /polytracker-build/unittests/src/taintdag && \ + pwd && \ + chmod +x tests-taintdag && \ + ./tests-taintdag + + - name: Python (integration) tests uses: addnab/docker-run-action@v3 with: image: trailofbits/polytracker:latest @@ -73,7 +83,7 @@ jobs: steps: - name: Clone polytracker repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: true fetch-depth: 1 @@ -88,10 +98,10 @@ jobs: run: docker load --input /tmp/polytracker.tar - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Build ${{ matrix.example }} - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: examples file: examples/Dockerfile-${{ matrix.example }}.demo 
diff --git a/.gitignore b/.gitignore index 798be876..7019f8c5 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,5 @@ __pycache__ /Default/ polytracker.egg-info /polytracker.egg-info -compile_commands.json \ No newline at end of file +compile_commands.json +/test_inputs/* diff --git a/.trunk/.gitignore b/.trunk/.gitignore index 1e246529..15966d08 100644 --- a/.trunk/.gitignore +++ b/.trunk/.gitignore @@ -6,3 +6,4 @@ plugins user_trunk.yaml user.yaml +tmp diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index ca85e5b9..a962cf27 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -1,44 +1,56 @@ version: 0.1 cli: - version: 1.14.1 + version: 1.22.11 plugins: sources: - id: trunk - ref: v1.2.1 + ref: v1.6.7 uri: https://github.com/trunk-io/plugins runtimes: enabled: - - go@1.19.5 - - node@18.12.1 + - go@1.21.0 + - node@18.20.5 - python@3.10.8 lint: + threshold: + - linters: [ALL] + level: high ignore: - linters: [ALL] paths: - polytracker/src/compiler-rt/** + - third_party/** + - linters: [bandit] + paths: + - tests/** + definitions: + - name: flake8 + direct_configs: + - .flake8 + enabled: - - taplo@0.8.1 - - actionlint@1.6.25 - - bandit@1.7.5 - - black@23.7.0 - - checkov@2.4.5 + - taplo@0.9.3 + - actionlint@1.7.7 + - bandit@1.8.3 + - black@25.1.0 + - checkov@3.2.390 - clang-format@16.0.3 - - flake8@6.1.0 + - flake8@7.1.2 - git-diff-check - - hadolint@2.12.0 - - isort@5.12.0 - - markdownlint@0.35.0 - - mypy@1.5.1 - - oxipng@8.0.0 - - prettier@3.0.2 - - ruff@0.0.285 - - shellcheck@0.9.0 + - hadolint@2.12.1-beta + - isort@6.0.1 + - markdownlint@0.44.0 + - mypy@1.15.0 + - oxipng@9.1.4 + - prettier@3.5.3 + - ruff@0.11.1 + - shellcheck@0.10.0 - shfmt@3.6.0 - - svgo@3.0.2 - - terrascan@1.18.3 - - trivy@0.44.1 - - trufflehog@3.48.0 - - yamllint@1.32.0 + - svgo@3.3.2 + - terrascan@1.19.1 + - trivy@0.60.0 + - trufflehog@3.88.18 + - yamllint@1.36.2 actions: disabled: - trunk-announce diff --git a/CMakeLists.txt b/CMakeLists.txt index 8178ad5f..b5f920ca 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,8 +22,6 @@ endif() set(CMAKE_CXX_STANDARD 20) -add_subdirectory(third_party/Catch2) -list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/Catch2/contrib") add_subdirectory(third_party/indicators) set(SPDLOG_NO_EXCEPTIONS TRUE) add_subdirectory(third_party/spdlog) @@ -31,4 +29,6 @@ add_subdirectory(third_party/spdlog) add_subdirectory(polytracker) enable_testing() +add_subdirectory(third_party/Catch2) +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/third_party/Catch2/contrib") add_subdirectory(unittests/src/taintdag) diff --git a/Dockerfile b/Dockerfile index f7f7f7eb..3b8c8a80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # Build base image -FROM ubuntu:jammy as base +FROM ubuntu:jammy AS base LABEL org.opencontainers.image.authors="evan.sultanik@trailofbits.com" @@ -41,7 +41,7 @@ RUN GO111MODULE=off go get github.com/SRI-CSL/gllvm/cmd/... ENV PATH=$PATH:/root/go/bin # Clone llvm to build `libc++` from source -FROM base as llvm-sources +FROM base AS llvm-sources RUN git clone --depth 1 --branch llvmorg-13.0.0 https://github.com/llvm/llvm-project.git /llvm-project @@ -54,7 +54,7 @@ RUN git clone --depth 1 --branch llvmorg-13.0.0 https://github.com/llvm/llvm-pro # Build "clean" `libc++` with `gclang`. Used to link the uninstrumented # target of the user project. Installed into `/cxx_lib/clean_build`. -FROM llvm-sources as clean-libcxx +FROM llvm-sources AS clean-libcxx ENV WLLVM_BC_STORE=/cxx_clean_bitcode RUN mkdir -p $WLLVM_BC_STORE @@ -78,7 +78,7 @@ RUN cmake --build $LIBCXX_BUILD_DIR --target install-cxx install-cxxabi -j$((`np # Build "poly" `libc++` with `gclang`. Used to link the instrumented # target of the user project. Installed into `/cxx_lib/poly_build`. 
-FROM clean-libcxx as poly-libcxx +FROM clean-libcxx AS poly-libcxx ENV WLLVM_BC_STORE=/cxx_poly_bitcode RUN mkdir -p $WLLVM_BC_STORE @@ -104,7 +104,7 @@ RUN cmake -GNinja \ RUN cmake --build $LIBCXX_BUILD_DIR --target install-cxx install-cxxabi -j$((`nproc`+1)) # Build and install the polytracker -FROM poly-libcxx as polytracker +FROM poly-libcxx AS polytracker ARG DFSAN_FILENAME_ARCH=x86_64 diff --git a/docs/tdag.md b/docs/tdag.md index 993f89fa..26705c2b 100644 --- a/docs/tdag.md +++ b/docs/tdag.md @@ -37,10 +37,9 @@ Some specifics: - [Sources](../polytracker/src/taint_sources/taint_sources.cpp) contains source labels (byte offsets into the input) - The Source Label Index is a bitmap that defines how to index the sources section. - [Sinks](../polytracker/include/taintdag/sink.h) contains sink labels (representing bytes of the output) -- [Strings](../polytracker/include/taintdag/string_table.h) todo(kaoudis) the string table is used in conjunction with the fnmapping to put together an earlier version of the control flow log used for grammar extraction -- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this contains an early version of the function list part of the control flow log used for grammar extraction -- [Events](../polytracker/include/taintdag/fntrace.h) todo(kaoudis) this contains an early version of the entry and exit events used to structure the control flow log -- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow passed through. +- [Strings](../polytracker/include/taintdag/string_table.h) the string table is a section with two current purposes: it contains names of sources, and also the function names used with the functions section to map to the control flow log. Its design is general purpose: it can store any type of stringly data we eventually decide to store. 
+- [Functions](../polytracker/include/taintdag/fnmapping.h) todo(kaoudis) this section maps the low-level function identifiers used in the cflog to the actual (mangled) names in the strings table. see tests/test_cf_log.py for how this layout looks/works. +- [Control Flow Log](../polytracker/include/taintdag/control_flow_log.h): this consists of the function entry and exit records we need to reconstruct the call stack that data flow ("tainted control flow" or "control affecting data flow" are ways that we have referred to this subset of data flow) passed through, plus recorded function ids that can be mapped back through the Functions section to the String Table to get mangled symbols. see test_cf_log.py and unittests/src/taintdag/tdag.cpp for details of how this looks/works. ## TDAG Contents diff --git a/examples/Dockerfile-acropalypse.demo b/examples/Dockerfile-acropalypse.demo index fedbd39f..e4fd23db 100644 --- a/examples/Dockerfile-acropalypse.demo +++ b/examples/Dockerfile-acropalypse.demo @@ -27,5 +27,5 @@ RUN CPPFLAGS="-I$(pwd)/zlib-1.2.13/include" LDFLAGS="-L$(pwd)/zlib-1.2.13/lib" p RUN polytracker extract-bc -o pngtest.bc pngtest RUN llvm-link -o pngtest-linked.bc pngtest.bc libz.bc -RUN polytracker instrument-bc --taint --ftrace pngtest-linked.bc -o instrumented.bc +RUN polytracker instrument-bc --cflog pngtest-linked.bc -o instrumented.bc RUN polytracker lower-bc instrumented.bc -t pngtest -o pngtest.instrumented diff --git a/examples/Dockerfile-daedalus-nitf.demo b/examples/Dockerfile-daedalus-nitf.demo index 3e86d07f..e8ae88ae 100644 --- a/examples/Dockerfile-daedalus-nitf.demo +++ b/examples/Dockerfile-daedalus-nitf.demo @@ -30,5 +30,5 @@ RUN cabal run ../../:daedalus -- compile-c++ nitf_main.ddl --out-dir=cpp_parser WORKDIR /polytracker/the_klondike/daedalus/formats/nitf/cpp_parser RUN polytracker build make parser && \ - polytracker instrument-targets --taint --ftrace parser --ignore-lists gmp ssl libz && \ + polytracker instrument-targets --cflog 
parser --ignore-lists gmp ssl libz && \ mv parser.instrumented parser-track diff --git a/examples/Dockerfile-daedalus-pdf.demo b/examples/Dockerfile-daedalus-pdf.demo index a3ebca69..d709270c 100644 --- a/examples/Dockerfile-daedalus-pdf.demo +++ b/examples/Dockerfile-daedalus-pdf.demo @@ -4,11 +4,12 @@ RUN mkdir -p /polytracker/the_klondike WORKDIR /polytracker/the_klondike -ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y upgrade && apt-get install -y git pkg-config RUN git clone --recursive https://github.com/GaloisInc/daedalus.git WORKDIR /polytracker/the_klondike/daedalus +RUN git checkout 46a2502a8c68b17534079356a71af90e87801fdc FROM trailofbits/polytracker:latest LABEL org.opencontainers.image.authors="evan.sultanik@trailofbits.com" @@ -30,5 +31,5 @@ WORKDIR /polytracker/the_klondike/daedalus/formats/pdf/new/c++ RUN polytracker build cmake -S . -B build RUN polytracker build cmake --build build --target parser-test -j$(nproc) -RUN polytracker instrument-targets --taint --ftrace parser-test --ignore-lists gmp ssl libz +RUN polytracker instrument-targets --cflog parser-test --ignore-lists gmp ssl libz RUN mv parser-test.instrumented parser-test-track \ No newline at end of file diff --git a/examples/Dockerfile-ffmpeg.demo b/examples/Dockerfile-ffmpeg.demo index 5e19b135..a8d90d50 100644 --- a/examples/Dockerfile-ffmpeg.demo +++ b/examples/Dockerfile-ffmpeg.demo @@ -27,7 +27,7 @@ RUN ../configure --disable-everything \ --disable-asm RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace ffmpeg_g --ignore-lists libz +RUN polytracker instrument-targets --cflog ffmpeg_g --ignore-lists libz RUN mv ffmpeg_g.instrumented ffmpeg_track # Use the following command to transcode a `.mov` h264/aac file to an `.avi` raw/aac file diff --git a/examples/Dockerfile-file.demo b/examples/Dockerfile-file.demo index 46c48492..09c779f8 100644 --- a/examples/Dockerfile-file.demo +++ 
b/examples/Dockerfile-file.demo @@ -16,5 +16,5 @@ RUN git fetch --tags && \ RUN autoreconf -fiv RUN ./configure --prefix=/polytracker/the_klondike/bin/ --disable-shared RUN polytracker build make -j$((`nproc`+1)) install -RUN polytracker instrument-targets --taint --ftrace file --ignore-lists libz +RUN polytracker instrument-targets --cflog file --ignore-lists libz RUN mv file.instrumented file_track diff --git a/examples/Dockerfile-jq.demo b/examples/Dockerfile-jq.demo index 54b68464..4d1c260f 100644 --- a/examples/Dockerfile-jq.demo +++ b/examples/Dockerfile-jq.demo @@ -11,4 +11,4 @@ WORKDIR /polytracker/the_klondike/jq RUN autoreconf -fi RUN ./configure --with-oniguruma=builtin CC=clang RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace jq \ No newline at end of file +RUN polytracker instrument-targets --cflog jq \ No newline at end of file diff --git a/examples/Dockerfile-libjpeg.demo b/examples/Dockerfile-libjpeg.demo index 39fcc192..09f341e2 100644 --- a/examples/Dockerfile-libjpeg.demo +++ b/examples/Dockerfile-libjpeg.demo @@ -18,6 +18,6 @@ WORKDIR /polytracker/the_klondike/jpeg-9e/build RUN ../configure LDFLAGS="-static" # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace djpeg +RUN polytracker instrument-targets --cflog djpeg # Create `djpeg_track` RUN mv djpeg.instrumented djpeg_track \ No newline at end of file diff --git a/examples/Dockerfile-listgen.demo b/examples/Dockerfile-listgen.demo index 0b1417e8..6f1aa272 100644 --- a/examples/Dockerfile-listgen.demo +++ b/examples/Dockerfile-listgen.demo @@ -11,7 +11,7 @@ RUN apt update #Update pkg-config/util-linux (needed for FontConfig) RUN apt update RUN apt install pkg-config uuid-dev gperf libtool \ - gettext autopoint autoconf -y + gettext autopoint autoconf wget -y RUN apt-get install python3-dev @@ -38,7 +38,7 @@ RUN make -j5 install WORKDIR /polytracker/the_klondike #zlib -RUN wget 
https://www.zlib.net/zlib-1.2.11.tar.gz +RUN wget https://www.zlib.net/fossils/zlib-1.2.11.tar.gz RUN tar -xzvf zlib-1.2.11.tar.gz WORKDIR zlib-1.2.11 RUN ./configure --prefix=/usr && make -j$(nproc) test && make -j$(nproc) install diff --git a/examples/Dockerfile-mupdf.demo b/examples/Dockerfile-mupdf.demo index e3c2da1c..afab29d9 100644 --- a/examples/Dockerfile-mupdf.demo +++ b/examples/Dockerfile-mupdf.demo @@ -4,7 +4,7 @@ RUN mkdir -p /polytracker/the_klondike WORKDIR /polytracker/the_klondike -ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get -y upgrade && apt-get install -y git pkg-config RUN git clone --recursive --branch 1.20.0 git://git.ghostscript.com/mupdf.git @@ -22,8 +22,8 @@ WORKDIR /polytracker/the_klondike/mupdf RUN git checkout d00de0e96a4a5ec90ffc30837d40cd624a6a89e0 # Instrument mutool RUN polytracker build make -j$((`nproc`+1)) HAVE_X11=no HAVE_GLUT=no prefix=/usr/local build=release install -RUN polytracker instrument-targets --taint --ftrace mutool +RUN polytracker instrument-targets --cflog mutool RUN mv mutool.instrumented mutool_track # Note, the /workdir directory is intended to be mounted at runtime -VOLUME ["/workdir"] -WORKDIR /workdir \ No newline at end of file +# VOLUME ["/workdir"] +# WORKDIR /workdir \ No newline at end of file diff --git a/examples/Dockerfile-nitro-nitf.demo b/examples/Dockerfile-nitro-nitf.demo index 3a84b0d0..d13b3181 100644 --- a/examples/Dockerfile-nitro-nitf.demo +++ b/examples/Dockerfile-nitro-nitf.demo @@ -18,9 +18,6 @@ RUN polytracker build cmake .. \ RUN polytracker build cmake --build . 
-j$((`nproc`+1)) --target show_nitf++ -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_track diff --git a/examples/Dockerfile-openjpeg.demo b/examples/Dockerfile-openjpeg.demo index daa7be32..ba2cd59b 100644 --- a/examples/Dockerfile-openjpeg.demo +++ b/examples/Dockerfile-openjpeg.demo @@ -20,5 +20,5 @@ RUN polytracker extract-bc bin/opj_decompress -o opj_decompress.bc RUN polytracker extract-bc bin/libopenjp2.a -o libopenjp2.a.bc RUN llvm-link -only-needed opj_decompress.bc libopenjp2.a.bc -o exec.bc RUN polytracker opt-bc exec.bc -o exec.bc -RUN polytracker instrument-bc --taint --ftrace exec.bc -o exec.bc -o exec.instrumented.bc +RUN polytracker instrument-bc --cflog exec.bc -o exec.bc -o exec.instrumented.bc RUN polytracker lower-bc exec.instrumented.bc -t opj_decompress -o opj_decompress_track diff --git a/examples/Dockerfile-pdfium.demo b/examples/Dockerfile-pdfium.demo index 6d536b52..77942a3a 100644 --- a/examples/Dockerfile-pdfium.demo +++ b/examples/Dockerfile-pdfium.demo @@ -9,6 +9,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata \ RUN DEBIAN_FRONTEND=noninteractive apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + curl \ git \ pkg-config \ sudo \ diff --git a/examples/Dockerfile-poppler.demo b/examples/Dockerfile-poppler.demo index fd58f962..530d2fbb 100644 --- a/examples/Dockerfile-poppler.demo +++ b/examples/Dockerfile-poppler.demo @@ -3,7 +3,7 @@ FROM ubuntu:focal AS poppler-sources WORKDIR /polytracker/the_klondike ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y git -RUN git clone --depth=1 --branch poppler-23.06.0 https://anongit.freedesktop.org/git/poppler/poppler.git +RUN git clone --depth=1 --branch poppler-23.06.0 https://gitlab.freedesktop.org/poppler/poppler.git # Now, build the qpdf image using previously downloaded source FROM 
trailofbits/polytracker:latest @@ -21,6 +21,7 @@ RUN polytracker build cmake -S . -B build \ -DBUILD_SHARED_LIBS=OFF \ -DBUILD_GTK_TESTS=OFF \ -DBUILD_QT5_TESTS=OFF \ + -DBUILD_QT6_TESTS=OFF \ -DBUILD_CPP_TESTS=OFF \ -DENABLE_BOOST=OFF \ -DENABLE_CPP=OFF \ @@ -44,7 +45,7 @@ RUN polytracker build cmake -S . -B build \ RUN polytracker build cmake --build build -j$(nproc) # pdftotext (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --ftrace pdftotext --ignore-lists freetype fontconfig +RUN polytracker instrument-targets pdftotext --ignore-lists freetype fontconfig # pdftops (separate for measurement purposes only) -RUN polytracker instrument-targets --taint --ftrace pdftops --ignore-lists freetype fontconfig \ No newline at end of file +RUN polytracker instrument-targets pdftops --ignore-lists freetype fontconfig \ No newline at end of file diff --git a/examples/Dockerfile-qpdf.demo b/examples/Dockerfile-qpdf.demo index 7a98a612..de10c39f 100644 --- a/examples/Dockerfile-qpdf.demo +++ b/examples/Dockerfile-qpdf.demo @@ -2,7 +2,7 @@ FROM ubuntu:focal AS qpdf-sources WORKDIR /polytracker/the_klondike ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y git wget +RUN apt-get update && apt-get install -y git wget RUN git clone --depth=1 --branch 11.5 https://github.com/qpdf/qpdf.git RUN wget https://www.ijg.org/files/jpegsrc.v9e.tar.gz && tar xf jpegsrc.v9e.tar.gz @@ -23,5 +23,5 @@ WORKDIR /polytracker/the_klondike/qpdf RUN polytracker build cmake -S . 
-B build -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DBUILD_STATIC_LIBS=ON #Instrument and build track target RUN polytracker build cmake --build build -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace qpdf --ignore-lists libz +RUN polytracker instrument-targets qpdf --ignore-lists libz RUN mv qpdf.instrumented qpdf_track diff --git a/examples/Dockerfile-xpdf.demo b/examples/Dockerfile-xpdf.demo index b6b440a4..e80fb571 100644 --- a/examples/Dockerfile-xpdf.demo +++ b/examples/Dockerfile-xpdf.demo @@ -91,8 +91,8 @@ RUN polytracker build make -j$(nproc) install # pdftops.instrumented, pdftotext.instrumented, and pdfinfo.instrumented # These commands are split up for timing / debugging purposes but you could # run them all as one big instrument-targets as well. -RUN polytracker instrument-targets --taint --ftrace pdftotext --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --cflog pdftotext --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --ftrace pdfinfo --ignore-lists freetype fontconfig xml2 libz +RUN polytracker instrument-targets --cflog pdfinfo --ignore-lists freetype fontconfig xml2 libz -RUN polytracker instrument-targets --taint --ftrace pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file +RUN polytracker instrument-targets --cflog pdftops --ignore-lists freetype fontconfig xml2 libz \ No newline at end of file diff --git a/examples/analysis/timings.md b/examples/analysis/timings.md new file mode 100644 index 00000000..bd69b16d --- /dev/null +++ b/examples/analysis/timings.md @@ -0,0 +1,117 @@ +# What is this change doing? 
+ +My goal is for taint tracking to work exactly as before, but to clean up the ftrace/cflog/events side of the house, unifying `--cflog` and `--ftrace` options (cleaning up / simplifying how we are writing to the Functions, Events, Control Flow Log, and String Table sections overall) so we don't add duplicate instrumentation to software or write duplicate data to the TDAG and/or separate files (i.e., functionid.json) anymore. + +Everything that I could build got run on example inputs to make sure it worked as expected. As a part of these changes we don't write to functionid.json anymore and just use the space we were allocating and not filling in in the tdag, since it's a humongous region we don't use all of anyway. TDAG size is fixed, but our usage of it is slightly more efficient currently. A future goal could be to only mmap the space we need so file size can be smaller. + +## What did this change break? + +Hopefully nothing? :D + +## Instrumentation Time and Resulting Bitcode Sizes + +These experiments reproduce the measurements from the +[PolyTracker paper](https://github.com/trailofbits/publications/blob/master/papers/issta24-polytracker.pdf), +but on different hardware. For uniformity, experiments were all conducted in an Ubuntu 24.04 cloud VM with + +- 500 GB disk +- 64 GiB RAM +- 8 vCPUs + +I'm comparing the before-and-after of the TDAG condensation changes on `kaoudis/merge-function-sections` with `master` at `e618c4d6d7481326d0ea76073d663d2b867e0e9d`, the hash of the work included in the camera ready version of the prior paper. The question I'm answering here is "what is the net result of these changes in terms of how the software works". + +All the current example Dockerfiles on `master` that work right now (we/I need to clean up the others a bit; they're a bit bitrotted) are included here for completeness. 
The following measurements aren't terribly scientific, they are from one run of the Dockerfile each (whereas for the paper I averaged ten runs apiece). + +### Bitcode sizes + +The "in" .bc file is the whole-program .bc file that gets the first layer of instrumentation applied to it. The CFlog .bc is the "in" .bc with CFlog instrumentation, pre-optimization (if optimization occurs in the PolyTracker build). The final .bc file is the instrumented .bc file ending in `.instrumented.bc` that we lower to an executable. Bitcode size may have changed because the instrumentation we apply changed: I removed the separate function name recording / events pass-level code, and added function name recording to the tdag into the cflog pass. I also removed the separate `--ftrace` and `--taint` options: we do `--taint` by default, and `--ftrace` is part of `--cflog` now. + +Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive bc size for them on `master`. 
+ +As measured by `ls -lb` in the container, and normalized into MiB: + +| Dockerfile | In .bc size | Final .bc BEFORE (taint, ftrace, events) | Final .bc BEFORE (cflog, taint, ftrace, events) | CFlog-_only_ .bc | Final .bc AFTER (cflog, taint) | Final .bc AFTER (taint only) | +| ----------------------------------- | ----------- | ---------------------------------------- | ----------------------------------------------- | ---------------- | ------------------------------ | ---------------------------- | +| Dockerfile-acropalypse.demo | 1.65 MiB | 1.89 MiB | | 1.89 MiB | 4.4 MiB | 3.94 MiB | +| Dockerfile-daedalus-pdf.demo | 4.15 MiB | 4.76 MiB | 17.83 MiB | 4.95 MiB | 17.62 MiB | 16.39 MiB | +| Dockerfile-ffmpeg.demo | 30.52 MiB | 33.80 MiB | | 33.64 MiB | 84.3 MiB | 84.72 MiB | +| Dockerfile-file.demo | 0.85 MiB | 0.95 MiB | | 0.96 MiB | 1.98 MiB | 1.99 MiB | +| Dockerfile-libjpeg.demo | 1.25 MiB | 1.36 MiB | | 1.36 MiB | 3.33 MiB | 3.62 MiB | +| Dockerfile-mupdf.demo | 14.56 MiB | 18.19 MiB | | 18.19 MiB | 66 MiB | 82.72 MiB | +| Dockerfile-nitro-nitf.demo | 5.79 MiB | 8.23 MiB | 20.64 MiB | 6.57 MiB | 20.62 MiB | 18 MiB | +| Dockerfile-openjpeg.demo | 0.89 MiB | 1.15 MiB | | 1.13 MiB | 4.29 MiB | 3.71 MiB | +| Dockerfile-poppler.demo `pdftops` | 8.82 MiB | 10.25 MiB | 35.58 MiB | 10.17 MiB | 35.77 MiB | 35.99 MiB | +| Dockerfile-poppler.demo `pdftotext` | 8.04 MiB | 9.29 MiB | 31.82 MiB | 9.26 MiB | 32.01 MiB | 32.09 MiB | +| Dockerfile-qpdf.demo | 10.92 MiB | 13.14 MiB | | 13.14 MiB | 49.21 MiB | 47.65 MiB | +| Dockerfile-xpdf.demo `pdfinfo` | 3.78 MiB | 4.56 MiB | 17.14 MiB | 4.37 MiB | 16.88 MiB | 17.80 MiB | +| Dockerfile-xpdf.demo `pdftops` | 4.75 MiB | 5.78 MiB | 22.52 MiB | 5.55 MiB | 22.25 MiB | 23.85 MiB | +| Dockerfile-xpdf.demo `pdftotext` | 3.98 MiB | 4.85 MiB | 18.67 MiB | 4.64 MiB | 18.41 MiB | 19.37 MiB | + +### TDAG sizes + +TDAG size is fixed because of how we write TDAGs right now; it didn't change. 
+ +### Total instrumentation time + +"Instrumentation time" here refers either to the time Docker takes to run `polytracker instrument-targets`, which includes how long it takes to do both cflog and taint label instrumentation placement as well as executable creation, or the time to do equivalent steps. + +Also note that some dockerfiles did not compile on the `master` branch prior to these changes with the `--cflog` option and I'm not sure why, but because of this I did not record cflog-inclusive instrumentation time for them on `master`. + +As measured by Docker: + +| Dockerfile | Instrumentation time (taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint, ftrace, events) BEFORE | Instrumentation time (cflog, taint) AFTER | Instrumentation time (taint only) AFTER | +| ----------------------------------- | --------------------------------------------------- | ---------------------------------------------------------- | ----------------------------------------- | --------------------------------------- | +| Dockerfile-acropalypse.demo | 26.7\* s | | 30.3\* s | 27.3\* s | +| Dockerfile-daedalus-pdf.demo | 34.2 s | 39.1 s | 37.5 s | 35.2 s | +| Dockerfile-ffmpeg.demo | 150.7 s | | 156.5 s | 158.3 s | +| Dockerfile-file.demo | 12.1 s | | 12.4 s | 12.6 s | +| Dockerfile-libjpeg.demo | 22.7 s | | 21.2 s | 23.6 s | +| Dockerfile-mupdf.demo | 152.4 s | | 129.2 s | 154.8 s | +| Dockerfile-nitro-nitf.demo | 30 s | 33.7 s | 33.8 s | 29.5 s | +| Dockerfile-openjpeg.demo | 45.3\* s | | 51.3\* s | 49.6\* s | +| Dockerfile-poppler.demo `pdftops` | 291.2 s | 279.1 s | 290 s | 305.9 s | +| Dockerfile-poppler.demo `pdftotext` | 255.5 s | 249 s | 255.3 s | 268.5 s | +| Dockerfile-qpdf.demo | 382.9 s | | 393.8 s | 391.9 s | +| Dockerfile-xpdf.demo `pdfinfo` | 154.5 s | 141.9 s | 143.3 s | 164.2 s | +| Dockerfile-xpdf.demo `pdftops` | 206.9 s | 189.9 s | 187.2 s | 217.2 s | +| Dockerfile-xpdf.demo `pdftotext` | 169.1 s | 157.1 s | 154.4 s | 184.3 s | + +## What's weird 
here + +The sizes of bitcode when instrumented with all our passes before AND after these changes seem like they could be indicative of extra instrumentation (perhaps the labels pass instrumenting the cflog and/or functions pass?), though I haven't dug into whether this is truly happening yet. It doesn't _seem like_ this is exactly hurting anything at the moment, but I would be curious if others notice the same. + +## Notes + +### \* + +I combined the times recorded by Docker for extraction, linking, instrumentation, optimization (if included), and lowering to get this figure since that's everything `instrument-targets` would do. + +NB Dockerfile-acropalypse.demo does not run the typical bitcode optimization step as part of instrumentation and lowering. + +### N/As + +The following Dockerfiles did not build on master or on the new branch. Here's minimal notes on why. These should be investigated later if we care to keep them up to date. + +#### DaeDaLus NITF + +DaeDaLus NITF parser fails on the Cabal build of DaeDaLus, and also did at the time I did the prior eval work for the paper. I think this is because the DaeDaLus repository main branch is broken, and we need to pin a prior working commit in that Dockerfile. I don't know what this commit would be - the one mentioned in the Dockerfile doesn't build, either. + +#### jq + +Linking failed for build defined in Dockerfile. Also to investigate later; was not included in paper. + +#### libgen + +`go get` is no longer a supported command outside a module, and the Go setup in this Dockerfile would need to be updated. + +#### listgen + +After solving a couple minor errors due to zlib URL changing etc, building the libxml2-2.9.10 codebase failed with Python macro errors. + +#### pdfium + +pdfium's build halted and prompted me for the country of my keyboard. This will need to be fixed so the build is completely automated. 
I think I recall doing some work here that I didn't save to make the build more automated - I think a particular commit might need to be pinned in the source repo. + +#### png + +The png Dockerfile seems unfinished and doesn't instrument anything. I have a different version in a volume saved from a different cloud provider that I can pull out and use later. diff --git a/examples/analysis/ubet/Dockerfile.nitro b/examples/analysis/ubet/Dockerfile.nitro index 3549b341..761346cb 100644 --- a/examples/analysis/ubet/Dockerfile.nitro +++ b/examples/analysis/ubet/Dockerfile.nitro @@ -30,11 +30,7 @@ RUN polytracker build cmake --build . -j$((`nproc`+1)) --target show_nitf++ --co RUN cp modules/c++/nitf/show_nitf++ nitro_Release -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - --cflog \ - show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackRelease @@ -49,11 +45,7 @@ RUN polytracker build cmake ../.. \ RUN polytracker build cmake --build . 
-j$((`nproc`+1)) --clean-first --target show_nitf++ --config Debug RUN cp modules/c++/nitf/show_nitf++ nitro_Debug -RUN polytracker instrument-targets \ - --taint \ - --ftrace \ - --cflog \ - show_nitf++ +RUN polytracker instrument-targets --cflog show_nitf++ RUN mv show_nitf++.instrumented nitro_trackDebug diff --git a/examples/analysis/ubet/build_nitro.sh b/examples/analysis/ubet/build_nitro.sh index d9a13a2a..6e88735d 100755 --- a/examples/analysis/ubet/build_nitro.sh +++ b/examples/analysis/ubet/build_nitro.sh @@ -14,7 +14,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP echo "Optmize bitcode" polytracker opt-bc --output O3.bc after_preoptO3.bc echo "Instrument optimized bitcode" -polytracker instrument-bc --ftrace --taint --output instrumentedO3.bc O3.bc +polytracker instrument-bc --cflog --output instrumentedO3.bc O3.bc echo "Lower optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackRelease instrumentedO3.bc @@ -36,7 +36,7 @@ opt -load "${COMPILER_DIR}/pass/libPolytrackerPass.so" -load-pass-plugin "${COMP cp after_preoptO0.bc O0.bc echo "Instrument non-optimized bitcode" -polytracker instrument-bc --ftrace --taint --output instrumentedO0.bc O0.bc +polytracker instrument-bc --cflog --output instrumentedO0.bc O0.bc echo "Lower non-optimized bitcode" polytracker lower-bc -t show_nitf++ -o nitro_trackDebug instrumentedO0.bc diff --git a/examples/analysis/ubet/eval.py b/examples/analysis/ubet/eval.py index 36344f34..c8ef1328 100644 --- a/examples/analysis/ubet/eval.py +++ b/examples/analysis/ubet/eval.py @@ -1,14 +1,13 @@ # /usr/bin/python import os import random -import sys import subprocess -from typing import List, Tuple +import sys from pathlib import Path +from typing import List, Tuple from polytracker import PolyTrackerTrace - src_arg = Path(sys.argv[1]) no_build = "nobuild" == sys.argv[2] if len(sys.argv) > 2 else False src_dir = src_arg.parent @@ -37,7 +36,7 @@ def polytracker_build(cmdline): def 
polytracker_instrument(bin): - command = ["/usr/bin/env", "polytracker", "instrument-targets", "--taint", bin] + command = ["/usr/bin/env", "polytracker", "instrument-targets", "--cflog", bin] target_name = f"{bin}.instrumented" if not no_build: subprocess.call(command, cwd=src_dir) diff --git a/examples/analysis/ubet/eval_nitro.py b/examples/analysis/ubet/eval_nitro.py index c188aabb..8652e745 100644 --- a/examples/analysis/ubet/eval_nitro.py +++ b/examples/analysis/ubet/eval_nitro.py @@ -1,19 +1,20 @@ import argparse -from collections import defaultdict -import subprocess import os +import subprocess # nosec B404 import sys -from typing import Optional, Set, Iterator, Tuple, Dict -from polytracker import PolyTrackerTrace, taint_dag -from polytracker.taint_dag import TDFile, TDNode, TDSourceNode, TDUnionNode, TDRangeNode -from polytracker.mapping import InputOutputMapping +from collections import defaultdict +from functools import partialmethod from pathlib import Path +from typing import Dict, Iterator, Optional, Set, Tuple + +import cxxfilt # To Silence TQDM! 
from tqdm import tqdm -from functools import partialmethod -import cxxfilt +from polytracker import PolyTrackerTrace, taint_dag +from polytracker.mapping import InputOutputMapping +from polytracker.taint_dag import TDFile, TDNode, TDRangeNode, TDSourceNode, TDUnionNode tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) @@ -210,7 +211,6 @@ def run_instrumented(is_debug: bool, inputfile: Path, targetdir: Path): e = { "POLYDB": str(db), "POLYTRACKER_STDOUT_SINK": "1", - "POLYTRACKER_LOG_CONTROL_FLOW": "1", } ret = subprocess.run(args, env=e, stdout=subprocess.PIPE, stderr=subprocess.PIPE) os.rename(db, targetdir / db) @@ -275,7 +275,7 @@ def get_cflog_entires(tdfile, is_debug): map( lambda e: (input_offsets(e.label, tdfile), e.callstack), filter( - lambda e: isinstance(e, taint_dag.TDTaintedControlFlowEvent), cflog + lambda e: isinstance(e, taint_dag.TaintedControlFlowEvent), cflog ), ) ) @@ -312,9 +312,11 @@ def get_cflog_entires(tdfile, is_debug): print_cols( str(dbg_entry[0]), str(rel_entry[0]), - f" !!! DBG: {dbg_callstack} != REL: {rel_callstack}" - if dbg_callstack != rel_callstack - else "", + ( + f" !!! 
DBG: {dbg_callstack} != REL: {rel_callstack}" + if dbg_callstack != rel_callstack + else "" + ), ) dbgidx += 1 relidx += 1 diff --git a/examples/http/httpd/Dockerfile b/examples/http/httpd/Dockerfile index 8c250356..46385bc6 100644 --- a/examples/http/httpd/Dockerfile +++ b/examples/http/httpd/Dockerfile @@ -11,7 +11,7 @@ WORKDIR /polytracker/examples/http/httpd/httpd RUN mkdir -p srclib/apr srclib/apr-util srclib/pcre srclib/expat RUN curl https://archive.apache.org/dist/apr/apr-1.7.0.tar.gz -o apr-1.7.0.tar.gz \ && tar xfz apr-1.7.0.tar.gz -C srclib/apr --strip-components 1 \ - && rm apr-1.7.0.tar.gz + && rm apr-1.7.0.tar.gz RUN curl https://archive.apache.org/dist/apr/apr-util-1.6.1.tar.gz -o apr-util-1.6.1.tar.gz \ && tar xfz apr-util-1.6.1.tar.gz -C srclib/apr-util --strip-components 1 \ && rm apr-util-1.6.1.tar.gz @@ -33,19 +33,19 @@ RUN polytracker build make # apr, apr-util are configured via httpd's configure script WORKDIR /polytracker/examples/http/httpd/httpd -RUN polytracker build ./buildconf +RUN polytracker build ./buildconf RUN CFLAGS="-I$(pwd)/srclib/pcre -I$(pwd)/srclib/expat/lib" \ LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ polytracker build ./configure --disable-shared --with-mpm=prefork --with-pcre=srclib/pcre/pcre-config --with-included-apr \ --enable-mods-static='authz_core unixd' RUN CFLAGS="-I$(pwd)/srclib/pcre -I$(pwd)/srclib/expat/lib" \ - LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ + LDFLAGS="-L$(pwd)/srclib/pcre/.libs -L$(pwd)/srclib/expat/lib/.libs" \ polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace httpd +RUN polytracker instrument-targets --cflog httpd RUN mv httpd.instrumented httpd_track -# overwrite binary to be installed with our polytracker-instrumented version -RUN cp httpd_track httpd +# overwrite binary to be installed with our polytracker-instrumented version +RUN cp httpd_track httpd RUN polytracker build make install COPY 
harness_httpd.sh /polytracker/examples/http/httpd/ diff --git a/examples/http/picohttpparser/Dockerfile b/examples/http/picohttpparser/Dockerfile index 89cb0fe0..9e4624c9 100644 --- a/examples/http/picohttpparser/Dockerfile +++ b/examples/http/picohttpparser/Dockerfile @@ -10,7 +10,7 @@ COPY Makefile example_picohttpparser.c /polytracker/examples/http/picohttpparser # Build and instrument RUN polytracker build make -j$((`nproc`+1)) -RUN polytracker instrument-targets --taint --ftrace example_picohttpparser +RUN polytracker instrument-targets --cflog example_picohttpparser RUN mv example_picohttpparser.instrumented example_picohttpparser_track # Note, the /workdir and /testcase directories are intended to be mounted at runtime diff --git a/polytracker/build.py b/polytracker/build.py index eeded59c..af5b2771 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -1,9 +1,9 @@ import argparse -import subprocess -import os import json +import os +import subprocess # nosec B404 from pathlib import Path -from typing import List, Dict, Tuple +from typing import Dict, List, Tuple from .plugins import Command @@ -147,10 +147,16 @@ def _optimize_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: subprocess.check_call(cmd) -def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: +def _preopt_instrument_bitcode( + input_bitcode: Path, output_bitcode: Path, ignore_lists: List[str] +) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" ) + POLY_ABI_LIST_PATH: Path = _ensure_path_exists( + _compiler_dir_path() / "abi_lists" / "polytracker_abilist.txt" + ) + ABI_PATH: Path = _ensure_path_exists(_compiler_dir_path() / "abi_lists") cmd = [ "opt", @@ -163,16 +169,19 @@ def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> Non "-o", str(output_bitcode), ] + + if ignore_lists and len(ignore_lists) > 0: + # ignore lists for `pt-tcf` (function tracing for control 
flow logging) + cmd.append(f"-pt-ftrace-ignore-list={POLY_ABI_LIST_PATH}") + for item in ignore_lists: + cmd.append(f"-pt-ftrace-ignore-list={ABI_PATH}/{item}") + # execute `cmd` subprocess.check_call(cmd) def _instrument_bitcode( - input_bitcode: Path, - output_bitcode: Path, - ignore_lists: List[str], - add_taint_tracking: bool, - add_function_tracing: bool, + input_bitcode: Path, output_bitcode: Path, ignore_lists: List[str] ) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" @@ -193,35 +202,19 @@ def _instrument_bitcode( str(POLY_PASS_PATH), ] - pass_pipeline: List[str] = [] - if add_taint_tracking: - pass_pipeline.append("pt-taint") - - if add_function_tracing: - pass_pipeline.append("pt-ftrace") - - if add_taint_tracking: - pass_pipeline += ["pt-dfsan", "pt-rm-fn-attr"] - + pass_pipeline: List[str] = ["pt-taint", "pt-dfsan", "pt-rm-fn-attr"] cmd.append(f"-passes={','.join(pass_pipeline)}") - if add_taint_tracking: - # ignore lists for `pt-taint` - cmd.append( - f"-pt-taint-ignore-list={POLY_ABI_LIST_PATH}", - ) - for item in ignore_lists: - cmd.append(f"-pt-taint-ignore-list={ABI_PATH}/{item}") - # abi lists for `dfsan` - cmd.append(f"-pt-dfsan-abilist={DFSAN_ABI_LIST_PATH}") - for item in ignore_lists: - cmd.append(f"-pt-dfsan-abilist={ABI_PATH}/{item}") - - if add_function_tracing: - # ignore lists for `pt-ftrace` - cmd.append(f"-pt-ftrace-ignore-list={POLY_ABI_LIST_PATH}") - for item in ignore_lists: - cmd.append(f"-pt-ftrace-ignore-list={ABI_PATH}/{item}") + # ignore lists for `pt-taint` + cmd.append( + f"-pt-taint-ignore-list={POLY_ABI_LIST_PATH}", + ) + for item in ignore_lists: + cmd.append(f"-pt-taint-ignore-list={ABI_PATH}/{item}") + # abi lists for `dfsan` + cmd.append(f"-pt-dfsan-abilist={DFSAN_ABI_LIST_PATH}") + for item in ignore_lists: + cmd.append(f"-pt-dfsan-abilist={ABI_PATH}/{item}") # input and output files cmd += [str(input_bitcode), "-o", str(output_bitcode)] @@ -316,18 +309,6 @@ 
def __init_arguments__(self, parser: argparse.ArgumentParser): help="output bitcode file", ) - parser.add_argument( - "--taint", - action="store_true", - help="instrument with taint tracking", - ) - - parser.add_argument( - "--ftrace", - action="store_true", - help="instrument with function tracing", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -335,15 +316,32 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="specify additional ignore lists to polytracker", ) - def run(self, args: argparse.Namespace): - _instrument_bitcode( - args.input, - args.output, - args.ignore_lists, - args.taint, - args.ftrace, + parser.add_argument( + "--cflog", + action="store_true", + help="also instrument with function tracing and control affecting dataflow logging IN ADDITION TO the default dynamic taint analysis instrumentation passes", ) + def run(self, args: argparse.Namespace): + if args.cflog: + cflog_output = Path(f"{args.output.stem}.cflog_instrumented.bc") + _preopt_instrument_bitcode( + input_bitcode=args.input, + output_bitcode=cflog_output, + ignore_lists=args.ignore_lists, + ) + _instrument_bitcode( + input_bitcode=cflog_output, + output_bitcode=args.output, + ignore_lists=args.ignore_lists, + ) + else: + _instrument_bitcode( + input_bitcode=args.input, + output_bitcode=args.output, + ignore_lists=args.ignore_lists, + ) + class LowerBitcode(Command): name = "lower-bc" @@ -382,7 +380,7 @@ def run(self, args: argparse.Namespace): class InstrumentTargets(Command): name = "instrument-targets" - help = "instruments blight journal build targets with polytracker" + help = "instruments blight journal build targets with polytracker for dynamic taint analysis" def __init_arguments__(self, parser: argparse.ArgumentParser): parser.add_argument( @@ -399,18 +397,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="path to blight journal", ) - parser.add_argument( - "--taint", - action="store_true", - help="instrument with taint 
tracking", - ) - - parser.add_argument( - "--ftrace", - action="store_true", - help="instrument with function tracing", - ) - parser.add_argument( "--ignore-lists", nargs="+", @@ -421,7 +407,7 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): parser.add_argument( "--cflog", action="store_true", - help="instrument with control affecting dataflow logging", + help="also instrument with function tracing and control affecting dataflow logging IN ADDITION TO the default dynamic taint analysis instrumentation passes", ) def run(self, args: argparse.Namespace): @@ -432,16 +418,23 @@ def run(self, args: argparse.Namespace): opt_bc = bc_path.with_suffix(".opt.bc") _extract_bitcode(target_path, bc_path) if args.cflog: - # Control affecting data flow logging happens before optimization - _preopt_instrument_bitcode(bc_path, bc_path) + # Control affecting data flow logging instrumentation happens + # before optimization + cflog_bc_path = Path(f"{bc_path.stem}.cflog_instrumented.bc") + _preopt_instrument_bitcode( + input_bitcode=bc_path, + output_bitcode=cflog_bc_path, + ignore_lists=args.ignore_lists, + ) + + _optimize_bitcode(input_bitcode=cflog_bc_path, output_bitcode=opt_bc) + else: + _optimize_bitcode(input_bitcode=bc_path, output_bitcode=opt_bc) - _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") _instrument_bitcode( - opt_bc, - inst_bc_path, - args.ignore_lists, - args.taint, - args.ftrace, + input_bitcode=opt_bc, + output_bitcode=inst_bc_path, + ignore_lists=args.ignore_lists, ) _lower_bitcode(inst_bc_path, Path(inst_bc_path.stem), target_cmd) diff --git a/polytracker/include/polytracker/passes/taint_tracking.h b/polytracker/include/polytracker/passes/taint_tracking.h index 6ff676d6..e2f156cc 100644 --- a/polytracker/include/polytracker/passes/taint_tracking.h +++ b/polytracker/include/polytracker/passes/taint_tracking.h @@ -15,7 +15,7 @@ namespace polytracker { class TaintTrackingPass : public llvm::PassInfoMixin, 
public llvm::InstVisitor { - // + // represents the taint label type llvm::IntegerType *label_ty{nullptr}; // Taint tracking startup llvm::FunctionCallee taint_start_fn; @@ -32,6 +32,9 @@ class TaintTrackingPass : public llvm::PassInfoMixin, void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); void visitSwitchInst(llvm::SwitchInst &si); + void visitSelectInst(llvm::SelectInst &si); + void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + void visitInvokeInst(llvm::InvokeInst &ii); }; } // namespace polytracker \ No newline at end of file diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index b9d22f6a..996a4810 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -13,26 +13,21 @@ #include namespace polytracker { -namespace detail { -struct FunctionMappingJSONWriter; -} class TaintedControlFlowPass : public llvm::PassInfoMixin, public llvm::InstVisitor { - // + // represents the taint label type llvm::IntegerType *label_ty{nullptr}; - // Taint tracking startup - llvm::FunctionCallee taint_start_fn; // Log taint label affecting control flow llvm::FunctionCallee cond_br_log_fn; // Log enter/leave functions llvm::FunctionCallee fn_enter_log_fn; + llvm::FunctionType *enter_log_fn_type; llvm::FunctionCallee fn_leave_log_fn; // Helpers - void insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val); - void insertTaintStartupCall(llvm::Module &mod); + void insertInstrumentation(llvm::Instruction &inst, llvm::Value *val); void declareLoggingFunctions(llvm::Module &mod); llvm::ConstantInt *get_function_id_const(llvm::Function &f); @@ -41,26 +36,20 @@ class TaintedControlFlowPass public: using function_id = uint32_t; - TaintedControlFlowPass(); - TaintedControlFlowPass(TaintedControlFlowPass &&); - ~TaintedControlFlowPass(); - 
llvm::PreservedAnalyses run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam); - void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); + // void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); void visitBranchInst(llvm::BranchInst &bi); - void visitSwitchInst(llvm::SwitchInst &si); - void visitSelectInst(llvm::SelectInst &si); + // void visitSwitchInst(llvm::SwitchInst &si); + // void visitSelectInst(llvm::SelectInst &si); + // void visitIndirectBrInst(llvm::IndirectBrInst &ibi); + // void visitInvokeInst(llvm::InvokeInst &ii); void instrumentFunctionEnter(llvm::Function &func); void visitReturnInst(llvm::ReturnInst &ri); - function_id function_mapping(llvm::Function &func); - std::unordered_map function_ids_; function_id function_counter_{0}; - - std::unique_ptr function_mapping_writer_; }; } // namespace polytracker \ No newline at end of file diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h index 1bcff380..95884353 100644 --- a/polytracker/include/taintdag/control_flow_log.h +++ b/polytracker/include/taintdag/control_flow_log.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2022-present, Trail of Bits, Inc. * All rights reserved. @@ -9,6 +8,7 @@ #pragma once +#include "taintdag/control_flow_log_encoding.h" #include "taintdag/outputfile.h" #include "taintdag/section.h" #include "taintdag/taint.h" @@ -16,22 +16,6 @@ namespace taintdag { -namespace detail { -// A uint32_t varint encoded by setting highest bit for all but the final byte. -// Requires up to 5 bytes of storage as each output byte uses 7 input bits. -// Total maximum need is floor(32/7) = 5. Returns number of bytes required. 
-size_t varint_encode(uint32_t val, uint8_t *buffer) { - auto orig_buffer = buffer; - while (val >= 0x80) { - *buffer++ = 0x80 | (val & 0x7f); - val >>= 7; - } - *buffer++ = val & 0x7f; - return buffer - orig_buffer; -} -// TODO (hbrodin): Should probably used std::span -} // namespace detail - struct ControlFlowLog : public SectionBase { enum EventType { EnterFunction = 0, @@ -49,7 +33,7 @@ struct ControlFlowLog : public SectionBase { void function_event(EventType evt, uint32_t function_id) { uint8_t buffer[6]; buffer[0] = static_cast(evt); - auto used = detail::varint_encode(function_id, &buffer[1]); + auto used = varint_encode(function_id, &buffer[1]); auto total = used + 1; if (auto wctx = write(total)) { @@ -71,9 +55,9 @@ struct ControlFlowLog : public SectionBase { // 1 byte event, <= 5 bytes function id, <= 5 bytes label uint8_t buffer[11]; buffer[0] = static_cast(TaintedControlFlow); - auto used = detail::varint_encode(function_id, &buffer[1]); + auto used = varint_encode(function_id, &buffer[1]); auto total = used + 1; - used = detail::varint_encode(label, &buffer[total]); + used = varint_encode(label, &buffer[total]); total += used; if (auto wctx = write(total)) { diff --git a/polytracker/include/taintdag/control_flow_log_encoding.h b/polytracker/include/taintdag/control_flow_log_encoding.h new file mode 100644 index 00000000..4339c1c4 --- /dev/null +++ b/polytracker/include/taintdag/control_flow_log_encoding.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +// Separate from control_flow_log.h to avoid duplicate symbol inclusion in +// testing +namespace taintdag { +// For inclusion in the control flow log, we use varint_encode to bit-pack +// each entry. 
Returns number of bytes required, which is also included in +// the section so that we know entry boundaries. +size_t varint_encode(uint32_t val, uint8_t *buffer); +} // namespace taintdag \ No newline at end of file diff --git a/polytracker/include/taintdag/fnmapping.h b/polytracker/include/taintdag/fnmapping.h index 1141be71..09d50d5e 100644 --- a/polytracker/include/taintdag/fnmapping.h +++ b/polytracker/include/taintdag/fnmapping.h @@ -21,14 +21,17 @@ namespace taintdag { struct Function { -public: using offset_t = StringTable::offset_t; offset_t name_offset; + uint32_t function_id; + + Function(offset_t name_ofs, uint32_t f_id) + : name_offset(name_ofs), function_id(f_id){}; }; class Functions : public FixedSizeAlloc { public: - using index_t = uint16_t; + using index_t = StringTable::offset_t; static constexpr uint8_t tag{6}; static constexpr size_t allocation_size{std::numeric_limits::max() * @@ -39,11 +42,13 @@ class Functions : public FixedSizeAlloc { : FixedSizeAlloc{of.range}, string_table{of.output_file.template section()} {} - std::optional add_mapping(std::string_view name); + std::optional add_mapping(uint32_t function_id, + std::string_view function_name); private: StringTable &string_table; std::mutex mappings_mutex; + // look up Function index in the Functions section by function name std::unordered_map mappings; }; diff --git a/polytracker/include/taintdag/fntrace.h b/polytracker/include/taintdag/fntrace.h deleted file mode 100644 index c150ab44..00000000 --- a/polytracker/include/taintdag/fntrace.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. 
- */ - -#pragma once - -#include "taintdag/fnmapping.h" - -#include - -namespace taintdag { - -struct Event { -public: - enum class kind_t : uint8_t { entry, exit }; - kind_t kind; - Functions::index_t function; -}; - -class Events : public FixedSizeAlloc { -public: - static constexpr uint8_t tag{7}; - static constexpr size_t allocation_size{std::numeric_limits::max() * - sizeof(Event)}; - - template Events(SectionArg of) : FixedSizeAlloc{of.range} {} - - void log_fn_event(Event::kind_t kind, Functions::index_t idx); -}; - -} // namespace taintdag \ No newline at end of file diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 751f6d17..7e10f69b 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -14,7 +14,6 @@ #include "taintdag/bitmap_section.h" #include "taintdag/control_flow_log.h" #include "taintdag/fnmapping.h" -#include "taintdag/fntrace.h" #include "taintdag/labels.h" #include "taintdag/sink.h" #include "taintdag/stream_offset.h" @@ -60,26 +59,20 @@ class PolyTracker { void log_tainted_control_flow(label_t taint_label, uint32_t function_id); // Instrumentation callback for when execution enters a function - // NOTE: There is a overlap in functionality between this and `function_entry` - // they will co-exist for now as they operate slightly different. The - // underlying reason is that this was developed separately to support the - // Tainted Control Flow logging mechanism. void enter_function(uint32_t function_id); // Instrumentation callback for when execution leaves a function - // NOTE: Se `enter_function` comment about overlap. 
void leave_function(uint32_t function_id); + // Log function name + void record_function_name(uint32_t function_id, + std::string_view function_name); + // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); // Same as before, but use same label for all data void taint_sink(int fd, util::Offset offset, label_t label, size_t length); - // Log function entry - Functions::index_t function_entry(std::string_view name); - // Log function exit - void function_exit(Functions::index_t index); - private: taint_range_t create_source_taint(source_index_t src, std::span dst, @@ -95,7 +88,7 @@ class PolyTracker { // sections and in which order they appear. using ConcreteOutputFile = OutputFile; + SourceLabelIndexSection, Functions, ControlFlowLog>; ConcreteOutputFile output_file_; // Tracking source offsets for streams (where offsets can be determined by diff --git a/polytracker/include/taintdag/section.h b/polytracker/include/taintdag/section.h index cc699007..d4682026 100644 --- a/polytracker/include/taintdag/section.h +++ b/polytracker/include/taintdag/section.h @@ -142,7 +142,8 @@ template struct FixedSizeAlloc : SectionBase { .t = *new (&*(write_context->mem.begin())) T{std::forward(args)...}}; } - // Failed to allocate memory + spdlog::error("Failed to allocate memory in the section for the object, so " + "could not construct the object in the tdag section"); return {}; } diff --git a/polytracker/include/taintdag/string_table.h b/polytracker/include/taintdag/string_table.h index 9146c427..5c25318b 100644 --- a/polytracker/include/taintdag/string_table.h +++ b/polytracker/include/taintdag/string_table.h @@ -17,12 +17,9 @@ #include "taintdag/util.h" namespace taintdag { - -template -struct StringTableBase : public SectionBase { - using offset_t = OffsetT; - using length_t = LengthT; +struct StringTable : public SectionBase { + using offset_t = uint32_t; + using length_t = uint16_t; 
static_assert(sizeof(length_t) <= sizeof(offset_t), "offset_t should be larger than or equal to length_t"); @@ -31,36 +28,43 @@ struct StringTableBase : public SectionBase { // Max string length is limited by either length-type or by maximum offset // that can be expressed. - static constexpr size_t max_string_len = + static constexpr size_t max_entry_size = std::min(static_cast(std::numeric_limits::max()), max_offset - sizeof(length_t)); - static constexpr uint8_t tag{Tag}; - static constexpr size_t allocation_size{AllocationSize}; + static constexpr uint8_t tag{3}; + static constexpr size_t allocation_size{0x100000}; static constexpr size_t align_of = alignof(length_t); template - StringTableBase(SectionArg of) : SectionBase{of.range} { - if (of.range.size() > max_offset) { - error_exit("Tried to use an allocation of size ", of.range.size(), - " max in current offset_t config is ", max_offset); - } - } - - // Appends the string `sv` to the string table. - // Returns the offset of the string entry. Note that this is not the - // string, but the offset to the size of it. Recover the string - // by using `from_offset`. + StringTable(SectionArg output_file) : SectionBase{output_file.range} {} + + // Adds the string `sv` to the string table. + // Returns the offset in bytes from the beginning of the section of the string + // entry. Note that this is not the string, but the offset to the size of it. + // Recover the string by using `from_offset`. If a string is bigger than the + // maximum size allowed for an entry it will be truncated. If the string table + // allocation is full, the string will not be stored and no offset will be + // returned. std::optional add_string(std::string_view sv) { - if (sv.size() > max_string_len) { - error_exit("Tried to store a string of size ", sv.size(), " max is ", - max_string_len); - // Doesn't return from here. 
+ if ((sv.size() + sizeof(length_t)) > max_entry_size) { + spdlog::info("Tried to store a string of size {0:d} but max is {1:d} " + "(will truncate string)", + sv.size(), max_entry_size); + + size_t to_truncate = max_entry_size - sizeof(length_t) - 1; + sv = sv.substr(0, to_truncate); + + if ((sv.size() + sizeof(length_t)) > max_entry_size) { + error_exit("Truncated string was too big: ", + sv.size() + sizeof(length_t)); + } } auto len = allocated_len(sv.size()); if (auto write_context = write(len)) { - // prefix with length + // todo(kaoudis) this is possibly a type confusion issue resulting in + // truncation since size_t is bigger than the current length_t *reinterpret_cast(&*(write_context->mem.begin())) = sv.size(); // copy string @@ -135,6 +139,4 @@ struct StringTableBase : public SectionBase { } }; -using StringTable = StringTableBase<>; - } // namespace taintdag diff --git a/polytracker/src/CMakeLists.txt b/polytracker/src/CMakeLists.txt index 66070107..38a1193c 100644 --- a/polytracker/src/CMakeLists.txt +++ b/polytracker/src/CMakeLists.txt @@ -39,11 +39,11 @@ set(TAINTDAG_DIR taintdag) set(TAINTDAG_SOURCES ${TAINTDAG_DIR}/encoding.cpp ${TAINTDAG_DIR}/error.cpp + ${TAINTDAG_DIR}/fnmapping.cpp ${TAINTDAG_DIR}/polytracker.cpp ${TAINTDAG_DIR}/print.cpp - ${TAINTDAG_DIR}/fnmapping.cpp - ${TAINTDAG_DIR}/fntrace.cpp - ${TAINTDAG_DIR}/util.cpp) + ${TAINTDAG_DIR}/util.cpp + ${TAINTDAG_DIR}/control_flow_log_encoding.cpp) add_library(Polytracker STATIC ${POLYTRACKER_SOURCES} ${TAINT_SOURCES} ${TAINTDAG_SOURCES}) diff --git a/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index d29438cf..9238b0f5 100644 --- a/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/polytracker/src/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -164,8 +164,12 @@ bool SupportsColoredOutput(fd_t fd) { } #if !SANITIZER_GO -// 
TODO(glider): different tools may require different altstack size. -static const uptr kAltStackSize = SIGSTKSZ * 4; // SIGSTKSZ is not enough. +static uptr GetAltStackSize() { + // Note: since GLIBC_2.31, SIGSTKSZ may be a function call, so this may be + // more costly that you think. However GetAltStackSize is only call 2-3 times + // per thread so don't cache the evaluation. + return SIGSTKSZ * 4; +} void SetAlternateSignalStack() { stack_t altstack, oldstack; @@ -176,10 +180,10 @@ void SetAlternateSignalStack() { // TODO(glider): the mapped stack should have the MAP_STACK flag in the // future. It is not required by man 2 sigaltstack now (they're using // malloc()). - void* base = MmapOrDie(kAltStackSize, __func__); + void* base = MmapOrDie(GetAltStackSize(), __func__); altstack.ss_sp = (char*) base; altstack.ss_flags = 0; - altstack.ss_size = kAltStackSize; + altstack.ss_size = GetAltStackSize(); CHECK_EQ(0, sigaltstack(&altstack, nullptr)); } @@ -187,7 +191,7 @@ void UnsetAlternateSignalStack() { stack_t altstack, oldstack; altstack.ss_sp = nullptr; altstack.ss_flags = SS_DISABLE; - altstack.ss_size = kAltStackSize; // Some sane value required on Darwin. + altstack.ss_size = GetAltStackSize(); // Some sane value required on Darwin. 
CHECK_EQ(0, sigaltstack(&altstack, &oldstack)); UnmapOrDie(oldstack.ss_sp, oldstack.ss_size); } diff --git a/polytracker/src/passes/CMakeLists.txt b/polytracker/src/passes/CMakeLists.txt index af6aaa9d..bd68ca52 100644 --- a/polytracker/src/passes/CMakeLists.txt +++ b/polytracker/src/passes/CMakeLists.txt @@ -6,7 +6,7 @@ endif(APPLE) add_library( PolytrackerPass SHARED - taint_tracking.cpp remove_fn_attr.cpp function_tracing.cpp tainted_control_flow.cpp + taint_tracking.cpp remove_fn_attr.cpp tainted_control_flow.cpp DataFlowSanitizer.cpp utils.cpp pass_plugin.cpp) target_link_libraries( diff --git a/polytracker/src/passes/function_tracing.cpp b/polytracker/src/passes/function_tracing.cpp deleted file mode 100644 index 9fb228e1..00000000 --- a/polytracker/src/passes/function_tracing.cpp +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. 
- */ - -#include "polytracker/passes/function_tracing.h" - -#include -#include - -#include "polytracker/passes/utils.h" -#include "taintdag/fnmapping.h" - -static llvm::cl::list ignore_lists( - "pt-ftrace-ignore-list", - llvm::cl::desc( - "File that specifies functions that pt-ftrace should ignore")); - -namespace polytracker { - -void FunctionTracingPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::IRBuilder<> ir(mod.getContext()); - auto fn_index_t{ir.getIntNTy(sizeof(taintdag::Functions::index_t) * 8)}; - func_entry_log_fn = - mod.getOrInsertFunction("__polytracker_log_func_entry", fn_index_t, - ir.getInt8PtrTy(), ir.getInt16Ty()); - func_exit_log_fn = mod.getOrInsertFunction("__polytracker_log_func_exit", - ir.getVoidTy(), fn_index_t); -} - -void FunctionTracingPass::visitReturnInst(llvm::ReturnInst &ri) { - llvm::IRBuilder<> ir(&ri); - ir.CreateCall(func_exit_log_fn, log_entry_calls[ri.getFunction()]); -} - -llvm::PreservedAnalyses -FunctionTracingPass::run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam) { - declareLoggingFunctions(mod); - auto ignore{readIgnoreLists(ignore_lists)}; - for (auto &fn : mod) { - auto fname{fn.getName()}; - if (fn.isDeclaration() || ignore.count(fname.str())) { - continue; - } - llvm::IRBuilder<> ir(&*fn.getEntryBlock().begin()); - auto fname_ptr{ir.CreateGlobalStringPtr(fname)}; - log_entry_calls[&fn] = ir.CreateCall( - func_entry_log_fn, {fname_ptr, ir.getInt16(fname.size())}); - visit(fn); - } - return llvm::PreservedAnalyses::none(); -} - -} // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/passes/pass_plugin.cpp b/polytracker/src/passes/pass_plugin.cpp index e8ad4a1e..5de824cf 100644 --- a/polytracker/src/passes/pass_plugin.cpp +++ b/polytracker/src/passes/pass_plugin.cpp @@ -10,7 +10,6 @@ #include #include "polytracker/passes/DataFlowSanitizer.h" -#include "polytracker/passes/function_tracing.h" #include "polytracker/passes/remove_fn_attr.h" #include 
"polytracker/passes/taint_tracking.h" #include "polytracker/passes/tainted_control_flow.h" @@ -33,10 +32,6 @@ llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { mpm.addPass(polytracker::RemoveFnAttrsPass()); return true; } - if (name == "pt-ftrace") { - mpm.addPass(polytracker::FunctionTracingPass()); - return true; - } if (name == "pt-tcf") { mpm.addPass(polytracker::TaintedControlFlowPass()); return true; diff --git a/polytracker/src/passes/taint_tracking.cpp b/polytracker/src/passes/taint_tracking.cpp index 266cfa71..565925a0 100644 --- a/polytracker/src/passes/taint_tracking.cpp +++ b/polytracker/src/passes/taint_tracking.cpp @@ -73,9 +73,11 @@ void TaintTrackingPass::insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val) { llvm::IRBuilder<> ir(&inst); auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + + if (llvm::isa(val->getType())) { + dummy_val = ir.CreateExtractElement(val, ir.getInt32(0)); } + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); } @@ -86,7 +88,7 @@ void TaintTrackingPass::insertTaintStartupCall(llvm::Module &mod) { void TaintTrackingPass::visitGetElementPtrInst(llvm::GetElementPtrInst &gep) { for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { + if (llvm::isa(idx)) { continue; } insertCondBrLogCall(gep, idx); @@ -104,6 +106,30 @@ void TaintTrackingPass::visitSwitchInst(llvm::SwitchInst &si) { insertCondBrLogCall(si, si.getCondition()); } +void TaintTrackingPass::visitSelectInst(llvm::SelectInst &si) { + auto cond = si.getCondition(); + if (llvm::isa(cond)) { + return; + } + insertCondBrLogCall(si, cond); +} + +void TaintTrackingPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { + auto addr = ibi.getAddress(); + if (llvm::isa(addr)) { + return; + } + insertCondBrLogCall(ibi, addr); +} + +void TaintTrackingPass::visitInvokeInst(llvm::InvokeInst &ii) { + auto called = ii.getCalledOperand(); + if (llvm::isa(called)) { + return; + 
} + insertCondBrLogCall(ii, called); +} + void TaintTrackingPass::declareLoggingFunctions(llvm::Module &mod) { llvm::IRBuilder<> ir(mod.getContext()); taint_start_fn = mod.getOrInsertFunction("__taint_start", ir.getVoidTy()); diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp index d8142794..71eb9b7a 100644 --- a/polytracker/src/passes/tainted_control_flow.cpp +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -20,65 +20,28 @@ #include -namespace polytracker { - -namespace detail { -// Helper type to produce the json file of function names by functionid -class FunctionMappingJSONWriter { -public: - FunctionMappingJSONWriter(std::string_view filename) - : file(filename.data(), std::ios::binary) { - file << "["; - } +static llvm::cl::list ignore_lists( + "pt-ftrace-ignore-list", + llvm::cl::desc("File that specifies functions that pt-tcf should ignore")); - ~FunctionMappingJSONWriter() { - // Back up and erase the last ",\n" - file.seekp(-2, std::ios::cur); - file << "\n]\n"; - } - - void append(std::string_view name) { - // Will cause an additional ',' but don't care about that right now... - // The destructor will back up two steps and replace the ',' with a newline - // and array termination. 
- file << "\"" << name << "\",\n"; - } - -private: - std::ofstream file; -}; -} // namespace detail +namespace polytracker { namespace { -uint32_t -get_or_add_mapping(uintptr_t key, std::unordered_map &m, - uint32_t &counter, std::string_view name, - polytracker::detail::FunctionMappingJSONWriter &js) { - if (auto it = m.find(key); it != m.end()) { +uint32_t get_or_add_mapping(uintptr_t key, + std::unordered_map &mapping, + uint32_t &counter) { + if (auto it = mapping.find(key); it != mapping.end()) { return it->second; } else { - js.append(name); - return m[key] = counter++; + return mapping[key] = counter++; } } - } // namespace -void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, - llvm::Value *val) { - llvm::IRBuilder<> ir(&inst); - auto dummy_val{val}; - if (inst.getType()->isVectorTy()) { - dummy_val = ir.CreateExtractElement(val, uint64_t(0)); - } - ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); -} llvm::ConstantInt * TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { auto func_address = reinterpret_cast(&func); - std::string_view name = func.getName(); - auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_, - name, *function_mapping_writer_); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_); return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); } @@ -87,65 +50,79 @@ TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { return get_function_id_const(*(i.getParent()->getParent())); } -void TaintedControlFlowPass::visitGetElementPtrInst( - llvm::GetElementPtrInst &gep) { - llvm::IRBuilder<> ir(&gep); - for (auto &idx : gep.indices()) { - if (llvm::isa(idx)) { - continue; +void +TaintedControlFlowPass::insertInstrumentation(llvm::Instruction &inst, llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + + if (llvm::isa(val->getType())) { + dummy_val = ir.CreateExtractElement(val, 
ir.getInt32(0)); + + if (llvm::isa(dummy_val->getType())) { + return; } + } - auto callret = ir.CreateCall(cond_br_log_fn, - {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), - get_function_id_const(gep)}); + auto label = ir.CreateSExtOrTrunc(dummy_val, label_ty); + auto function_id = get_function_id_const(inst); - idx = ir.CreateSExtOrTrunc(callret, idx->getType()); - } + // logs the label and the function id at this point; + // data flow has affected control flow here. + ir.CreateCall(cond_br_log_fn, {label, function_id}); } +// void TaintedControlFlowPass::visitGetElementPtrInst( +// llvm::GetElementPtrInst &gep) { +// // if an index is a constant, skip it +// for (auto &idx : gep.indices()) { +// if (llvm::isa(idx)) { +// continue; +// } +// insertInstrumentation(gep, idx); +// } +// } + void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { if (bi.isUnconditional()) { return; } - - llvm::IRBuilder<> ir(&bi); auto cond = bi.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); - - bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} - -void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); + insertInstrumentation(bi, cond); } -void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { - // TODO(hbrodin): Can't handle atm. 
- if (si.getType()->isVectorTy()) { - return; - } - llvm::IRBuilder<> ir(&si); - auto cond = si.getCondition(); - - auto callret = ir.CreateCall( - cond_br_log_fn, - {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); - - si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); -} +// void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { +// auto cond = si.getCondition(); +// insertInstrumentation(si, cond); +// } + +// void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { +// auto cond = si.getCondition(); +// if (llvm::isa(cond)) { +// return; +// } +// insertInstrumentation(si, cond); +// } + +// void TaintedControlFlowPass::visitIndirectBrInst(llvm::IndirectBrInst &ibi) { +// auto addr = ibi.getAddress(); +// if (llvm::isa(addr)) { +// return; +// } +// insertInstrumentation(ibi, addr); +// } + +// void TaintedControlFlowPass::visitInvokeInst(llvm::InvokeInst &ii) { +// auto called = ii.getCalledOperand(); +// if (llvm::isa(called)) { +// return; +// } +// insertInstrumentation(ii, called); +// } void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { - llvm::IRBuilder<> ir(mod.getContext()); + llvm::LLVMContext *context = &mod.getContext(); + llvm::IRBuilder<> ir(*context); + cond_br_log_fn = mod.getOrInsertFunction( "__polytracker_log_tainted_control_flow", llvm::AttributeList::get( @@ -153,10 +130,14 @@ void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { {{llvm::AttributeList::FunctionIndex, llvm::Attribute::get(mod.getContext(), llvm::Attribute::ReadNone)}}), - ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + ir.getVoidTy(), label_ty, ir.getInt32Ty()); + + enter_log_fn_type = llvm::FunctionType::get( + llvm::Type::getVoidTy(*context), llvm::Type::getInt32Ty(*context), + llvm::Type::getInt8PtrTy(*context)); fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", - ir.getVoidTy(), ir.getInt32Ty()); + enter_log_fn_type); fn_leave_log_fn = 
mod.getOrInsertFunction("__polytracker_leave_function", ir.getVoidTy(), ir.getInt32Ty()); @@ -167,7 +148,9 @@ void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { return; } llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); - ir.CreateCall(fn_enter_log_fn, get_function_id_const(func)); + + ir.CreateCall(fn_enter_log_fn, {get_function_id_const(func), + ir.CreateGlobalStringPtr(func.getName())}); } void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { @@ -180,19 +163,19 @@ TaintedControlFlowPass::run(llvm::Module &mod, llvm::ModuleAnalysisManager &mam) { label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); declareLoggingFunctions(mod); + auto fnsToIgnore{readIgnoreLists(ignore_lists)}; + for (auto &fn : mod) { - instrumentFunctionEnter(fn); - visit(fn); + auto fname{fn.getName()}; + if (fnsToIgnore.count(fname.str())) { + continue; + } else { + instrumentFunctionEnter(fn); + visit(fn); + } } + return llvm::PreservedAnalyses::none(); } -TaintedControlFlowPass::TaintedControlFlowPass() - : function_mapping_writer_( - std::make_unique( - "functionid.json")) {} - -TaintedControlFlowPass::~TaintedControlFlowPass() = default; -TaintedControlFlowPass::TaintedControlFlowPass(TaintedControlFlowPass &&) = - default; } // namespace polytracker \ No newline at end of file diff --git a/polytracker/src/polytracker/main.cpp b/polytracker/src/polytracker/main.cpp index 9f095409..4653afe6 100644 --- a/polytracker/src/polytracker/main.cpp +++ b/polytracker/src/polytracker/main.cpp @@ -87,7 +87,7 @@ polytrackers settings 3. Set rest to default if possible and error if no polypath. 
*/ void polytracker_get_settings() { - DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_db_name) + DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_db_name); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stderr_sink); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stdout_sink); DO_EARLY_DEFAULT_CONSTRUCT(std::string, polytracker_stdin_source); diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 56064e3b..d24708fb 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -13,43 +13,16 @@ EARLY_CONSTRUCT_EXTERN_GETTER(taintdag::PolyTracker, polytracker_tdag); static std::atomic_flag polytracker_init_flag = ATOMIC_FLAG_INIT; -static bool polytracker_is_initialized() { - return polytracker_init_flag.test(std::memory_order_relaxed); -} - static void polytracker_initialize() { polytracker_init_flag.test_and_set(std::memory_order_relaxed); } -extern "C" taintdag::Functions::index_t -__polytracker_log_func_entry(char *name, uint16_t len) { - if (!polytracker_is_initialized()) { - return 0; - } - return get_polytracker_tdag().function_entry({name, len}); -} - -extern "C" void -__polytracker_log_func_exit(taintdag::Functions::index_t func_index) { - if (!polytracker_is_initialized()) { - return; - } - get_polytracker_tdag().function_exit(func_index); -} - extern "C" dfsan_label __polytracker_union_table(const dfsan_label &l1, const dfsan_label &l2) { - if (!polytracker_is_initialized()) { - return 0; - } return get_polytracker_tdag().union_labels(l1, l2); } extern "C" void __polytracker_log_conditional_branch(dfsan_label label) { - if (!polytracker_is_initialized()) { - return; - } - if (label > 0) { get_polytracker_tdag().affects_control_flow(label); } @@ -58,9 +31,6 @@ extern "C" void __polytracker_log_conditional_branch(dfsan_label label) { extern "C" void __dfsw___polytracker_log_conditional_branch(uint64_t conditional, dfsan_label 
conditional_label) { - if (!polytracker_is_initialized()) { - return; - } __polytracker_log_conditional_branch(conditional_label); } @@ -73,30 +43,28 @@ extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { polytracker::taint_argv(argc, argv); } -extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( - uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, - dfsan_label function_label, dfsan_label *ret_label) { - if (!polytracker_is_initialized()) { - return 0; - } +extern "C" void __polytracker_log_tainted_control_flow( + dfsan_label conditional_label, uint32_t function_id) { if (conditional_label > 0) { get_polytracker_tdag().log_tainted_control_flow(conditional_label, - functionid); + function_id); } - *ret_label = conditional_label; - return conditional; } -extern "C" void __polytracker_enter_function(uint32_t function_id) { - if (!polytracker_is_initialized()) { - return; - } +extern "C" void __dfsw___polytracker_log_tainted_control_flow( + uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, + dfsan_label function_id_label) { + uint32_t fid_32 = static_cast(function_id_label); + __polytracker_log_tainted_control_flow(conditional_label, fid_32); +} + +extern "C" void __polytracker_enter_function(uint32_t function_id, + const char *function_name) { + get_polytracker_tdag().record_function_name(function_id, + std::string_view(function_name)); get_polytracker_tdag().enter_function(function_id); } extern "C" void __polytracker_leave_function(uint32_t function_id) { - if (!polytracker_is_initialized()) { - return; - } get_polytracker_tdag().leave_function(function_id); } \ No newline at end of file diff --git a/polytracker/src/taintdag/control_flow_log_encoding.cpp b/polytracker/src/taintdag/control_flow_log_encoding.cpp new file mode 100644 index 00000000..cb6f383d --- /dev/null +++ b/polytracker/src/taintdag/control_flow_log_encoding.cpp @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022-present, Trail of 
Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include "taintdag/control_flow_log_encoding.h" + +// Separate from control_flow_log.h to avoid duplicate symbol inclusion in +// testing +namespace taintdag { +// A uint32_t varint encoded by setting highest bit for all but the final byte. +// Requires up to 5 bytes of storage as each output byte uses 7 input bits. +// Total maximum need is ceil(32/7) = 5. +size_t varint_encode(uint32_t val, uint8_t *buffer) { + auto orig_buffer = buffer; + while (val >= 0x80) { + *buffer++ = 0x80 | (val & 0x7f); + val >>= 7; + } + *buffer++ = val & 0x7f; + return buffer - orig_buffer; +} +} // namespace taintdag \ No newline at end of file diff --git a/polytracker/src/taintdag/fnmapping.cpp b/polytracker/src/taintdag/fnmapping.cpp index 8ce76a29..c57e4ed5 100644 --- a/polytracker/src/taintdag/fnmapping.cpp +++ b/polytracker/src/taintdag/fnmapping.cpp @@ -19,27 +19,31 @@ using index_t = Functions::index_t; } // namespace -std::optional Functions::add_mapping(std::string_view name) { - // Lock `mappings` - // std::cout << "BREAK 1" << std::endl; +std::optional Functions::add_mapping(uint32_t function_id, + std::string_view function_name) { std::unique_lock mappings_lock(mappings_mutex); - // See if we already have a mapping of `name` - if (auto it{mappings.find(name)}; it != mappings.end()) { + + if (auto it{mappings.find(function_name)}; it != mappings.end()) { return it->second; } - // Write `name` into the string table section - auto maybe_name_offset{string_table.add_string(name)}; - if (!maybe_name_offset) { + + std::optional maybe_name_offset = + string_table.add_string(function_name); + if (!maybe_name_offset.has_value()) { + spdlog::error("Could not write function name to strings table"); return {}; } - // Write a `Function` via `construct` - auto name_offset{*maybe_name_offset}; - 
auto maybe_ctx{construct(name_offset)}; - if (!maybe_ctx) { + + auto maybe_ctx = construct(Function(maybe_name_offset.value(), function_id)); + if (!maybe_ctx.has_value()) { + spdlog::error("Could not write Function {0} with id {1:d}, string table " + "ofs {2:d} to the tdag functions section", + function_name, function_id, maybe_name_offset.value()); return {}; } - // Return index of `Function` in `Functions` - return mappings[name] = index(maybe_ctx->t); + + // Return index of the `Function` in `Functions` + return mappings[function_name] = index(maybe_ctx->t); } } // namespace taintdag \ No newline at end of file diff --git a/polytracker/src/taintdag/fntrace.cpp b/polytracker/src/taintdag/fntrace.cpp deleted file mode 100644 index 745fda53..00000000 --- a/polytracker/src/taintdag/fntrace.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. 
- */ - -#include "taintdag/fntrace.h" - -#include - -#include "taintdag/error.h" - -namespace taintdag { - -void Events::log_fn_event(Event::kind_t kind, Functions::index_t idx) { - // Write an `Event` via `construct` - if (!construct(kind, idx)) { - error_exit("Failed to log event with id: ", count()); - } -} - -} // namespace taintdag \ No newline at end of file diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 65b683a8..fa905187 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -15,7 +15,6 @@ #include "taintdag/error.h" #include "taintdag/fnmapping.h" -#include "taintdag/fntrace.h" namespace taintdag { @@ -179,6 +178,11 @@ void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { output_file_.section().tainted_control_flow(lbl, function_id); } +void PolyTracker::record_function_name(uint32_t function_id, + std::string_view function_name) { + output_file_.section().add_mapping(function_id, function_name); +} + void PolyTracker::enter_function(uint32_t function_id) { output_file_.section().enter_function(function_id); } @@ -187,20 +191,4 @@ void PolyTracker::leave_function(uint32_t function_id) { output_file_.section().leave_function(function_id); } -Functions::index_t PolyTracker::function_entry(std::string_view name) { - auto &functions{output_file_.section()}; - auto maybe_index{functions.add_mapping(name)}; - if (!maybe_index) { - error_exit("Failed to add function mapping for: ", name); - } - auto &events{output_file_.section()}; - events.log_fn_event(Event::kind_t::entry, *maybe_index); - return *maybe_index; -} - -void PolyTracker::function_exit(Functions::index_t index) { - auto &events{output_file_.section()}; - events.log_fn_event(Event::kind_t::exit, index); -} - } // namespace taintdag \ No newline at end of file diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 744e5761..970ba3b3 100644 --- 
a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -1,45 +1,47 @@ +from ctypes import ( + Structure, + c_char, + c_int32, + c_int64, + c_uint8, + c_uint16, + c_uint32, + c_uint64, + sizeof, +) +from enum import Enum +from mmap import PROT_READ, mmap +from pathlib import Path from typing import ( BinaryIO, - Union, + Dict, Iterable, Iterator, - Optional, - Dict, - Tuple, List, + Optional, Set, + Tuple, Type, + Union, cast, ) -from enum import Enum -from pathlib import Path -from mmap import mmap, PROT_READ -from ctypes import ( - Structure, - c_char, - c_int64, - c_uint64, - c_int32, - c_uint32, - c_uint8, - c_uint16, - sizeof, -) +from cxxfilt import demangle +from typing_extensions import deprecated +from .inputs import Input from .plugins import Command -from .repl import PolyTrackerREPL from .polytracker import ProgramTrace -from .inputs import Input +from .repl import PolyTrackerREPL from .taint_forest import TaintForest, TaintForestNode from .tracing import ( BasicBlock, ByteOffset, Function, TaintAccess, - TraceEvent, TaintOutput, Taints, + TraceEvent, ) @@ -93,11 +95,48 @@ def enumerate(self): yield TDFDHeader.from_buffer_copy(self.mem[offset:]) +@deprecated("Use ControlFlowEvent instead, TDEvents are no longer written") +class TDEvent(Structure): + """TDEvent is an old version of the ControlFlowEvent kept for backward + compatibility only. + """ + + _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] + + class Kind(Enum): + ENTRY = 0 + EXIT = 1 + + def __repr__(self) -> str: + return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" + + +@deprecated("Use TDControlFlowLog; the TDEvents section is no longer written") +class TDEventsSection: + """TDEventsSection is an old version of the CFLog kept for backward + compatibility only. 
+ """ + + def __init__(self, mem, hdr): + self.section = mem[hdr.offset : hdr.offset + hdr.size] # nosec E203 + + def __iter__(self): + for offset in range(0, len(self.section), sizeof(TDEvent)): + yield TDEvent.from_buffer_copy(self.section, offset) + + class TDStringSection: - """TDAG String Table section + """TDAG String Table section. Interprets the String Table section in a TDAG file. Corresponds to StringTableBase in string_table.h. + + The string table will contain information like the following: + - source names + - function names + - additional label metadata + Check usages of StringTableBase in the C++ ("write side") part of the + codebase. """ def __init__(self, mem, hdr): @@ -106,7 +145,11 @@ def __init__(self, mem, hdr): def read_string(self, offset): n = c_uint16.from_buffer_copy(self.section[offset:]).value - assert len(self.section) >= offset + sizeof(c_uint16) + n + if not (len(self.section) >= offset + sizeof(c_uint16) + n): + raise AssertionError( + """Section out of alignment with c_uint16 + so string could not be read""" + ) return str( self.section[offset + sizeof(c_uint16) : offset + sizeof(c_uint16) + n], "utf-8", @@ -130,68 +173,75 @@ def count(self): return len(self.section) // sizeof(c_uint64) -class TDEnterFunctionEvent: +class CFEnterFunctionEvent: """Emitted whenever execution enters a function. The callstack member is the callstack right before entering the function, having the function just entered as the last member of the callstack. 
""" - def __init__(self, callstack): - """Callstack after entering function""" + def __init__(self, callstack: List): self.callstack = callstack def __repr__(self) -> str: - return f"Enter: {self.callstack}" + return f"CFEnterFunctionEvent: {self.callstack}" def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDEnterFunctionEvent): + if isinstance(__o, CFEnterFunctionEvent): return self.callstack == __o.callstack return False -class TDLeaveFunctionEvent: +class CFLeaveFunctionEvent: """Emitted whenever execution leaves a function. The callstack member is the callstack right before leaving the function, having the function about to leave as the last member of the callstack. """ - def __init__(self, callstack): - """Callstack before leaving function""" + def __init__(self, callstack: List): self.callstack = callstack def __repr__(self) -> str: - return f"Leave: {self.callstack}" + return f"CFLeaveFunctionEvent: {self.callstack}" def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDLeaveFunctionEvent): + if isinstance(__o, CFLeaveFunctionEvent): return self.callstack == __o.callstack return False -class TDTaintedControlFlowEvent: +class TaintedControlFlowEvent: """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. 
Current callstack (including the function the control flow happened in) is available in the `callstack` member.""" - def __init__(self, callstack, label): + def __init__(self, callstack: List, label: Optional[int] = None): self.callstack = callstack self.label = label def __repr__(self) -> str: - return f"TaintedControlFlow label {self.label} callstack {self.callstack}" + return f"TaintedControlFlowEvent: taint label {self.label} | {self.callstack}" - def __eq__(self, __o: object) -> bool: - if isinstance(__o, TDTaintedControlFlowEvent): - return self.label == __o.label and self.callstack == __o.callstack + def __eq__(self, other) -> bool: + if isinstance(other, TaintedControlFlowEvent) and self.label is not None: + return self.label == other.label and self.callstack == other.callstack + elif self.label is None and other.label is None: + return self.callstack == other.callstack return False +ControlFlowEvent = Union[ + CFEnterFunctionEvent, + CFLeaveFunctionEvent, + TaintedControlFlowEvent, +] + + class TDControlFlowLogSection: - """TDAG Control flow log section + """TDAG Control flow log section. Interprets the control flow log section in a TDAG file. - Enables enumeration/random access of items + Enables enumeration/random access of items in the cflog. """ # NOTE: MUST correspond to the members in the `ControlFlowLog::EventType`` in `control_flog_log.h`. 
@@ -216,26 +266,30 @@ def _decode_varint(buffer): @staticmethod def _align_callstack(target_function_id, callstack): while callstack and callstack[-1] != target_function_id: - yield TDLeaveFunctionEvent(callstack[:]) + yield CFLeaveFunctionEvent(callstack[:]) callstack.pop() def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] - self.funcmapping = None - def __iter__(self): + def __iter__(self) -> Iterator[ControlFlowEvent]: + """Produce the cflog entries in order from the mmapped buffer.""" buffer = self.section callstack = [] while buffer: event = c_uint8.from_buffer_copy(buffer, 0).value buffer = buffer[1:] + + # A function id is a uint32_t that the functions + # section maps to an index into the strings table + # If you need function names, you should be working + # with the tdfile, rather than with a singular section + # of the tdag directly. function_id, buffer = TDControlFlowLogSection._decode_varint(buffer) - if self.funcmapping != None: - function_id = self.funcmapping[function_id] if event == TDControlFlowLogSection.ENTER_FUNCTION: callstack.append(function_id) - yield TDEnterFunctionEvent(callstack[:]) + yield CFEnterFunctionEvent(callstack[:]) elif event == TDControlFlowLogSection.LEAVE_FUNCTION: # Align call stack, if needed yield from TDControlFlowLogSection._align_callstack( @@ -243,7 +297,7 @@ def __iter__(self): ) # TODO(hbrodin): If the callstack doesn't contain function_id at all, this will break. 
- yield TDLeaveFunctionEvent(callstack[:]) + yield CFLeaveFunctionEvent(callstack[:]) callstack.pop() else: # Align call stack, if needed @@ -252,15 +306,11 @@ def __iter__(self): ) label, buffer = TDControlFlowLogSection._decode_varint(buffer) - yield TDTaintedControlFlowEvent(callstack[:], label) + yield TaintedControlFlowEvent(callstack[:], label) # Drain callstack with artifical TDLeaveFunction events (using a dummy function id that doesn't exist) yield from TDControlFlowLogSection._align_callstack(-1, callstack) - def function_id_mapping(self, id_to_name_array): - """This method stores an array used to translate from function id to symbolic names""" - self.funcmapping = id_to_name_array - class TDSinkSection: """TDAG Sinks section @@ -287,7 +337,12 @@ class TDBitmapSection: def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] - assert len(self.section) % 8 == 0 # Multiple of uint64_t + if len(self.section) % 8 != 0: + # Multiple of uint64_t + raise AssertionError( + """Bitmap Section out of alignment and + cannot be read""" + ) def enumerate_set_bits(self): """Enumerates all bits that are set @@ -318,21 +373,23 @@ def __init__(self, mem, hdr): class TDFunctionsSection: - def __init__(self, mem, hdr): - self.section = mem[hdr.offset : hdr.offset + hdr.size] - - def __iter__(self): - for offset in range(0, len(self.section), sizeof(TDFnHeader)): - yield TDFnHeader.from_buffer_copy(self.section, offset) - + """TDFunctionsSection holds the mapping between the function IDs + stored in callstack form in the cflog section, and the function + names stored in the string table. See fnmapping in the C++ part + of the codebase for the "write" side part of Polytracker that + pertains to this section. Each entry is an uint32_t as set in + fnmapping.cpp, but a TDFnHeader will then contain *two* of these: + the function_id and the name_offset. + + Structure in memory: |offset|function id|... 
+ """ -class TDEventsSection: def __init__(self, mem, hdr): self.section = mem[hdr.offset : hdr.offset + hdr.size] def __iter__(self): - for offset in range(0, len(self.section), sizeof(TDEvent)): - yield TDEvent.from_buffer_copy(self.section, offset) + for entry in range(0, len(self.section), sizeof(TDFnHeader)): + yield TDFnHeader.from_buffer_copy(self.section, entry) class TDFDHeader(Structure): @@ -352,7 +409,9 @@ def invalid_fd(self): class TDFnHeader(Structure): - _fields_ = [("name_offset", c_uint32)] + # This corresponds to the Function inline constructor in fnmapping.h. + # Anything using Structure needs to be in sync with the corresponding C++. + _fields_ = [("name_offset", c_uint32), ("function_id", c_uint32)] class TDNode: @@ -413,17 +472,6 @@ def __repr__(self) -> str: return f"TDSink fdidx: {self.fdidx} offset: {self.offset} label: {self.label}" -class TDEvent(Structure): - _fields_ = [("kind", c_uint8), ("fnidx", c_uint16)] - - class Kind(Enum): - ENTRY = 0 - EXIT = 1 - - def __repr__(self) -> str: - return f"kind: {self.Kind(self.kind).name} fnidx: {self.fnidx}" - - TDSection = Union[ TDLabelSection, TDSourceSection, @@ -454,7 +502,7 @@ def __init__(self, file: BinaryIO) -> None: section_offset = sizeof(TDFileMeta) self.sections: List[TDSection] = [] self.sections_by_type: Dict[Type[TDSection], TDSection] = {} - for i in range(0, self.filemeta.section_count): + for _ in range(0, self.filemeta.section_count): hdr = TDSectionMeta.from_buffer_copy(self.buffer, section_offset) if hdr.tag == 1: self.sections.append(TDSourceSection(self.buffer, hdr)) @@ -489,48 +537,87 @@ def __init__(self, file: BinaryIO) -> None: self.sink_cache: Dict[int, TDSink] = {} self.fd_headers: List[Tuple[Path, TDFDHeader]] = list(self.read_fd_headers()) - self.fn_headers: List[Tuple[str, TDFnHeader]] = list(self.read_fn_headers()) - - def _get_section(self, wanted_type: Type[TDSection]) -> TDSection: - return self.sections_by_type[wanted_type] def read_fd_headers(self) -> 
Iterator[Tuple[Path, TDFDHeader]]: sources = self.sections_by_type[TDSourceSection] + if not isinstance(sources, TDSourceSection): + raise AssertionError("Sources Section could not be read") strings = self.sections_by_type[TDStringSection] - assert isinstance(sources, TDSourceSection) - assert isinstance(strings, TDStringSection) + if not isinstance(strings, TDStringSection): + raise AssertionError("Strings Table could not be read") - yield from ( - (Path(strings.read_string(x.name_offset)), x) for x in sources.enumerate() - ) + for source in sources.enumerate(): + yield Path(strings.read_string(source.name_offset)), source - def read_fn_headers(self) -> Iterator[Tuple[str, TDFnHeader]]: + @property + def mangled_fn_symbol_lookup(self) -> Dict[int, str]: + """Unordered! map of dynamically observed function IDs to clang + symbols. You can demangle the symbols with cxxfilt.demangle. + """ + lookup = {} functions = self.sections_by_type[TDFunctionsSection] + if not isinstance(functions, TDFunctionsSection): + raise AssertionError("Functions Section could not be read") strings = self.sections_by_type[TDStringSection] - assert isinstance(functions, TDFunctionsSection) - assert isinstance(strings, TDStringSection) + if not isinstance(strings, TDStringSection): + raise AssertionError("String Table could not be read") + + for entry in functions: + lookup[entry.function_id] = strings.read_string(entry.name_offset) - for header in functions: - name = strings.read_string(header.name_offset) - yield name, header + return lookup + + def _maybe_demangle(self, function_id: int) -> Union[str, int]: + """Depending on the age of the tdag, it may not contain a function + mapping. If the tdag doesn't contain a function mapping, this will + only return function ids and you'll need to manually map them against + symbols gathered statically from the compiled instrumented binary. 
+ """ + maybe_symbol = self.mangled_fn_symbol_lookup.get(function_id) + if maybe_symbol is not None: + return demangle(maybe_symbol) + else: + return function_id + + def cflog(self, demangle_symbols=False) -> Iterator[ControlFlowEvent]: + """Presents the control flow log. Does not demangle symbols by default, + for performance. + """ + cflog_section = self.sections_by_type[TDControlFlowLogSection] + if not isinstance(cflog_section, TDControlFlowLogSection): + raise AssertionError("CFLog section not correctly read from TDAG?") + + if demangle_symbols: + for cflog_entry in cflog_section: + cflog_entry.callstack[:] = [ + self._maybe_demangle(function_id) + for function_id in cflog_entry.callstack + ] + + yield cflog_entry + else: + cflog_section.__iter__() def input_labels(self) -> Iterator[int]: """Enumerates all taint labels that are input labels (source taint)""" source_index_section = self.sections_by_type[TDSourceIndexSection] - assert isinstance(source_index_section, TDSourceIndexSection) + if not isinstance(source_index_section, TDSourceIndexSection): + raise AssertionError("Source Index not correctly read from TDAG?") return source_index_section.enumerate_set_bits() @property def label_count(self): label_section = self.sections_by_type[TDLabelSection] - assert isinstance(label_section, TDLabelSection) + if not isinstance(label_section, TDLabelSection): + raise AssertionError("Could not read Label Section from TDAG?") return label_section.count() def read_node(self, label: int) -> int: if label in self.raw_nodes: return self.raw_nodes[label] label_section = self.sections_by_type[TDLabelSection] - assert isinstance(label_section, TDLabelSection) + if not isinstance(label_section, TDLabelSection): + raise AssertionError("Could not read Label Section from TDAG?") result = label_section.read_raw(label) self.raw_nodes[label] = result @@ -566,18 +653,10 @@ def nodes(self) -> Iterator[TDNode]: @property def sinks(self) -> Iterator[TDSink]: sink_section = 
self.sections_by_type[TDSinkSection] - assert isinstance(sink_section, TDSinkSection) + if not isinstance(sink_section, TDSinkSection): + raise AssertionError("Could not read Sink Section from TDAG?") yield from sink_section.enumerate() - def read_event(self, offset: int) -> TDEvent: - return TDEvent.from_buffer_copy(self.buffer, offset) - - @property - def events(self) -> Iterator[TDEvent]: - events_section = self.sections_by_type[TDEventsSection] - assert isinstance(events_section, TDEventsSection) - yield from events_section - class TDTaintOutput(TaintOutput): def __init__(self, source: Input, output_offset: int, label: int): @@ -613,9 +692,14 @@ def basic_blocks(self) -> Iterable[BasicBlock]: raise NotImplementedError() def file_offset(self, node: TaintForestNode) -> ByteOffset: - assert node.source is not None + if node.source is None: + raise AssertionError( + """ + No source could be found from which offset could be calculated""" + ) tdnode: TDNode = self.tdfile.decode_node(node.label) - assert isinstance(tdnode, TDSourceNode) + if not isinstance(tdnode, TDSourceNode): + raise AssertionError("Source Node could not be decoded") return ByteOffset(node.source, tdnode.offset) @property @@ -656,7 +740,8 @@ def inputs(self) -> Iterator[Input]: seen: Set[int] = set() for source_label in self.tdfile.input_labels(): source_node = self.tdfile.decode_node(source_label) - assert isinstance(source_node, TDSourceNode) + if not isinstance(source_node, TDSourceNode): + raise AssertionError("Source Node could not be decoded?") if source_node.idx not in seen: path, fd_header = self.tdfile.fd_headers[source_node.idx] yield Input(fd_header.fd, str(path), fd_header.size) @@ -793,10 +878,11 @@ def create_node(self, label: int) -> TDTaintForestNode: (curr, node.last), ) - assert False + raise AssertionError("TDTaintForestNode could not be created") def get_node(self, label: int, source: Optional[Input] = None) -> TDTaintForestNode: - assert source is None + if source is not None: 
+ raise AssertionError("Node could not be retrieved from label") if self.node_cache[label] is not None: return cast(TDTaintForestNode, self.node_cache[label]) @@ -824,13 +910,7 @@ def __init_arguments__(self, parser): "--print-fd-headers", "-f", action="store_true", - help="print file descriptor headers", - ) - parser.add_argument( - "--print-fn-headers", - "-x", - action="store_true", - help="print function headers", + help="print file descriptor headers (sources)", ) parser.add_argument( "--print-taint-sinks", @@ -849,14 +929,14 @@ def __init_arguments__(self, parser): "--print-function-trace", "-t", action="store_true", - help="print function trace events", + help="print function trace", ) parser.add_argument( "--print-control-flow-log", "-c", action="store_true", - help="print function trace events", + help="print control flow log events", ) def run(self, args): @@ -869,11 +949,6 @@ def run(self, args): path = h[0] print(f"{i}: {path}") - if args.print_fn_headers: - for i, h in enumerate(tdfile.fn_headers): - name = h[0] - print(f"{i}: {name}") - if args.print_taint_sinks: for s in tdfile.sinks: print(f"{s} -> {tdfile.decode_node(s.label)}") @@ -883,11 +958,20 @@ def run(self, args): print(f"Label {lbl}: {tdfile.decode_node(lbl)}") if args.print_function_trace: - for e in tdfile.events: - print(f"{e}") + if ( + TDFunctionsSection in tdfile.sections_by_type.keys() + and len(tdfile.mangled_fn_symbol_lookup) > 0 + ): + for k, v in tdfile.mangled_fn_symbol_lookup: + print(f"function_id '{k}': function '{demangle(v)}'") + else: + print("Error: no Functions section was read from tdag!") + print(f"Sections that could be read: {tdfile.sections}") if args.print_control_flow_log: - cflog = tdfile._get_section(TDControlFlowLogSection) - assert isinstance(cflog, TDControlFlowLogSection) - for obj in cflog: - print(f"{obj}") + if TDControlFlowLogSection in tdfile.sections_by_type.keys(): + for event in tdfile.cflog(demangle_symbols=True): + print(event) + else: + 
print("Error: no Control Flow Log section read from tdag!") + print(f"Sections that could be read: {tdfile.sections}") diff --git a/.ruff.toml b/ruff.toml similarity index 100% rename from .ruff.toml rename to ruff.toml diff --git a/tests/conftest.py b/tests/conftest.py index 8114bafc..627da284 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,12 @@ -import sys -import pytest import subprocess -import polytracker - +import sys from pathlib import Path from typing import List +import pytest + +import polytracker + def pytest_configure(config): config.addinivalue_line( @@ -26,16 +27,17 @@ def build(target: Path, binary: Path) -> None: cmd = ["build"] if target.suffix == ".cpp": - cmd.append("clang++") + cmd += ["clang++", "-std=c++20"] else: cmd.append("clang") + # debugging and want symbols? add -O0 here cmd += ["-g", "-o", str(binary), str(target)] run_polytracker(cmd) def instrument(target: str) -> None: - cmd = ["instrument-targets", "--taint", "--ftrace", "--cflog", target] + cmd = ["instrument-targets", "--cflog", target] run_polytracker(cmd) @@ -79,11 +81,8 @@ def program_trace(input_file, trace_file, instrumented_binary, monkeypatch): monkeypatch.chdir(input_file.parent) monkeypatch.setenv("POLYDB", str(trace_file)) cmd = [ - # instrumented binary instrumented_binary, - # input data str(input_file), ] subprocess.check_call(cmd) - # Read the trace file return polytracker.PolyTrackerTrace.load(trace_file) diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py index 5316fac7..39b6e03f 100644 --- a/tests/test_cf_log.py +++ b/tests/test_cf_log.py @@ -1,20 +1,63 @@ +import subprocess +from pathlib import Path +from typing import List + import cxxfilt -import json import pytest -import subprocess import polytracker -from pathlib import Path - from polytracker.taint_dag import ( - TDEnterFunctionEvent, - TDLeaveFunctionEvent, - TDTaintedControlFlowEvent, + CFEnterFunctionEvent, + CFLeaveFunctionEvent, + ControlFlowEvent, + TaintedControlFlowEvent, 
+ TDControlFlowLogSection, + TDNode, ) +@pytest.mark.program_trace("test_fntrace.cpp") +def test_function_mapping(program_trace) -> None: + mangled_symbols = list(program_trace.tdfile.mangled_fn_symbol_lookup.values()) + + assert mangled_symbols == ["main", "_Z9factoriali"] + expected_names = ["main", "factorial(int)"] + for symbol in mangled_symbols: + assert cxxfilt.demangle(symbol) in expected_names + + +@pytest.mark.program_trace("test_fntrace.cpp") +def test_callstack_mapping(program_trace) -> None: + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] + + for cflog_entry in cflog: + assert len(cflog_entry.callstack) > 0 + # a callstack entry (if not mapped and demangled) is just a function id + for callstack_entry in cflog_entry.callstack: + # when we look up the function id it should map to a name we traced + assert callstack_entry in program_trace.tdfile.mangled_fn_symbol_lookup + + +@pytest.mark.program_trace("test_fntrace.cpp") +def test_label_mapping(program_trace) -> None: + cflog: TDControlFlowLogSection = program_trace.tdfile.sections_by_type[ + TDControlFlowLogSection + ] + + for cflog_entry in cflog: + if type(cflog_entry) is TaintedControlFlowEvent: + assert hasattr(cflog_entry, "label") + node: TDNode = program_trace.tdfile.decode_node(cflog_entry.label) + assert node.affects_control_flow + else: + assert not hasattr(cflog_entry, "label") + + @pytest.mark.program_trace("test_cf_log.cpp") -def test_cf_log(instrumented_binary: Path, trace_file: Path): +def test_cf_log(instrumented_binary: Path, trace_file: Path) -> None: + """Demonstrates how the cflog should work end to end, integrated with the fn mapping and the function symbols from the strings table.""" # Data to write to stdin, one byte at a time stdin_data = "abcdefgh" @@ -24,46 +67,43 @@ def test_cf_log(instrumented_binary: Path, trace_file: Path): env={ "POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1", - 
"POLYTRACKER_LOG_CONTROL_FLOW": "1", }, ) program_trace = polytracker.PolyTrackerTrace.load(trace_file) - cflog = program_trace.tdfile._get_section( - polytracker.taint_dag.TDControlFlowLogSection - ) - - # The functionid mapping is available next to the built binary - with open(instrumented_binary.parent / "functionid.json", "rb") as f: - functionid_mapping = list(map(cxxfilt.demangle, json.load(f))) - - # Apply the id to function mappign - cflog.function_id_mapping(functionid_mapping) - expected_seq = [ - TDEnterFunctionEvent(["main"]), - TDTaintedControlFlowEvent(["main"], 1), - TDTaintedControlFlowEvent(["main"], 2), - TDTaintedControlFlowEvent(["main"], 3), - TDTaintedControlFlowEvent(["main"], 4), - TDTaintedControlFlowEvent(["main"], 5), - TDTaintedControlFlowEvent(["main"], 6), - TDTaintedControlFlowEvent(["main"], 7), - TDTaintedControlFlowEvent(["main"], 8), - TDTaintedControlFlowEvent(["main"], 15), - TDTaintedControlFlowEvent(["main"], 3), - TDEnterFunctionEvent(["main", "f1(unsigned char)"]), - TDTaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), - TDEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TDTaintedControlFlowEvent( - ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 - ), - TDLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), - TDLeaveFunctionEvent(["main", "f1(unsigned char)"]), - TDLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit + CFEnterFunctionEvent(["main"]), + TaintedControlFlowEvent(["main"], 1), + TaintedControlFlowEvent(["main"], 2), + TaintedControlFlowEvent(["main"], 3), + TaintedControlFlowEvent(["main"], 4), + TaintedControlFlowEvent(["main"], 5), + TaintedControlFlowEvent(["main"], 6), + TaintedControlFlowEvent(["main"], 7), + TaintedControlFlowEvent(["main"], 8), + TaintedControlFlowEvent(["main"], 15), + TaintedControlFlowEvent(["main"], 3), + CFEnterFunctionEvent(["main", "f1(unsigned char)"]), + TaintedControlFlowEvent(["main", "f1(unsigned 
char)"], 7), + CFEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + TaintedControlFlowEvent(["main", "f1(unsigned char)", "f2(unsigned char)"], 7), + CFLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + CFLeaveFunctionEvent(["main", "f1(unsigned char)"]), + CFLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit ] - # NOTE(hbrodin): Could have done assert list(cflog) == expected_seq, but this provides the failed element + cflog: List[ControlFlowEvent] = program_trace.tdfile.cflog(demangle_symbols=True) for got, expected in zip(cflog, expected_seq): assert got == expected + + if type(got) is TaintedControlFlowEvent: + assert got.label is not None + + assert len(got.callstack) > 0 + + for entry in cflog: + for callstack_entry in entry.callstack: + assert callstack_entry in list( + program_trace.tdfile.mangled_fn_symbol_lookup.values() + ) diff --git a/tests/test_fntrace.py b/tests/test_fntrace.py deleted file mode 100644 index 71f7c47d..00000000 --- a/tests/test_fntrace.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from collections import defaultdict -from typing import Dict - -from polytracker import taint_dag, ProgramTrace - - -@pytest.mark.program_trace("test_fntrace.cpp") -def test_fn_headers(program_trace: ProgramTrace): - assert isinstance(program_trace, taint_dag.TDProgramTrace) - functions = list(program_trace.tdfile.fn_headers) - names = set(map(lambda f: f[0], functions)) - assert names == set(["main", "_Z9factoriali"]) - - -@pytest.mark.program_trace("test_fntrace.cpp") -def test_fntrace(program_trace: ProgramTrace): - assert isinstance(program_trace, taint_dag.TDProgramTrace) - events = list(program_trace.tdfile.events) - assert len(events) == 10 - kinds: Dict[taint_dag.TDEvent.Kind, int] = defaultdict(int) - for e in events: - kinds[e.kind] += 1 - assert kinds[taint_dag.TDEvent.Kind.ENTRY] == kinds[taint_dag.TDEvent.Kind.EXIT] diff --git a/tests/test_polytracker.py 
b/tests/test_program_trace.py similarity index 97% rename from tests/test_polytracker.py rename to tests/test_program_trace.py index 13bf6e29..2ec6906e 100644 --- a/tests/test_polytracker.py +++ b/tests/test_program_trace.py @@ -1,14 +1,10 @@ from collections import defaultdict -import pytest from subprocess import CalledProcessError from typing import Dict, Union -from polytracker import ( - BasicBlockEntry, - FunctionEntry, - FunctionReturn, - ProgramTrace, -) +import pytest + +from polytracker import BasicBlockEntry, FunctionEntry, FunctionReturn, ProgramTrace @pytest.mark.skip(reason="taint_dag does not support traces yet") @@ -183,6 +179,7 @@ def test_cxx_global_object(program_trace: ProgramTrace): assert taints[0].length == 1 +@pytest.mark.skip(reason="the Taint Forest is currently not implemented") @pytest.mark.program_trace("test_simple_union.cpp", input="ABCDEFGH\n11235878\n") def test_taint_forest(program_trace: ProgramTrace): had_taint_union = False diff --git a/tests/test_stdin.cpp b/tests/test_stdin.cpp index 105b041f..632f59b0 100644 --- a/tests/test_stdin.cpp +++ b/tests/test_stdin.cpp @@ -1,7 +1,7 @@ -#include #include +#include +#include #include -#include int stdin_read() { char inbyte; @@ -97,26 +97,27 @@ int stdin_getchar_unlocked() { } // Reads from stdin using different methods based on argv[1] -// the following functions can be used -// read, int main(int argc, char *argv[]) { - assert(argc == 2); - std::string_view method{argv[1]}; + if (argc != 2) { + exit(EXIT_FAILURE); + } - if (method == "read") { + if (std::strncmp(argv[1], "read", 4) == 0) { + printf("got read\n"); stdin_read(); - } else if (method == "fread") { + } else if (std::strncmp(argv[1], "fread", 5) == 0) { stdin_fread(); - } else if (method == "getc") { + } else if (std::strncmp(argv[1], "getc", 4) == 0) { stdin_getc(); - } else if (method == "getc_unlocked") { + } else if (std::strncmp(argv[1], "getc_unlocked", 13) == 0) { stdin_getc_unlocked(); - } else if (method == 
"getchar") { + } else if (std::strncmp(argv[1], "getchar", 7) == 0) { stdin_getchar(); - } else if (method == "getchar_unlocked") { + } else if (std::strncmp(argv[1], "getchar_unlocked", 16) == 0) { stdin_getchar_unlocked(); - } else if (method == "fgetc") { + } else if (std::strncmp(argv[1], "fgetc", 5) == 0) { stdin_fgetc(); } - return 0; + + exit(EXIT_SUCCESS); } \ No newline at end of file diff --git a/tests/test_stdin.py b/tests/test_stdin.py index ef6c2034..38c3b1d0 100644 --- a/tests/test_stdin.py +++ b/tests/test_stdin.py @@ -1,44 +1,90 @@ +import subprocess # nosec B404 +from pathlib import Path +from random import choice +from string import printable as chars + import pytest -import subprocess import polytracker -from pathlib import Path +# Ensure stdin reads in multiple ways are verified +# examples: getc, fgetc, fread, fread_unlocked, +# fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw + +_stdin_data = "\n".join(choice(chars) for _ in range(40)) # nosec B311 -@pytest.mark.program_trace("test_stdin.cpp") -@pytest.mark.parametrize( - "method", - ["read", "fread", "getc", "getc_unlocked", "getchar", "getchar_unlocked", "fgetc"], -) -def test_stdin_read(instrumented_binary: Path, trace_file: Path, method: str): - # Data to write to stdin, one byte at a time - stdin_data = "abcdefghi\njklmnopqr" - - subprocess.run( - [str(instrumented_binary), method], - input=stdin_data.encode("utf-8"), - env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": str(1)}, + +def _create_tdag_trace( + instrumented_binary: Path, trace_file: Path, method: str +) -> None: + """Rather than using pytest.mark.parametrize on this setup function, split + out DRY from the test framework so it's easy to see when an individual test + fails.""" + # https://docs.python.org/3/library/subprocess.html#subprocess.CalledProcessError.returncode + subprocess.run( # nosec B603 + args=[str(instrumented_binary), method], + env={"POLYDB": str(trace_file), "POLYTRACKER_STDIN_SOURCE": "1"}, + 
stderr=subprocess.STDOUT, + input=_stdin_data.encode("utf-8"), + close_fds=False, ).check_returncode() - program_trace = polytracker.PolyTrackerTrace.load(trace_file) - # Ensure /dev/stdin is in the list of inputs - assert "/dev/stdin" in [x.path for x in program_trace.inputs] - n = 0 +def _test_trace(trace_file: Path) -> None: + """Test the tdag output, checking its inputs to make sure we tainted and + tracked every byte of stdin. Offsets must be ordered as they were read.""" + + program_trace = polytracker.PolyTrackerTrace.load(trace_file) + assert "/dev/stdin" in [input.path for input in program_trace.inputs] + + expected_offset = 0 for input_label in program_trace.tdfile.input_labels(): src_node = program_trace.tdfile.decode_node(input_label) assert isinstance(src_node, polytracker.taint_dag.TDSourceNode) + assert src_node.offset == expected_offset + assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") + expected_offset += 1 - # Requires that offsets are ordered according to read - assert src_node.offset == n + assert expected_offset == len(_stdin_data) - # Ensure all source labels originate from stdin - assert program_trace.tdfile.fd_headers[src_node.idx][0] == Path("/dev/stdin") - n += 1 - # Should be as many source labels as the length of stdin_data - assert n == len(stdin_data) +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_read(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "read") + _test_trace(trace_file) -# Ensure stdin reads in multiple ways are verified -# examples: getc, fgetc, fread, fread_unlocked, fgetc_unlocked, gets, fgets, getdelim, __getdelim, getw +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_fread(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "fread") + _test_trace(trace_file) + + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getc(instrumented_binary: Path, 
trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "getc") + _test_trace(trace_file) + + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getc_unlocked(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "getc_unlocked") + _test_trace(trace_file) + + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getchar(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "getchar") + _test_trace(trace_file) + + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_getchar_unlocked(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "getchar_unlocked") + _test_trace(trace_file) + + +@pytest.mark.program_trace("test_stdin.cpp") +def test_stdin_fgetc(instrumented_binary: Path, trace_file: Path): + _create_tdag_trace(instrumented_binary, trace_file, "fgetc") + _test_trace(trace_file) diff --git a/unittests/src/taintdag/CMakeLists.txt b/unittests/src/taintdag/CMakeLists.txt index b620b84a..fd2daf02 100644 --- a/unittests/src/taintdag/CMakeLists.txt +++ b/unittests/src/taintdag/CMakeLists.txt @@ -6,13 +6,16 @@ add_executable( tdag.cpp taintdag.cpp bitmap_section.cpp - encoding.cpp + control_flow_log_encoding.cpp + taint_label_encoding.cpp fnmapping.cpp - fntrace.cpp union.cpp labeldeq.cpp + section.cpp stream_offset.cpp - control_flow_log.cpp) + string_table.cpp + storage.cpp + ) target_include_directories(${TAINTDAG_UNITTEST} PRIVATE ${CMAKE_SOURCE_DIR}/polytracker/include) diff --git a/unittests/src/taintdag/control_flow_log.cpp b/unittests/src/taintdag/control_flow_log_encoding.cpp similarity index 90% rename from unittests/src/taintdag/control_flow_log.cpp rename to unittests/src/taintdag/control_flow_log_encoding.cpp index fcafe61b..97ce2fb2 100644 --- a/unittests/src/taintdag/control_flow_log.cpp +++ b/unittests/src/taintdag/control_flow_log_encoding.cpp @@ -7,12 +7,13 @@ * the 
LICENSE file found in the root directory of this source tree. */ -#include "taintdag/control_flow_log.h" +#include "taintdag/control_flow_log_encoding.h" #include "taintdag/section.h" + #include -TEST_CASE("Simple varint encoding") { - using namespace taintdag::detail; +TEST_CASE("Simple control flow log varint encoding") { + using namespace taintdag; uint8_t buffer[5]; SECTION("Encode 0") { diff --git a/unittests/src/taintdag/fnmapping.cpp b/unittests/src/taintdag/fnmapping.cpp index 5352da2b..2fabac9f 100644 --- a/unittests/src/taintdag/fnmapping.cpp +++ b/unittests/src/taintdag/fnmapping.cpp @@ -15,26 +15,26 @@ TEST_CASE("Test fnmapping operations") { SECTION("Add unique functions, functions are successfully inserted") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - REQUIRE(functions.add_mapping("foo")); - REQUIRE(functions.add_mapping("bar")); - REQUIRE(functions.add_mapping("baz")); + REQUIRE(functions.add_mapping(4, "foo")); + REQUIRE(functions.add_mapping(55, "bar")); + REQUIRE(functions.add_mapping(1, "baz")); } SECTION("Add unique functions, functions have successive indices") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - REQUIRE(functions.add_mapping("foo").value_or(3) == 0); - REQUIRE(functions.add_mapping("bar").value_or(3) == 1); - REQUIRE(functions.add_mapping("baz").value_or(3) == 2); + REQUIRE(functions.add_mapping(4, "foo").value_or(3) == 0); + REQUIRE(functions.add_mapping(55, "bar").value_or(3) == 1); + REQUIRE(functions.add_mapping(1, "baz").value_or(3) == 2); } SECTION("Add duplicate functions, duplicate functions have the same index") { td::OutputFile of{std::tmpnam(nullptr)}; auto &functions{of.section()}; - auto foo_1{functions.add_mapping("foo").value_or(3)}; - functions.add_mapping("bar"); - functions.add_mapping("baz"); - auto foo_2{functions.add_mapping("foo").value_or(4)}; + auto foo_1{functions.add_mapping(4, "foo").value_or(3)}; + functions.add_mapping(55, "bar"); + 
functions.add_mapping(1, "baz"); + auto foo_2{functions.add_mapping(4, "foo").value_or(4)}; REQUIRE(foo_1 == foo_2); } } \ No newline at end of file diff --git a/unittests/src/taintdag/fntrace.cpp b/unittests/src/taintdag/fntrace.cpp deleted file mode 100644 index a1600e38..00000000 --- a/unittests/src/taintdag/fntrace.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2022-present, Trail of Bits, Inc. - * All rights reserved. - * - * This source code is licensed in accordance with the terms specified in - * the LICENSE file found in the root directory of this source tree. - */ - -#include "taintdag/fntrace.h" - -#include - -TEST_CASE("Test fntrace operations") { - namespace td = taintdag; - SECTION("Log unique events") { - td::OutputFile of{std::tmpnam(nullptr)}; - auto &events{of.section()}; - td::Functions::index_t fnidx{0}; - events.log_fn_event(td::Event::kind_t::entry, fnidx); - events.log_fn_event(td::Event::kind_t::exit, fnidx); - SECTION("Events are successfully written") { - REQUIRE(events.count() == 2); - td::Event entry{*events.begin()}; - REQUIRE(entry.kind == td::Event::kind_t::entry); - REQUIRE(entry.function == fnidx); - td::Event exit{*(events.begin() + 1)}; - REQUIRE(exit.kind == td::Event::kind_t::exit); - REQUIRE(exit.function == fnidx); - } - } -} \ No newline at end of file diff --git a/unittests/src/taintdag/section.cpp b/unittests/src/taintdag/section.cpp new file mode 100644 index 00000000..892590bf --- /dev/null +++ b/unittests/src/taintdag/section.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. 
+ */ + +#include +#include + +#include "taintdag/section.h" + +#include "utils.h" + +namespace taintdag { +TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + // Exposing the members of SectionBase + struct TestSectionBase : public SectionBase { + TestSectionBase(span_t t) : SectionBase{t} {} + + auto write(size_t s) { return SectionBase::write(s); } + + auto offset(SectionBase::span_t::iterator o) { + return SectionBase::offset(o); + } + + auto offset(uint8_t const *p) { return SectionBase::offset(p); } + }; + + std::uint8_t backing[64]; + TestSectionBase sb{backing}; + SectionBase::span_t last; + + REQUIRE(sb.size() == 0); + + // Allocate 1 byte + { + auto ctx = sb.write(1); + REQUIRE(ctx); + last = ctx->mem; + } + REQUIRE(sb.size() == 1); + REQUIRE(sb.offset(last.begin()) == 0); + REQUIRE(sb.offset(&*last.begin()) == 0); + REQUIRE(last.size() == 1); + + // Allocate remainder but 1 byte + auto n = sizeof(backing) - 2; + { + auto ctx = sb.write(n); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1); + REQUIRE(sb.offset(last.begin()) == 1); + REQUIRE(sb.offset(&*last.begin()) == 1); + REQUIRE(last.size() == n); + + // Allocate last byte + { + auto ctx = sb.write(1); + REQUIRE(ctx); + // Allocation is compact + REQUIRE(ctx->mem.begin() == last.end()); + last = ctx->mem; + } + + REQUIRE(sb.size() == n + 1 + 1); + REQUIRE(sb.offset(last.begin()) == n + 1); + REQUIRE(sb.offset(&*last.begin()) == n + 1); + REQUIRE(last.size() == 1); + + // Attempt additional allocation, should fail. + auto ctx = sb.write(1); + REQUIRE(!ctx); + + // If offset is requested for out of bounds memory, just abort. Something + // is seriously wrong. 
+ REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), + test::ErrorExit); + REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); + + REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), + test::ErrorExit); + REQUIRE_THROWS_AS( + sb.offset(reinterpret_cast(&backing + sizeof(backing))), + test::ErrorExit); +} + +TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { + + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + struct Dummy { + int32_t i; + char c; + + Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} + }; + + // Assumptions for the test case. + REQUIRE(alignof(Dummy) == 4); + REQUIRE(sizeof(Dummy) == 8); + + using Section = FixedSizeAlloc; + + const size_t backing_count = 3; + const size_t backing_bytes = backing_count * sizeof(Dummy); + + // To ensure we get correct alignment of the backing + alignas(Dummy) std::uint8_t backing[backing_bytes]; + Section s{backing}; + + REQUIRE(s.entry_size() == sizeof(Dummy)); + REQUIRE(s.align_of == alignof(Dummy)); + REQUIRE(s.size() == 0); + REQUIRE(s.count() == 0); + REQUIRE(s.begin() == s.end()); + + SECTION("Adding instances affect size, count and constructed instance is " + "available") { + // Can add first entry + { + auto ctx = s.construct(999, 'A'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 999); + REQUIRE(ctx->t.c == 'A'); + REQUIRE(s.index(ctx->t) == 0); + } + REQUIRE(s.count() == 1); + REQUIRE(s.size() == sizeof(Dummy)); + + // Can add when there is already an entry but not full. 
+ { + auto ctx = s.construct(33, 'B'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == 33); + REQUIRE(ctx->t.c == 'B'); + REQUIRE(s.index(ctx->t) == 1); + } + REQUIRE(s.count() == 2); + REQUIRE(s.size() == 2 * sizeof(Dummy)); + + // Can fill the backing store with entries + { + auto ctx = s.construct(-1, 'C'); + REQUIRE(ctx); + REQUIRE(ctx->t.i == -1); + REQUIRE(ctx->t.c == 'C'); + REQUIRE(s.index(ctx->t) == 2); + } + REQUIRE(s.count() == 3); + REQUIRE(s.size() == 3 * sizeof(Dummy)); + + // Can't insert beyound capacity + auto ctx = s.construct(-5, 'D'); + REQUIRE(!ctx); + } + + SECTION("Require aligned construction") { + SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; + REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Require size to be a multiple of align_of") { + SectionBase::span_t b1{&backing[0], sizeof(backing) - 1}; + REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + + SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; + REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + + SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; + REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + } + + SECTION("Iteration") { + s.construct(-1, 'a'); + REQUIRE(std::distance(s.begin(), s.end()) == 1); + s.construct(-2, 'b'); + REQUIRE(std::distance(s.begin(), s.end()) == 2); + s.construct(-3, 'c'); + REQUIRE(std::distance(s.begin(), s.end()) == 3); + + // Know that begin is valid due to above + auto &first = *s.begin(); + REQUIRE(first.i == -1); + REQUIRE(first.c == 'a'); + } +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/storage.cpp b/unittests/src/taintdag/storage.cpp new file mode 100644 index 00000000..a94bf001 --- /dev/null +++ b/unittests/src/taintdag/storage.cpp @@ -0,0 +1,40 @@ 
+/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include +#include + +#include "taintdag/storage.h" + +#include "utils.h" + +namespace taintdag { +TEST_CASE("Type properties of FixedSizeFile", "[FixedSizeFile]") { + // Don't want multiple copies referring to the same file + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The FixedSizeFile is currently not move + // constructible/assignable. There is nothing preventing such an + // implementation. Currently there is no need so leave this as is. + REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); +} + +TEST_CASE("Type properties of MMapFile", "[MMapFile]") { + // Don't want multiple copies referring to the same regions + REQUIRE(!std::is_copy_constructible_v); + REQUIRE(!std::is_copy_assignable_v); + + // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. + // Behavior is currently inherited from FixedSizeFile. Should that change, + // the MMapFile would change as well. + REQUIRE(!std::is_move_assignable_v); + REQUIRE(!std::is_move_constructible_v); +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/string_table.cpp b/unittests/src/taintdag/string_table.cpp new file mode 100644 index 00000000..64c26890 --- /dev/null +++ b/unittests/src/taintdag/string_table.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. 
+ */ + +#include + +#include "taintdag/outputfile.h" +#include "taintdag/string_table.h" +#include "taintdag/taint_source.h" + +#include "utils.h" + +namespace taintdag { +TEST_CASE("The Sources and StringTable sections can store source entries", + "[Sources, StringTable]") { + OutputFile of{std::tmpnam(nullptr)}; + auto &sources_section{of.section()}; + auto &string_table{of.section()}; + + SECTION("Can add taint-source entries to the Sources section", + "[Sources, StringTable]") { + int fd = 3; + REQUIRE(!sources_section.mapping_idx(fd)); + + auto s1 = sources_section.add_source("test", fd, 122); + REQUIRE(s1.has_value()); + + auto m = sources_section.mapping_idx(fd); + REQUIRE(m.has_value()); + REQUIRE(*s1 == *m); + + auto m1 = sources_section.get(*m); + REQUIRE(m1.fd == fd); + + REQUIRE(m1.name(string_table) == "test"); + REQUIRE(m1.size == 122); + + int fd2 = 99; + auto s2 = + sources_section.add_source("test2", fd2, SourceEntry::InvalidSize); + REQUIRE(s2.has_value()); + + auto idx2 = sources_section.mapping_idx(fd2); + REQUIRE(idx2.has_value()); + + auto m2 = sources_section.get(*idx2); + REQUIRE(m2.fd == fd2); + REQUIRE(m2.name(string_table) == "test2"); + + REQUIRE(m2.size == SourceEntry::InvalidSize); + } + + WHEN("Adding taint-sources to the Sources section and the string table") { + THEN("Latest wins in terms in case output_file has multiple mappings for " + "the same fd") { + int fd = 1; + sources_section.add_source("first", fd); + sources_section.add_source("second", fd); + + auto mm = sources_section.mapping_idx(fd); + REQUIRE(mm); + + auto m = sources_section.get(*mm); + REQUIRE(m.fd == fd); + REQUIRE(m.name(string_table) == "second"); + } + } +} + +TEST_CASE("StringTable add/iterate", "[StringTable]") { + // To be able to capture error_exits + test::ErrorExitReplace errthrow; + + OutputFile of{std::tmpnam(nullptr)}; + auto &string_table{of.section()}; + + SECTION("StringTable properties") { + // squish everything together as close as we can + 
REQUIRE(StringTable::align_of == 2UL); + // no elements in the string table to start + REQUIRE(string_table.size() == 0); + REQUIRE(string_table.begin() == string_table.end()); + } + + WHEN("A string is added") { + THEN("It should also be retrievable from the offset of its length") { + auto ofs = string_table.add_string("Hello"); + REQUIRE(ofs); + REQUIRE(string_table.from_offset(*ofs) == "Hello"); + + auto ofs2 = string_table.add_string("World"); + REQUIRE(ofs2); + REQUIRE(string_table.from_offset(*ofs2) == "World"); + } + } + + WHEN("Multiple strings are added") { + THEN("They should be iterable using begin() and end()") { + string_table.add_string("a"); + string_table.add_string("b"); + string_table.add_string("c"); + string_table.add_string("d"); + + std::vector res; + std::copy(string_table.begin(), string_table.end(), + std::back_inserter(res)); + REQUIRE(res.size() == 4); + REQUIRE(res[0] == "a"); + REQUIRE(res[1] == "b"); + REQUIRE(res[2] == "c"); + REQUIRE(res[3] == "d"); + } + } + + WHEN("Adding to the string table") { + THEN("A string bigger than the maximum string size will be truncated and " + "stored") { + // display the info logging + spdlog::set_level(spdlog::level::debug); + + auto len = StringTable::max_entry_size + 10; + std::string too_big(len, 'A'); + REQUIRE_NOTHROW([&]() { + auto offset = string_table.add_string(too_big); + REQUIRE(offset.has_value()); + + std::string_view result = string_table.from_offset(offset.value()); + REQUIRE(result.size() + sizeof(StringTable::length_t) == + StringTable::max_entry_size - 1); + }()); + } + + THEN("Can fill the remainder of the string table to capacity with many " + "short strings") { + std::string s{"a"}; + while (auto os = string_table.add_string(s)) { + if (!os.has_value()) { + break; + } + + auto offset = os.value(); + REQUIRE(string_table.size() > offset); + REQUIRE(offset <= string_table.max_offset); + + auto result = string_table.from_offset(offset); + REQUIRE(s.compare(result.data()) == 0); + } 
+ } + + THEN("Cannot add more strings if the table is full") { + std::string onemore{"excuse me may I have another\n"}; + REQUIRE_NOTHROW([&]() { + auto should_be_empty = string_table.add_string(onemore); + REQUIRE(!should_be_empty.has_value()); + }); + } + } +} +} // namespace taintdag \ No newline at end of file diff --git a/unittests/src/taintdag/encoding.cpp b/unittests/src/taintdag/taint_label_encoding.cpp similarity index 89% rename from unittests/src/taintdag/encoding.cpp rename to unittests/src/taintdag/taint_label_encoding.cpp index 9085394b..749854aa 100644 --- a/unittests/src/taintdag/encoding.cpp +++ b/unittests/src/taintdag/taint_label_encoding.cpp @@ -7,8 +7,8 @@ using namespace taintdag; TEST_CASE("Encoding decoding") { - for (auto i=0;i<100000;i++) { - auto [t,_] = test::rand_taint(); + for (auto i = 0; i < 100000; i++) { + auto [t, _] = test::rand_taint(); auto encoded = taintdag::encode(t); Taint decoded = taintdag::decode(encoded); REQUIRE(decoded == t); @@ -25,7 +25,7 @@ TEST_CASE("Affects control flow") { } TEST_CASE("Basic sanity checks") { - for (size_t i=0;i<100000;i++) { + for (size_t i = 0; i < 100000; i++) { auto [st, _] = test::random_source_taint(); auto encoded = encode(st); REQUIRE((encoded >> source_taint_bit_shift)); @@ -34,15 +34,13 @@ TEST_CASE("Basic sanity checks") { } } - TEST_CASE("Compare equal ignore cf") { - for (size_t i=0;i<1000;i++) { + for (size_t i = 0; i < 1000; i++) { auto [t1, _1] = test::rand_taint(); auto [t2, _2] = test::rand_taint(); if (t1 == t2) continue; - auto e1 = encode(t1); // Affects control flow auto e1cf = add_affects_control_flow(e1); @@ -54,6 +52,5 @@ TEST_CASE("Compare equal ignore cf") { REQUIRE(equal_ignore_cf(e1, e1cf)); REQUIRE(!equal_ignore_cf(e1, e2)); REQUIRE(!equal_ignore_cf(e1cf, e2)); - } } \ No newline at end of file diff --git a/unittests/src/taintdag/tdag.cpp b/unittests/src/taintdag/tdag.cpp index 6e6285c7..b6527d52 100644 --- a/unittests/src/taintdag/tdag.cpp +++ 
b/unittests/src/taintdag/tdag.cpp @@ -1,392 +1,112 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + #include +#include -#include "taintdag/outputfile.h" -#include "taintdag/section.h" -#include "taintdag/storage.h" +#include "taintdag/bitmap_section.h" +#include "taintdag/control_flow_log.h" +#include "taintdag/fnmapping.h" +#include "taintdag/labels.h" +#include "taintdag/sink.h" +#include "taintdag/stream_offset.h" #include "taintdag/string_table.h" +#include "taintdag/taint.h" #include "taintdag/taint_source.h" -#include "taintdag/labels.h" +#include "taintdag/util.h" #include "utils.h" namespace taintdag { - -TEST_CASE("Test TDAG", "Integration") { - OutputFile tdg("filename.bin"); - auto offset1 = tdg.section().add_string("Hello"); - auto offset2 = tdg.section().add_string("World!"); - REQUIRE(offset1 != offset2); - - auto idx = tdg.section().add_source("sourcename", -1); - REQUIRE(idx); - REQUIRE(*idx == 0); - auto idx2 = tdg.section().add_source("next-source", 2); - REQUIRE(*idx2 == 1); -} - -TEST_CASE("Type properties FixedSizeFile", "[FixedSizeFile]") { - // Don't want multiple copies referring to the same file - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The FixedSizeFile is currently not move - // constructible/assignable. There is nothing preventing such an - // implementation. Currently there is no need so leave this as is. - REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); -} - -TEST_CASE("Type properties MMapFile", "[MMapFile]") { - // Don't want multiple copies referring to the same regions - REQUIRE(!std::is_copy_constructible_v); - REQUIRE(!std::is_copy_assignable_v); - - // NOTE(hbrodin): The MMapFile is currently not move constructible/assignable. 
- // Behavior is currently inherited from FixedSizeFile. Should that change, - // the MMapFile would change as well. - REQUIRE(!std::is_move_assignable_v); - REQUIRE(!std::is_move_constructible_v); -} - -TEST_CASE("SectionBase operations are consistent", "[SectionBase]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - // Exposing the members of SectionBase - struct TestSectionBase : public SectionBase { - TestSectionBase(span_t t) : SectionBase{t} {} - - auto write(size_t s) { return SectionBase::write(s); } - - auto offset(SectionBase::span_t::iterator o) { - return SectionBase::offset(o); - } - - auto offset(uint8_t const *p) { return SectionBase::offset(p); } - }; - - std::uint8_t backing[64]; - TestSectionBase sb{backing}; - SectionBase::span_t last; - - REQUIRE(sb.size() == 0); - - // Allocate 1 byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - last = ctx->mem; - } - REQUIRE(sb.size() == 1); - REQUIRE(sb.offset(last.begin()) == 0); - REQUIRE(sb.offset(&*last.begin()) == 0); - REQUIRE(last.size() == 1); - - // Allocate remainder but 1 byte - auto n = sizeof(backing) - 2; - { - auto ctx = sb.write(n); - REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } - - REQUIRE(sb.size() == n + 1); - REQUIRE(sb.offset(last.begin()) == 1); - REQUIRE(sb.offset(&*last.begin()) == 1); - REQUIRE(last.size() == n); - - // Allocate last byte - { - auto ctx = sb.write(1); - REQUIRE(ctx); - // Allocation is compact - REQUIRE(ctx->mem.begin() == last.end()); - last = ctx->mem; - } - - REQUIRE(sb.size() == n + 1 + 1); - REQUIRE(sb.offset(last.begin()) == n + 1); - REQUIRE(sb.offset(&*last.begin()) == n + 1); - REQUIRE(last.size() == 1); - - // Attempt additional allocation, should fail. - auto ctx = sb.write(1); - REQUIRE(!ctx); - - // If offset is requirested for out of bounds memory, just abort. Something - // is seriously wrong. 
- REQUIRE_THROWS_AS(sb.offset(SectionBase::span_t::iterator{}), - test::ErrorExit); - REQUIRE_THROWS_AS(sb.offset(last.end()), test::ErrorExit); - - REQUIRE_THROWS_AS(sb.offset(static_cast(nullptr)), - test::ErrorExit); - REQUIRE_THROWS_AS( - sb.offset(reinterpret_cast(&backing + sizeof(backing))), - test::ErrorExit); -} - -TEST_CASE("FixedSizeAlloc operations are consistent", "[FixedSizeAlloc]") { - - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - struct Dummy { - int32_t i; - char c; - - Dummy(int32_t ii, char cc) : i{ii}, c{cc} {} - }; - - // Assumptions for the test case. - REQUIRE(alignof(Dummy) == 4); - REQUIRE(sizeof(Dummy) == 8); - - using Section = FixedSizeAlloc; - - const size_t backing_count = 3; - const size_t backing_bytes = backing_count * sizeof(Dummy); - - // To ensure we get correct alignment of the backing - alignas(Dummy) std::uint8_t backing[backing_bytes]; - Section s{backing}; - - REQUIRE(s.entry_size() == sizeof(Dummy)); - REQUIRE(s.align_of == alignof(Dummy)); - REQUIRE(s.size() == 0); - REQUIRE(s.count() == 0); - REQUIRE(s.begin() == s.end()); - - SECTION("Adding instances affect size, count and constructed instance is " - "available") { - // Can add first entry - { - auto ctx = s.construct(999, 'A'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 999); - REQUIRE(ctx->t.c == 'A'); - REQUIRE(s.index(ctx->t) == 0); - } - REQUIRE(s.count() == 1); - REQUIRE(s.size() == sizeof(Dummy)); - - // Can add when there is already an entry but not full. 
- { - auto ctx = s.construct(33, 'B'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == 33); - REQUIRE(ctx->t.c == 'B'); - REQUIRE(s.index(ctx->t) == 1); - } - REQUIRE(s.count() == 2); - REQUIRE(s.size() == 2 * sizeof(Dummy)); - - // Can fill the backing store with entries - { - auto ctx = s.construct(-1, 'C'); - REQUIRE(ctx); - REQUIRE(ctx->t.i == -1); - REQUIRE(ctx->t.c == 'C'); - REQUIRE(s.index(ctx->t) == 2); - } - REQUIRE(s.count() == 3); - REQUIRE(s.size() == 3 * sizeof(Dummy)); - - // Can't insert beyound capacity - auto ctx = s.construct(-5, 'D'); - REQUIRE(!ctx); +TEST_CASE("Test basic TDAG construction", "[Integration]") { + using SourceLabelIndexSection = BitmapSectionBase<5, BitCount{max_label} + 1>; + using ConcreteOutputFile = + OutputFile; + ConcreteOutputFile tdg("test.tdag"); + + SECTION("Sources") { + auto idx = tdg.section().add_source("sourcename", -1); + REQUIRE(idx); + REQUIRE(*idx == 0); + REQUIRE(tdg.section().count() == 1); + auto idx2 = tdg.section().add_source("next-source", 2); + REQUIRE(*idx2 == 1); + REQUIRE(tdg.section().count() == 2); } - SECTION("Require aligned construction") { - SectionBase::span_t b1{&backing[1], sizeof(backing) - 7}; - REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + SECTION("Labels") { + // 25 is randomly chosen; ranges can be bigger + unsigned long length = rand() % 25 + 1; - SectionBase::span_t b2{&backing[2], sizeof(backing) - 6}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + // label range represents a data structure like an array + auto test_range = + tdg.section().create_source_labels(-1, -1, length); + REQUIRE(test_range.first != test_range.second); - SectionBase::span_t b3{&backing[3], sizeof(backing) - 5}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); - } + // todo(kaoudis) this seems like it should be specific, on the order of the + // number of items in the range. why isn't it? 
+ auto size_with_range = tdg.section().count(); + REQUIRE(size_with_range > 0); - SECTION("Require size to be a multiple of align_of") { - SectionBase::span_t b1{&backing[0], sizeof(backing) - 1}; - REQUIRE_THROWS_AS(Section{b1}, test::ErrorExit); + tdg.section().set_range(BitIndex{test_range.first}, + BitCount{length}); + REQUIRE(tdg.section().size() > 0); - SectionBase::span_t b2{&backing[0], sizeof(backing) - 2}; - REQUIRE_THROWS_AS(Section{b2}, test::ErrorExit); + // label union represents a step in the progression of taint + auto test_union = + tdg.section().union_taint(test_range.first, test_range.second); + REQUIRE(test_union != test_range.first); + REQUIRE(test_union != test_range.second); - SectionBase::span_t b3{&backing[0], sizeof(backing) - 3}; - REQUIRE_THROWS_AS(Section{b3}, test::ErrorExit); + // added just one new label - the union + REQUIRE(tdg.section().count() == size_with_range + 1); } - SECTION("Iteration") { - s.construct(-1, 'a'); - REQUIRE(std::distance(s.begin(), s.end()) == 1); - s.construct(-2, 'b'); - REQUIRE(std::distance(s.begin(), s.end()) == 2); - s.construct(-3, 'c'); - REQUIRE(std::distance(s.begin(), s.end()) == 3); - - // Know that begin is valid due to above - auto &first = *s.begin(); - REQUIRE(first.i == -1); - REQUIRE(first.c == 'a'); + SECTION("String Table") { + auto offset1 = tdg.section().add_string("Hello"); + auto offset2 = tdg.section().add_string("World!"); + REQUIRE(offset1 != offset2); + // for the string table, size() yields the size of all included entries, + // plus the size of the offsets to them + REQUIRE(tdg.section().size() == 16); } -} - -// Dummy OutputFile, to allow retrieving the StringTable -struct DummyOf { - template T §ion() { return st; } - - StringTable &st; -}; - -TEST_CASE("Taint sources basic usage", "[Sources]") { - - const size_t max_sources = 4; - const size_t allocation_size = max_sources * sizeof(SourceEntry); - alignas(SourceEntry) uint8_t backing[allocation_size]; - const size_t 
strings_size = 128; - uint8_t string_backing[strings_size]; - - // NOTE(hbrodin): .output_file arg is not used in StringTable so just - // construct an int. - int dummy = 1; - StringTable st( - SectionArg{.output_file = dummy, .range = string_backing}); - - DummyOf of{st}; - - Sources src{SectionArg{.output_file = of, .range = backing}}; - - // TODO(hbrodin): Refactor below. - - SECTION("Add and retrieve mappings") { - int fd = 3; - REQUIRE(!src.mapping_idx(fd)); - - auto s1 = src.add_source("test", fd, 122); - REQUIRE(s1); - auto m = src.mapping_idx(fd); - REQUIRE(m); - REQUIRE(*s1 == *m); - - auto m1 = src.get(*m); - REQUIRE(m1.fd == fd); - REQUIRE(m1.name(st) == "test"); - REQUIRE(m1.size == 122); - - int fd2 = 99; - auto s2 = src.add_source("test2", fd2, SourceEntry::InvalidSize); - REQUIRE(s2); - auto idx2 = src.mapping_idx(fd2); - REQUIRE(idx2); - auto m2 = src.get(*idx2); - REQUIRE(m2.fd == fd2); - REQUIRE(m2.name(st) == "test2"); - REQUIRE(m2.size == SourceEntry::InvalidSize); + SECTION("Sinks") { + tdg.section().log_single(-1, -1, 0); + REQUIRE(tdg.section().count() == 1); } - SECTION("Latest wins in case of multiple mappings for same fd") { - int fd = 1; - src.add_source("first", fd); - src.add_source("second", fd); - - auto mm = src.mapping_idx(fd); - REQUIRE(mm); - - auto m = src.get(*mm); - REQUIRE(m.fd == fd); - REQUIRE(m.name(st) == "second"); - } -} - -TEST_CASE("StringTable add/iterate", "[StringTable]") { - // To be able to capture error_exits - test::ErrorExitReplace errthrow; - - alignas(StringTable::length_t) uint8_t backing[64]; - - int dummy = 1; - StringTable st{SectionArg{.output_file = dummy, .range = backing}}; - - SECTION("Initial properties") { - REQUIRE(StringTable::align_of == alignof(StringTable::length_t)); - REQUIRE(st.size() == 0); - REQUIRE(st.begin() == st.end()); - - REQUIRE(sizeof(StringTable::length_t) <= sizeof(StringTable::offset_t)); - } - - SECTION("Adding/retrieving") { - auto ofs = st.add_string("Hello"); - 
REQUIRE(ofs); - REQUIRE(st.from_offset(*ofs) == "Hello"); - - auto ofs2 = st.add_string("World"); - REQUIRE(ofs2); - REQUIRE(st.from_offset(*ofs2) == "World"); - } - - SECTION("Iteration") { - st.add_string("a"); - st.add_string("b"); - st.add_string("c"); - st.add_string("d"); - - std::vector res; - std::copy(st.begin(), st.end(), std::back_inserter(res)); - REQUIRE(res.size() == 4); - REQUIRE(res[0] == "a"); - REQUIRE(res[1] == "b"); - REQUIRE(res[2] == "c"); - REQUIRE(res[3] == "d"); - } - - SECTION("Capacity") { - SECTION("Fill with one string") { - std::string s(sizeof(backing) - sizeof(StringTable::length_t), 'A'); - REQUIRE(st.add_string(s)); - std::string s2{1, 'B'}; - REQUIRE(!st.add_string(s2)); - } - - SECTION("Fill with many short strings") { - std::string s{"a"}; - size_t n = 0; - while (st.add_string(s)) { - ++n; - } - auto allocsize = sizeof(StringTable::length_t) + s.size(); - // Per string allocation size - if (auto rem = allocsize % StringTable::align_of; rem != 0) { - allocsize += StringTable::align_of - rem; - } - - REQUIRE(n == sizeof(backing) / allocsize); - } - } - - SECTION("Errors") { - // Trying to store a string larger than can be represented by the length_t - auto len = - static_cast(std::numeric_limits::max()) + - 1; - char const *strp = reinterpret_cast(&backing[0]); - REQUIRE_THROWS_AS(st.add_string({strp, len}), test::ErrorExit); - - // Allocation is larger than can be represented by the offset type. - auto alloc_size = - static_cast(std::numeric_limits::max()) + - 1; - auto span = StringTable::span_t{&backing[0], alloc_size}; - REQUIRE_THROWS_AS( - (StringTable{SectionArg{.output_file = dummy, .range = span}}), - test::ErrorExit); + SECTION("Tainted Control Flow (includes String Table and Functions)") { + int function_id = 1; + + // just before enter_function, cf __polytracker_enter_function + // (we pair these always - function trace should only contain fns with + // enter and leave events!) 
+ tdg.section().add_mapping(function_id, "hello_world"); + REQUIRE(tdg.section().count() == 1); + + // adds a new entry. entry size is dependent on varint_encoding, which + // uses up to 5 bytes packed into a size_t to represent a buffer that + // was originally filled with uint8_t's. + tdg.section().enter_function(function_id); + auto size_with_one_entry = tdg.section().size(); + REQUIRE(size_with_one_entry > 0); + + // adds a new entry + tdg.section().tainted_control_flow(-1, function_id); + auto size_with_two_entries = tdg.section().size(); + REQUIRE((size_with_two_entries / 2) >= size_with_one_entry); + + // adds a new entry + tdg.section().leave_function(function_id); + auto size_with_three_entries = tdg.section().size(); + REQUIRE((size_with_three_entries / 3) >= size_with_one_entry); } } } // namespace taintdag \ No newline at end of file