diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..20d75f4 --- /dev/null +++ b/.clang-format @@ -0,0 +1,64 @@ +--- +BasedOnStyle: Google +AccessModifierOffset: -2 +ConstructorInitializerIndentWidth: 2 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AlwaysBreakTemplateDeclarations: true +AlwaysBreakBeforeMultilineStrings: false +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: true +BinPackParameters: true +ColumnLimit: 120 +ConstructorInitializerAllOnOneLineOrOnePerLine: true +DerivePointerBinding: false +PointerBindsToType: true +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 60 +PenaltyBreakString: 100 +PenaltyBreakFirstLessLess: 1000 +PenaltyExcessCharacter: 1000 +PenaltyReturnTypeOnItsOwnLine: 70 +SpacesBeforeTrailingComments: 2 +Cpp11BracedListStyle: false +Standard: Auto +IndentWidth: 2 +TabWidth: 2 +UseTab: Never +IndentFunctionDeclarationAfterType: false +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpaceAfterControlStatementKeyword: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 +SortIncludes: false +SpaceAfterCStyleCast: false + +# Configure each individual brace in BraceWrapping +BreakBeforeBraces: Custom + +# Control of individual brace wrapping cases +BraceWrapping: { + AfterClass: 'true' + AfterControlStatement: 'true' + AfterEnum : 'true' + AfterFunction : 'true' + AfterNamespace : 'true' + AfterStruct : 'true' + AfterUnion : 'true' + BeforeCatch : 'true' + BeforeElse : 'true' + IndentBraces : 'false' +} \ No newline at end of file diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..6f40a77 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,49 @@ +--- +Checks: '-*, + performance-*, + llvm-namespace-comment, + modernize-redundant-void-arg, + modernize-use-nullptr, + modernize-use-default, + modernize-use-override, + modernize-loop-convert, + readability-named-parameter, + readability-redundant-smartptr-get, + readability-redundant-string-cstr, + readability-simplify-boolean-expr, + readability-container-size-empty, + readability-identifier-naming, + ' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +CheckOptions: + - key: llvm-namespace-comment.ShortNamespaceLines + value: '10' + - key: llvm-namespace-comment.SpacesBeforeComments + value: '2' + - key: readability-braces-around-statements.ShortStatementLines + value: '2' + # type names + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + # method names + - key: readability-identifier-naming.MethodCase + value: camelBack + # variable names + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.ClassMemberSuffix + value: '_' + # const static or global variables are UPPER_CASE + - key: readability-identifier-naming.EnumConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.StaticConstantCase + value: UPPER_CASE + - key: 
readability-identifier-naming.ClassConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.GlobalVariableCase + value: UPPER_CASE \ No newline at end of file diff --git a/.flake8 b/.flake8 index 1d0f20d..7877ccc 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 160 -exclude = tests/* +exclude = tests/* uw_detection/src/yolov3 max-complexity = 10 # We import modules here to make the API cleaner per-file-ignores = **/recognize_speech/__init__.py:F401 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4e0e4d4..0cf493d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,6 +7,7 @@ jobs: env: AFTER_SCRIPT: 'cd $target_ws && rosenv && catkin build $(catkin list --depends-on roslint -u) --no-deps --no-status --verbose --make-args roslint' CATKIN_LINT: true + CATKIN_LINT_ARGS: "--ignore missing_directory --ignore external_link_directory" ROS_DISTRO: melodic ROS_REPO: main strategy: @@ -15,7 +16,9 @@ - {} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 + with: + submodules: true - uses: 'ros-industrial/industrial_ci@master' env: ${{matrix.env}} lint: diff --git a/uw_detection/CMakeLists.txt b/uw_detection/CMakeLists.txt new file mode 100644 index 0000000..836d796 --- /dev/null +++ b/uw_detection/CMakeLists.txt @@ -0,0 +1,70 @@ +cmake_minimum_required(VERSION 2.8.12) +project(uw_detection) + +find_package(catkin REQUIRED COMPONENTS + cv_bridge + geometry_msgs + image_transport + message_generation + message_filters + message_runtime + roscpp + sensor_msgs + std_msgs + std_srvs + tf2 + tf2_eigen + visualization_msgs) + +catkin_python_setup() + +catkin_package() + + +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) +set(CMAKE_CXX_STANDARD 11) + +find_package(OpenCV REQUIRED) +include_directories(${OpenCV_INCLUDE_DIRS} + ${catkin_INCLUDE_DIRS} + include) + +find_package(CUDA) + +if (CUDA_FOUND) +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) + + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + message("embed_platform on") + include_directories(/usr/local/cuda/targets/aarch64-linux/include) + link_directories(/usr/local/cuda/targets/aarch64-linux/lib) +else() + message("embed_platform off") + include_directories(/usr/local/cuda/include) + link_directories(/usr/local/cuda/lib64) +endif() + + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") + +cuda_add_library(yololayer SHARED src/yolov3/yololayer.cu) + +add_library(yolov3-spp src/yolov3/yolov3-spp.cpp) +target_link_libraries(yolov3-spp nvinfer cudart yololayer ${OpenCV_LIBS}) + +add_library(yolov3 src/yolov3/yolov3.cpp) +target_link_libraries(yolov3 nvinfer cudart yololayer ${OpenCV_LIBS}) + +add_executable(yolov3_spp_prepare_plan src/yolov3_spp_prepare_plan.cpp) +target_link_libraries(yolov3_spp_prepare_plan yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_executable(yolov3_prepare_plan src/yolov3_prepare_plan.cpp) +target_link_libraries(yolov3_prepare_plan yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_executable(yolo_ros_image_node src/yolo_ros_image_node.cpp) +target_link_libraries(yolo_ros_image_node yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_definitions(-O2 -pthread) +endif() + diff --git a/uw_detection/README.md b/uw_detection/README.md new file mode 100644 index 0000000..67778c5 --- /dev/null +++ b/uw_detection/README.md @@ -0,0 +1,17 @@ +# 
uw_detection + +Wrappers for detection models, plus tools for preparing data and training. + +The TensorRT YOLOv3 implementations are taken from [tensorrtx](https://github.com/wang-xinyu/tensorrtx), with additional modifications based on [this implementation](https://github.com/jkjung-avt/tensorrt_demos/blob/master/plugins/yolo_layer.cu#L39). + +## Usage + +### YOLO + +All of the YOLO targets described here are only available when CMake can detect a working CUDA and TensorRT installation, so ensure these are set up correctly before beginning. They cannot be run on the CPU. + +These implementations come with a helper for loading weights from the Ultralytics PyTorch-trained models. Follow the steps in [their readme](https://github.com/wang-xinyu/tensorrtx/blob/master/yolov3/README.md) (the process involves using one of the `gen_wts.py` scripts to load the Torch graph and dump it into the necessary format). You may need to change the CUDA device set in the script. + +TensorRT requires that you precompute an inference graph ("engine") before you can actually use the weights you've obtained. The `*_prepare_plan.cpp` targets take care of this. They expect the weights file to be placed in the `share` folder at the root of the package. When they've run successfully (only required once), they'll output the serialized engine into the `share` folder. Note that if you're using a customized class list, you'll need to adjust the definition for the number of classes in `yololayer.h`. + +Now you can test the model on a ROS image topic using `yolo_ros_image_node.cpp`. It's configured to take images from Fetch's head camera topic and feed them through YOLOv3, printing the results to the console and drawing them in a CV window. It'll work in simulation without changes, assuming you've followed the previous steps for the Ultralytics-provided yolov3 checkpoint. A rough sketch of the full workflow is shown below. 
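+ +Putting it together, the workflow looks roughly like this (a sketch only: it assumes the targets are run via `rosrun` and that the `.wts` filename matches what `gen_wts.py` produced; adjust names and paths to your setup): + +```sh +# 1. Dump the PyTorch checkpoint to .wts format (inside your ultralytics/yolov3 clone) +python gen_wts.py yolov3-spp-ultralytics.pt +# 2. Copy the weights into this package's share folder +cp yolov3-spp_ultralytics68.wts $(rospack find uw_detection)/share/ +# 3. Serialize the TensorRT engine (one-time step) +rosrun uw_detection yolov3_spp_prepare_plan +# 4. Run the detector against the head camera topic +rosrun uw_detection yolo_ros_image_node +```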
\ No newline at end of file diff --git a/uw_detection/include/uw_detection/Utils.h b/uw_detection/include/uw_detection/Utils.h new file mode 100644 index 0000000..0de663c --- /dev/null +++ b/uw_detection/include/uw_detection/Utils.h @@ -0,0 +1,94 @@ +#ifndef __TRT_UTILS_H_ +#define __TRT_UTILS_H_ + +#include <iostream> +#include <vector> +#include <algorithm> +#include <cudnn.h> + +#ifndef CUDA_CHECK + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) { \ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ + assert(0); \ + } \ + } + +#endif + +namespace Tn +{ + class Profiler : public nvinfer1::IProfiler + { + public: + void printLayerTimes(int iterationTimes) + { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) + { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / iterationTimes); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime / iterationTimes); + } + private: + typedef std::pair<std::string, float> Record; + std::vector<Record> mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) + { + auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + }; + + // Logger for TensorRT info/warning/errors + class Logger : public nvinfer1::ILogger + { + public: + + Logger(): Logger(Severity::kWARNING) {} + + Logger(Severity severity): reportableSeverity(severity) {} + + void log(Severity severity, const char* msg) override + { + // suppress messages with severity enum value greater than the reportable + if (severity > reportableSeverity) return; + + switch (severity) + { + case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; + case Severity::kERROR: std::cerr << "ERROR: "; break; + case Severity::kWARNING: std::cerr << "WARNING: "; break; + case Severity::kINFO: std::cerr << "INFO: "; break; + default: std::cerr << "UNKNOWN: "; break; + } + std::cerr << msg << std::endl; + } + + Severity reportableSeverity{Severity::kWARNING}; + }; + + template <typename T> + void write(char*& buffer, const T& val) + { + *reinterpret_cast<T*>(buffer) = val; + buffer += sizeof(T); + } + + template <typename T> + void read(const char*& buffer, T& val) + { + val = *reinterpret_cast<const T*>(buffer); + buffer += sizeof(T); + } +} + +#endif \ No newline at end of file diff --git a/uw_detection/include/uw_detection/logging.h b/uw_detection/include/uw_detection/logging.h new file mode 100644 index 0000000..602b69f --- /dev/null +++ b/uw_detection/include/uw_detection/logging.h @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TENSORRT_LOGGING_H +#define TENSORRT_LOGGING_H + +#include "NvInferRuntimeCommon.h" +#include <cassert> +#include <ctime> +#include <iomanip> +#include <iostream> +#include <ostream> +#include <sstream> +#include <string> + +using Severity = nvinfer1::ILogger::Severity; + +class LogStreamConsumerBuffer : public std::stringbuf +{ +public: + LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mOutput(stream) + , mPrefix(prefix) + , mShouldLog(shouldLog) + { + } + + LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) + : mOutput(other.mOutput) + { + } + + ~LogStreamConsumerBuffer() + { + // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence + // std::streambuf::pptr() gives a pointer to the current position of the output sequence + // if the pointer to the beginning is not equal to the pointer to the current position, + // call putOutput() to log the output to the stream + if (pbase() != pptr()) + { + putOutput(); + } + } + + // synchronizes the stream buffer and returns 0 on success + // synchronizing the stream buffer consists of inserting the buffer contents into the stream, + // resetting the buffer and flushing the stream + virtual int sync() + { + putOutput(); + return 0; + } + + void putOutput() + { + if (mShouldLog) + { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(&timestamp); + std::cout << "["; + std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; + // std::stringbuf::str() gets the string contents of the buffer + // insert the buffer contents prepended by the appropriate prefix into the stream + mOutput << mPrefix << str(); + // set the buffer to empty + str(""); + // flush the stream + mOutput.flush(); + } + } + + void setShouldLog(bool shouldLog) + { + mShouldLog = shouldLog; + } + +private: + std::ostream& mOutput; + std::string mPrefix; + bool mShouldLog; +}; + +//! +//! \class LogStreamConsumerBase +//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer +//! +class LogStreamConsumerBase +{ +public: + LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mBuffer(stream, prefix, shouldLog) + { + } + +protected: + LogStreamConsumerBuffer mBuffer; +}; + +//! +//! \class LogStreamConsumer +//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. +//! Order of base classes is LogStreamConsumerBase and then std::ostream. +//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field +//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. +//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. +//! Please do not change the order of the parent classes. +//! +class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream +{ +public: + //! \brief Creates a LogStreamConsumer which logs messages with level severity. + //! 
Reportable severity determines if the messages are severe enough to be logged. + LogStreamConsumer(Severity reportableSeverity, Severity severity) + : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(severity <= reportableSeverity) + , mSeverity(severity) + { + } + + LogStreamConsumer(LogStreamConsumer&& other) + : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(other.mShouldLog) + , mSeverity(other.mSeverity) + { + } + + void setReportableSeverity(Severity reportableSeverity) + { + mShouldLog = mSeverity <= reportableSeverity; + mBuffer.setShouldLog(mShouldLog); + } + +private: + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + static std::string severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + bool mShouldLog; + Severity mSeverity; +}; + +//! \class Logger +//! +//! \brief Class which manages logging of TensorRT tools and samples +//! +//! \details This class provides a common interface for TensorRT tools and samples to log information to the console, +//! and supports logging two types of messages: +//! +//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) +//! - Test pass/fail messages +//! +//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is +//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. +//! +//! In the future, this class could be extended to support dumping test results to a file in some standard format +//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). +//! +//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. + +class Logger : public nvinfer1::ILogger +{ +public: + Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! 
+ nvinfer1::ILogger& getTRTLogger() + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! \param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) + { + auto cmdline = genCmdlineString(argc, argv); + return defineTest(name, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! 
+ static void reportTestEnd(const TestAtom& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int reportPass(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int reportFail(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int reportWaive(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int reportTest(const TestAtom& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(const TestAtom& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! + static std::string genCmdlineString(int argc, char const* const* argv) + { + std::stringstream ss; + for (int i = 0; i < argc; i++) + { + if (i > 0) + ss << " "; + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; + +namespace +{ + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! 
+inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +//! ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} + +} // anonymous namespace + +#endif // TENSORRT_LOGGING_H diff --git a/uw_detection/include/uw_detection/yololayer.h b/uw_detection/include/uw_detection/yololayer.h new file mode 100644 index 0000000..91af21e --- /dev/null +++ b/uw_detection/include/uw_detection/yololayer.h @@ -0,0 +1,158 @@ +#ifndef _YOLO_LAYER_H +#define _YOLO_LAYER_H + +#include <assert.h> +#include <math.h> +#include <string.h> +#include <vector> +#include "NvInfer.h" +#include <iostream> +#include <string> + +namespace Yolo +{ + static constexpr int CHECK_COUNT = 3; + static constexpr float IGNORE_THRESH = 0.1f; + static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; + static constexpr int INPUT_H = 608; + static constexpr int INPUT_W = 608; + + struct YoloKernel + { + int width; + int height; + float anchors[CHECK_COUNT*2]; + }; + + static constexpr YoloKernel yolo1 = { + INPUT_W / 32, + INPUT_H / 32, + {116,90, 156,198, 373,326} + }; + static constexpr YoloKernel yolo2 = { + INPUT_W / 16, + INPUT_H / 16, + {30,61, 62,45, 59,119} + }; + static constexpr YoloKernel yolo3 = { + INPUT_W / 8, + INPUT_H / 8, + {10,13, 16,30, 33,23} + }; + + static constexpr int LOCATIONS = 4; + struct alignas(float) Detection{ + // x y w h + float bbox[LOCATIONS]; + float det_confidence; + float class_id; + float class_confidence; + }; +} + + +namespace nvinfer1 +{ + class YoloLayerPlugin: public IPluginV2IOExt + { + public: + explicit YoloLayerPlugin(int num_classes); + YoloLayerPlugin(const void* data, size_t length); + + ~YoloLayerPlugin(); + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + virtual void terminate() override {}; + + virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + + virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + virtual size_t getSerializationSize() const override; + + virtual void serialize(void* buffer) const override; + + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + void destroy() override; + + IPluginV2IOExt* clone() const override; + + void setPluginNamespace(const char* pluginNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* 
inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; + + void detachFromContext() override; + + private: + void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); + int mClassCount; + int mKernelCount; + std::vector<Yolo::YoloKernel> mYoloKernel; + int mThreadCount = 256; + const char* mPluginNamespace; + }; + + class YoloPluginCreator : public IPluginCreator + { + public: + YoloPluginCreator(); + + ~YoloPluginCreator() override = default; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + + void setPluginNamespace(const char* libNamespace) override + { + mNamespace = libNamespace; + } + + const char* getPluginNamespace() const override + { + return mNamespace.c_str(); + } + + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector<PluginField> mPluginAttributes; + }; + REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); + + +}; + +#endif diff --git a/uw_detection/include/uw_detection/yolov3-spp.h b/uw_detection/include/uw_detection/yolov3-spp.h new file mode 100644 index 0000000..a39da6a --- /dev/null +++ b/uw_detection/include/uw_detection/yolov3-spp.h @@ -0,0 +1,13 @@ +#pragma once +#include "NvInfer.h" +#include <opencv2/opencv.hpp> +#include <uw_detection/yololayer.h> + +void APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream, const std::string& weights_path); + +void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize); + +cv::Mat preprocess_img(cv::Mat& img); + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector<Yolo::Detection>& res, float* output, float nms_thresh); \ No newline at end of file diff --git a/uw_detection/include/uw_detection/yolov3.h b/uw_detection/include/uw_detection/yolov3.h new file mode 100644 index 0000000..6cdc155 --- /dev/null +++ b/uw_detection/include/uw_detection/yolov3.h @@ -0,0 +1,14 @@ +#pragma once +#include "NvInfer.h" +#include <opencv2/opencv.hpp> +#include <uw_detection/yololayer.h> + +void APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream, const std::string& weights_path); + +void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize); + +cv::Mat preprocess_img(cv::Mat& img); + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector<Yolo::Detection>& res, float* output, float nms_thresh); \ No newline at end of file diff --git a/uw_detection/package.xml b/uw_detection/package.xml new file mode 100644 index 0000000..772408d --- /dev/null +++ b/uw_detection/package.xml @@ -0,0 +1,33 @@ +<?xml version="1.0"?> +<package format="2"> + <name>uw_detection</name> + <version>0.1.0</version> + <description> + Detection models, as well as data prep, labeling, and training tools + </description> + + <license>MIT</license> + <author>UW RoboCup Contributors</author> + <maintainer email="uw-robocup@example.com">UW RoboCup Contributors</maintainer> + + <buildtool_depend>catkin</buildtool_depend> + + <depend>catkin</depend> + <depend>rospack</depend> + + <depend>cv_bridge</depend> + <depend>roscpp</depend> + <depend>std_msgs</depend> + <depend>sensor_msgs</depend> + <depend>geometry_msgs</depend> + <depend>visualization_msgs</depend> + <depend>message_generation</depend> + <depend>message_runtime</depend> + + <depend>image_transport</depend> + <depend>octomap</depend> + <depend>std_srvs</depend> + <depend>tf2</depend> + <depend>tf2_eigen</depend> + <depend>message_filters</depend> +</package> \ No newline at end of file diff --git a/uw_detection/scripts/camera_detect_demo 
b/uw_detection/scripts/camera_detect_demo new file mode 100755 index 0000000..ba8340c --- /dev/null +++ b/uw_detection/scripts/camera_detect_demo @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +import argparse +from sys import platform + +from yolov3.models import * # set ONNX_EXPORT in models.py +from yolov3.utils.datasets import * +from yolov3.utils.utils import * +import cv2 +import pyrealsense2 as rs +import torchvision.transforms as transforms + + +def detect(pipe=None, save_img=False): + img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) + out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt + webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') + + # Initialize + device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) + if os.path.exists(out): + shutil.rmtree(out) # delete output folder + os.makedirs(out) # make new output folder + + # Initialize model + model = Darknet(opt.cfg, img_size) + + # Load weights + attempt_download(weights) + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) + else: # darknet format + load_darknet_weights(model, weights) + + # Fuse Conv2d + BatchNorm2d layers + # model.fuse() + # torch_utils.model_info(model, report='summary') # 'full' or 'summary' + + # Eval mode + model.to(device).eval() + + # Export mode + if ONNX_EXPORT: + model.fuse() + img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) + torch.onnx.export(model, img, 'weights/export.onnx', verbose=False, opset_version=11) + + # Validate exported model + import onnx + model = onnx.load('weights/export.onnx') # Load the ONNX model + onnx.checker.check_model(model) # Check that the IR is well formed + print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph + return + + # Half precision + half = half and device.type != 'cpu' # half precision only supported on CUDA + if half: + model.half() + + # Set Dataloader + vid_path, vid_writer = None, None + if webcam: + view_img = True + torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference + # dataset = LoadStreams(source, img_size=img_size, half=half) + else: + save_img = True + # dataset = LoadImages(source, img_size=img_size, half=half) + + # Get names and colors + names = load_classes(opt.names) + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] + + # Run inference + t0 = time.time() + # for path, img, im0s, vid_cap in dataset: + while True: + t = time.time() + + # Get detections + img = pipe.wait_for_frames() + + # Align the depth frame to color frame + aligned_frames = align.process(img) + + # Get aligned frames + aligned_depth_frame = aligned_frames.get_depth_frame() # aligned_depth_frame is a 640x480 depth image + aligned_color_frame = aligned_frames.get_color_frame() + + # convert the RealSense frames to numpy arrays before resizing + aligned_color_frame = np.asanyarray(aligned_color_frame.get_data()) + # im0 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + # im0 = cv2.resize(im0, (640,480)) + aligned_color_frame = cv2.resize(aligned_color_frame, (320, 320)) + im0 = cv2.cvtColor(aligned_color_frame, cv2.COLOR_RGB2BGR) # im0 is original image + img = transforms.ToTensor()(im0).to(device) + + aligned_depth_frame = np.asanyarray(aligned_depth_frame.get_data()) + aligned_depth_frame = cv2.resize(aligned_depth_frame, (320, 320)) + + # img = torch.from_numpy(img).to(device) + if img.ndimension() == 3: + img = img.unsqueeze(0) + pred = 
model(img)[0] + + if opt.half: + pred = pred.float() + + # Apply NMS + pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) + + # Apply Classifier + # if classify: + # pred = apply_classifier(pred, modelc, img, im0s) + + # Process detections + for i, det in enumerate(pred): # detections per image + # depth mask for each of the objects + class_mask = np.zeros((320, 320, 21)) + + s = "" + # if webcam: # batch_size >= 1 + # p, s, im0 = " ", '%g: ' % i, im0s[i] + # else: + # p, s, im0 = path, '', im0s + + # save_path = str(Path(out) / Path(p).name) + s += '%gx%g ' % img.shape[2:] # print string + if det is not None and len(det): + # Rescale boxes from img_size to im0 size + det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() + + # Print results + for c in det[:, -1].unique(): + n = (det[:, -1] == c).sum() # detections per class + s += '%g %ss, ' % (n, names[int(c)]) # add to string + + # Write results + for *xyxy, conf, cls in det: + if save_img or view_img: # Add bbox to image + label = '%s %.2f' % (names[int(cls)], conf) + plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) + # print(det) + + # fill in bounding box area for mask + # cv2.contour for each channel + # for each contour -> find the centroid + + # depth + color image combine to overlay + + # Stream results + if True: + print(1 / (time.time() - t)) + # cv2.imshow("output", im0) + # cv2.imshow("resized", cv2.resize(im0, (1080,720))) + + if cv2.waitKey(1) == ord('q'): # q to quit + raise StopIteration + + # Save results (image with detections) + # if save_img: + # if dataset.mode == 'images': + # cv2.imwrite(save_path, im0) + # else: + # if vid_path != save_path: # new video + # vid_path = save_path + # if isinstance(vid_writer, cv2.VideoWriter): + # vid_writer.release() # release previous video writer + + # fps = vid_cap.get(cv2.CAP_PROP_FPS) + # w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + # h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + # vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) + # vid_writer.write(im0) + + # if save_txt or save_img: + # print('Results saved to %s' % os.getcwd() + os.sep + out) + # if platform == 'darwin': # MacOS + # os.system('open ' + out + ' ' + save_path) + + # print('Done. (%.3fs)' % (time.time() - t0)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') + parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path') + parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') + parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam + parser.add_argument('--output', type=str, default='output', help='output folder') # output folder + parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)') + parser.add_argument('--half', action='store_true', help='half precision FP16 inference') + parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1) or cpu') + parser.add_argument('--view-img', action='store_true', help='display results') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--classes', nargs='+', type=int, help='filter by class') + parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') + opt = parser.parse_args() + print(opt) + + align_to = rs.stream.color + align = rs.align(align_to) + + pipeline = rs.pipeline() + pipeline.start() + + with torch.no_grad(): + detect(pipe=pipeline) diff --git a/uw_detection/scripts/camera_detect_sim b/uw_detection/scripts/camera_detect_sim new file mode 100644 index 0000000..ba8340c --- /dev/null +++ b/uw_detection/scripts/camera_detect_sim @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +import argparse +from sys import platform + +from yolov3.models import * # set ONNX_EXPORT in models.py +from yolov3.utils.datasets import * +from yolov3.utils.utils import * +import cv2 +import pyrealsense2 as rs +import torchvision.transforms as transforms + + +def detect(pipe=None, save_img=False): + img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) + out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt + webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') + + # Initialize + device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) + if os.path.exists(out): + shutil.rmtree(out) # delete output folder + os.makedirs(out) # make new output folder + + # Initialize model + model = Darknet(opt.cfg, img_size) + + # Load weights + attempt_download(weights) + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) + else: # darknet format + load_darknet_weights(model, weights) + + # Fuse Conv2d + BatchNorm2d layers + # model.fuse() + # torch_utils.model_info(model, report='summary') # 'full' or 'summary' + + # Eval mode + model.to(device).eval() + + # Export mode + if ONNX_EXPORT: + model.fuse() + img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) + torch.onnx.export(model, img, 'weights/export.onnx', verbose=False, opset_version=11) + + # Validate exported model + import onnx + model = onnx.load('weights/export.onnx') # Load the ONNX model + onnx.checker.check_model(model) # Check that the IR is well formed + print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph + return + + # Half precision + half = half and device.type != 'cpu' # half precision only supported on CUDA + if half: + model.half() + + # Set Dataloader + vid_path, vid_writer = None, None + if webcam: + view_img = True + torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference + # dataset = LoadStreams(source, img_size=img_size, half=half) + else: + save_img = True + # dataset = LoadImages(source, img_size=img_size, half=half) + + # Get names and colors + names = load_classes(opt.names) + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] + + # Run inference + t0 = time.time() + # for path, img, im0s, vid_cap in dataset: + while True: + t = time.time() + + # Get detections + img = pipe.wait_for_frames() + + # Align the depth frame to color frame + aligned_frames = align.process(img) + + # Get aligned frames + aligned_depth_frame = aligned_frames.get_depth_frame() # 
aligned_depth_frame is a 640x480 depth image + aligned_color_frame = aligned_frames.get_color_frame() + + # convert the RealSense frames to numpy arrays before resizing + aligned_color_frame = np.asanyarray(aligned_color_frame.get_data()) + # im0 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + # im0 = cv2.resize(im0, (640,480)) + aligned_color_frame = cv2.resize(aligned_color_frame, (320, 320)) + im0 = cv2.cvtColor(aligned_color_frame, cv2.COLOR_RGB2BGR) # im0 is original image + img = transforms.ToTensor()(im0).to(device) + + aligned_depth_frame = np.asanyarray(aligned_depth_frame.get_data()) + aligned_depth_frame = cv2.resize(aligned_depth_frame, (320, 320)) + + # img = torch.from_numpy(img).to(device) + if img.ndimension() == 3: + img = img.unsqueeze(0) + pred = model(img)[0] + + if opt.half: + pred = pred.float() + + # Apply NMS + pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) + + # Apply Classifier + # if classify: + # pred = apply_classifier(pred, modelc, img, im0s) + + # Process detections + for i, det in enumerate(pred): # detections per image + # depth mask for each of the objects + class_mask = np.zeros((320, 320, 21)) + + s = "" + # if webcam: # batch_size >= 1 + # p, s, im0 = " ", '%g: ' % i, im0s[i] + # else: + # p, s, im0 = path, '', im0s + + # save_path = str(Path(out) / Path(p).name) + s += '%gx%g ' % img.shape[2:] # print string + if det is not None and len(det): + # Rescale boxes from img_size to im0 size + det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() + + # Print results + for c in det[:, -1].unique(): + n = (det[:, -1] == c).sum() # detections per class + s += '%g %ss, ' % (n, names[int(c)]) # add to string + + # Write results + for *xyxy, conf, cls in det: + if save_img or view_img: # Add bbox to image + label = '%s %.2f' % (names[int(cls)], conf) + plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) + # print(det) + + # fill in bounding box area for mask + # cv2.contour for each channel + # for each contour -> find the centroid + + # depth + color image combine to overlay + + # Stream results + if True: + print(1 / (time.time() - t)) + # cv2.imshow("output", im0) + # cv2.imshow("resized", cv2.resize(im0, (1080,720))) + + if cv2.waitKey(1) == ord('q'): # q to quit + raise StopIteration + + # Save results (image with detections) + # if save_img: + # if dataset.mode == 'images': + # cv2.imwrite(save_path, im0) + # else: + # if vid_path != save_path: # new video + # vid_path = save_path + # if isinstance(vid_writer, cv2.VideoWriter): + # vid_writer.release() # release previous video writer + + # fps = vid_cap.get(cv2.CAP_PROP_FPS) + # w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + # h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + # vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) + # vid_writer.write(im0) + + # if save_txt or save_img: + # print('Results saved to %s' % os.getcwd() + os.sep + out) + # if platform == 'darwin': # MacOS + # os.system('open ' + out + ' ' + save_path) + + # print('Done. 
(%.3fs)' % (time.time() - t0)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') + parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path') + parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') + parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam + parser.add_argument('--output', type=str, default='output', help='output folder') # output folder + parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)') + parser.add_argument('--half', action='store_true', help='half precision FP16 inference') + parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu') + parser.add_argument('--view-img', action='store_true', help='display results') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--classes', nargs='+', type=int, help='filter by class') + parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') + opt = parser.parse_args() + print(opt) + + align_to = rs.stream.color + align = rs.align(align_to) + + pipeline = rs.pipeline() + pipeline.start() + + with torch.no_grad(): + detect(pipe=pipeline) diff --git a/uw_detection/scripts/camera_to_baselink b/uw_detection/scripts/camera_to_baselink new file mode 100755 index 0000000..1032c6f --- /dev/null +++ b/uw_detection/scripts/camera_to_baselink @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import tf +import rospy +from tf import TransformListener +from std_msgs.msg import Float32MultiArray, MultiArrayDimension +import numpy as np + +position = None + +def callback(data): + centroids_in_baselink = [] + if position is not None: + # Convert each centroid prediction from the realsense frame to the base_link frame + # (translation only; the camera-to-base rotation is ignored here) + #rospy.loginfo("DATA: ", data) + #rospy.loginfo("POSITION: ", position) + # data is a 50x4 array, 50 samples of [x,y,z,class_idx] + data = np.array(data.data).reshape(50, 4) + #print(position) + for i, row in enumerate(data): + new_x = row[0] + position[0] + new_y = row[1] + position[1] + new_z = row[2] + position[2] + data[i,:] = np.array([new_x, new_y, new_z, row[3]]) + # publish converted array + centroid_msg = Float32MultiArray() + centroid_msg.layout.data_offset = 0 + centroid_msg.layout.dim = [MultiArrayDimension(), MultiArrayDimension()] + centroid_msg.layout.dim[0].label = "channels" + centroid_msg.layout.dim[0].size = 4 + centroid_msg.layout.dim[0].stride = 200 + centroid_msg.layout.dim[1].label = "samples" + centroid_msg.layout.dim[1].size = 50 + centroid_msg.layout.dim[1].stride = 50 + centroid_msg.data = data.flatten() + pub.publish(centroid_msg) + rate.sleep() + +if __name__ == "__main__": + rospy.init_node('centroid_listener', anonymous=True) + rospy.Subscriber("object_centroids", Float32MultiArray, callback) + pub = rospy.Publisher('object_centroids_baselink', Float32MultiArray, queue_size=10) + #rospy.init_node("baselink_tf_listener") + listener = TransformListener() + rate = rospy.Rate(100.) 
+ while not rospy.is_shutdown(): + listener.waitForTransform("/base_link", "/head_external_camera_depth_frame", rospy.Time(), rospy.Duration(3.0)) + #t = listener.getLatestCommonTime("/base_link", "/head_external_camera_aligned_depth_to_color_frame") + position, _ = listener.lookupTransform("/base_link", "/head_external_camera_depth_frame", rospy.Time()) + rate.sleep() diff --git a/uw_detection/setup.py b/uw_detection/setup.py new file mode 100644 index 0000000..756d253 --- /dev/null +++ b/uw_detection/setup.py @@ -0,0 +1,11 @@ +# ! DO NOT MANUALLY INVOKE THIS setup.py, USE CATKIN INSTEAD + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +# fetch values from package.xml +setup_args = generate_distutils_setup( + packages=['yolov3'], + package_dir={'': 'src'}) + +setup(**setup_args) diff --git a/uw_detection/share/.gitignore b/uw_detection/share/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/uw_detection/share/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/uw_detection/src/yolo_ros_image_node.cpp b/uw_detection/src/yolo_ros_image_node.cpp new file mode 100644 index 0000000..5166c7f --- /dev/null +++ b/uw_detection/src/yolo_ros_image_node.cpp @@ -0,0 +1,113 @@ +#include <ros/ros.h> +#include <ros/package.h> +#include <image_transport/image_transport.h> +#include <cv_bridge/cv_bridge.h> +#include <opencv2/highgui/highgui.hpp> +#include <chrono> +#include <fstream> +#include <uw_detection/logging.h> +#include <uw_detection/yololayer.h> +#include <uw_detection/yolov3.h> +#define NMS_THRESH 0.4 + +using namespace nvinfer1; + +// stuff we know about the network and the input/output blobs +static const int INPUT_H = Yolo::INPUT_H; +static const int INPUT_W = Yolo::INPUT_W; +static const int OUTPUT_SIZE = 1000 * 7 + 1; // we assume the yololayer outputs + // no more than 1000 boxes that + // conf >= 0.1 + +static Logger gLogger; +// prepare input data --------------------------- +static float data[3 * INPUT_H * INPUT_W]; +// for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) +// data[i] = 1.0; +static float prob[OUTPUT_SIZE]; + +static IExecutionContext* context; + +void imageCallback(const sensor_msgs::ImageConstPtr& msg) +{ + try + { + auto img = cv_bridge::toCvShare(msg, "bgr8")->image; + + cv::Mat pr_img = preprocess_img(img); + + for (int i = 0; i < INPUT_H * INPUT_W; i++) + { + data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0; + data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0; + data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0; + } + + auto start = std::chrono::system_clock::now(); + doInference(*context, data, prob, 1); + auto end = std::chrono::system_clock::now(); + std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; + std::vector<Yolo::Detection> res; + nms(res, prob, NMS_THRESH); + for (int i = 0; i < 20; i++) + { + std::cout << prob[i] << ","; + } + std::cout << res.size() << std::endl; + for (size_t j = 0; j < res.size(); j++) + { + float* p = (float*)&res[j]; + for (size_t k = 0; k < 7; k++) + { + std::cout << p[k] << ", "; + } + std::cout << std::endl; + cv::Rect r = get_rect(img, res[j].bbox); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + } + cv::imshow("view", img); + } + catch (cv_bridge::Exception& e) + { + ROS_ERROR("Could not convert from '%s' to 'bgr8'.", msg->encoding.c_str()); + } +} + +int main(int argc, char** argv) +{ + cudaSetDevice(0); + size_t size{ 0 }; + + char* trtModelStream{ nullptr }; + + ros::init(argc, argv, "image_listener"); + std::string path = ros::package::getPath("uw_detection"); + std::ifstream 
file(path + "/share/yolov3.engine", std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + trtModelStream = new char[size]; + assert(trtModelStream); + file.read(trtModelStream, size); + file.close(); + } + else + { + // bail out early instead of deserializing a null engine blob + ROS_ERROR("Could not open %s; run the *_prepare_plan target first.", (path + "/share/yolov3.engine").c_str()); + return 1; + } + + IRuntime* runtime = createInferRuntime(gLogger); + assert(runtime != nullptr); + ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); + assert(engine != nullptr); + context = engine->createExecutionContext(); + assert(context != nullptr); + delete[] trtModelStream; + ros::NodeHandle nh; + cv::namedWindow("view"); + cv::startWindowThread(); + image_transport::ImageTransport it(nh); + image_transport::Subscriber sub = it.subscribe("/head_camera/rgb/image_raw", 1, imageCallback); + ros::spin(); + cv::destroyWindow("view"); +} \ No newline at end of file diff --git a/uw_detection/src/yolov3/README.md b/uw_detection/src/yolov3/README.md new file mode 100644 index 0000000..ec38033 --- /dev/null +++ b/uw_detection/src/yolov3/README.md @@ -0,0 +1,64 @@ +This directory contains both the SPP and regular YOLOv3 implementations copied from tensorrtx. The minimum changes required to catkinize the code were applied; headers were moved into the package include directory, and the model API was exposed and modified to accept a path to the weights file. + + + +# yolov3-spp + +yolov4 is [here](../yolov4). + +The PyTorch implementation is [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It provides two trained weights of yolov3-spp, `yolov3-spp.pt` and `yolov3-spp-ultralytics.pt` (originally named `ultralytics68.pt`). + +The following tricks are used in this yolov3-spp: + +- The yololayer plugin is different from the plugin used in [this repo's yolov3](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov3). In this version, all three yolo layers are implemented in one plugin to improve speed, with code derived from [lewes6369/TensorRT-Yolov3](https://github.com/lewes6369/TensorRT-Yolov3). +- The batchnorm layer is implemented with a scale layer. + +## Execute + +``` +1. generate yolov3-spp_ultralytics68.wts from pytorch implementation with yolov3-spp.cfg and yolov3-spp-ultralytics.pt + +git clone https://github.com/wang-xinyu/tensorrtx.git +git clone https://github.com/ultralytics/yolov3.git +// download its weights 'yolov3-spp-ultralytics.pt' +cd yolov3 +cp ../tensorrtx/yolov3-spp/gen_wts.py . +python gen_wts.py yolov3-spp-ultralytics.pt +// a file 'yolov3-spp_ultralytics68.wts' will be generated. +// the master branch of yolov3 should work, if not, you can checkout 4ac60018f6e6c1e24b496485f126a660d9c793d8 + +2. put yolov3-spp_ultralytics68.wts into yolov3-spp, build and run + +mv yolov3-spp_ultralytics68.wts ../tensorrtx/yolov3-spp/ +cd ../tensorrtx/yolov3-spp +mkdir build +cd build +cmake .. +make +sudo ./yolov3-spp -s // serialize model to plan file i.e. 'yolov3-spp.engine' +sudo ./yolov3-spp -d ../samples // deserialize plan file and run inference, the images in samples will be processed. + +3. check the generated images, _zidane.jpg and _bus.jpg +```
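+ +The definition you'll most often need to touch when retraining is the class count mentioned in the config list below. As a hedged sketch, the relevant constants in the upstream tensorrtx `yololayer.h` look like this (in this catkinized copy the class count is instead passed to the `YoloLayerPlugin` constructor, so the exact names may differ): + +```cpp +// namespace Yolo in yololayer.h: detector geometry is fixed at compile time. +// CLASS_NUM must match the class list the .wts checkpoint was trained with. +static constexpr int CLASS_NUM = 80;  // 80 for the stock COCO checkpoints +static constexpr int INPUT_H = 608; +static constexpr int INPUT_W = 608; +```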
+## Config + +- Input shape defined in yololayer.h +- Number of classes defined in yololayer.h +- FP16/FP32 can be selected by the macro in yolov3-spp.cpp +- GPU id can be selected by the macro in yolov3-spp.cpp +- NMS thresh in yolov3-spp.cpp +- BBox confidence thresh in yolov3-spp.cpp + +## More Information + +See the readme on the [home page](https://github.com/wang-xinyu/tensorrtx). + diff --git a/uw_detection/src/yolov3/gen_wts.py b/uw_detection/src/yolov3/gen_wts.py new file mode 100644 index 0000000..52c8e1c --- /dev/null +++ b/uw_detection/src/yolov3/gen_wts.py @@ -0,0 +1,23 @@ +import struct +import sys +from models import * +from utils.utils import * + +model = Darknet('cfg/yolov3.cfg', (608, 608)) +weights = sys.argv[1] +device = torch_utils.select_device('0') +if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) +else: # darknet format + load_darknet_weights(model, weights) +model = model.eval() + +f = open('yolov3.wts', 'w') +f.write('{}\n'.format(len(model.state_dict().keys()))) +for k, v in model.state_dict().items(): + vr = v.reshape(-1).cpu().numpy() + f.write('{} {} '.format(k, len(vr))) + for vv in vr: + f.write(' ') + f.write(struct.pack('>f',float(vv)).hex()) + f.write('\n') \ No newline at end of file diff --git a/uw_detection/src/yolov3/gen_wts_spp.py b/uw_detection/src/yolov3/gen_wts_spp.py new file mode 100644 index 0000000..7604c77 --- /dev/null +++ b/uw_detection/src/yolov3/gen_wts_spp.py @@ -0,0 +1,22 @@ +import struct +import sys +from models import * +from utils.utils import * + +model = Darknet('cfg/yolov3-spp.cfg', (416, 416)) +weights = sys.argv[1] +dev = '1' +device = torch_utils.select_device(dev) +model.load_state_dict(torch.load(weights, map_location=device)['model']) + + +f = open('yolov3-spp_ultralytics68.wts', 'w') +f.write('{}\n'.format(len(model.state_dict().keys()))) +for k, v in model.state_dict().items(): + vr = v.reshape(-1).cpu().numpy() + f.write('{} {} '.format(k, len(vr))) + for vv in vr: + f.write(' ') + f.write(struct.pack('>f',float(vv)).hex()) + f.write('\n') + diff --git a/uw_detection/src/yolov3/samples/bus.jpg b/uw_detection/src/yolov3/samples/bus.jpg new file mode 100644 index 0000000..b43e311 Binary files /dev/null and b/uw_detection/src/yolov3/samples/bus.jpg differ diff --git a/uw_detection/src/yolov3/samples/zidane.jpg b/uw_detection/src/yolov3/samples/zidane.jpg new file mode 100644 index 0000000..92d72ea Binary files /dev/null and b/uw_detection/src/yolov3/samples/zidane.jpg differ diff --git a/uw_detection/src/yolov3/yololayer.cu b/uw_detection/src/yolov3/yololayer.cu new file mode 100644 index 0000000..8ff75b6 --- /dev/null +++ b/uw_detection/src/yolov3/yololayer.cu @@ -0,0 +1,307 @@ +#include <uw_detection/yololayer.h> +#include <uw_detection/Utils.h> + +using namespace Yolo; + +namespace nvinfer1 +{ + YoloLayerPlugin::YoloLayerPlugin(int num_classes) + { + mClassCount = num_classes; + mYoloKernel.clear(); + mYoloKernel.push_back(yolo1); + mYoloKernel.push_back(yolo2); + mYoloKernel.push_back(yolo3); + + mKernelCount = mYoloKernel.size(); + } + + YoloLayerPlugin::~YoloLayerPlugin() + { + } + + // create the plugin at runtime from a byte stream + YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) + { + using namespace Tn; + const char *d = reinterpret_cast<const char*>(data), *a = d; + read(d, mClassCount); + read(d, mThreadCount); + read(d, mKernelCount); + mYoloKernel.resize(mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(mYoloKernel.data(),d,kernelSize); + d += kernelSize; + + assert(d 
== a + length); + } + + void YoloLayerPlugin::serialize(void* buffer) const + { + using namespace Tn; + char* d = static_cast(buffer), *a = d; + write(d, mClassCount); + write(d, mThreadCount); + write(d, mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(d,mYoloKernel.data(),kernelSize); + d += kernelSize; + + assert(d == a + getSerializationSize()); + } + + size_t YoloLayerPlugin::getSerializationSize() const + { + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); + } + + int YoloLayerPlugin::initialize() + { + return 0; + } + + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + { + //output the result to channel + int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); + + return Dims3(totalsize + 1, 1, 1); + } + + // Set plugin namespace + void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) + { + mPluginNamespace = pluginNamespace; + } + + const char* YoloLayerPlugin::getPluginNamespace() const + { + return mPluginNamespace; + } + + // Return the DataType of the plugin output at the requested index + DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const + { + return DataType::kFLOAT; + } + + // Return true if output tensor is broadcast across a batch. + bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const + { + return false; + } + + // Return true if plugin can use input that is broadcast across batch without replication. + bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const + { + return false; + } + + void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) + { + } + + // Attach the plugin object to an execution context and grant the plugin the access to some context resource. + void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) + { + } + + // Detach the plugin object from its execution context. 
+  void YoloLayerPlugin::detachFromContext() {}
+
+  const char* YoloLayerPlugin::getPluginType() const
+  {
+    return "YoloLayer_TRT";
+  }
+
+  const char* YoloLayerPlugin::getPluginVersion() const
+  {
+    return "1";
+  }
+
+  void YoloLayerPlugin::destroy()
+  {
+    delete this;
+  }
+
+  // Clone the plugin
+  IPluginV2IOExt* YoloLayerPlugin::clone() const
+  {
+    YoloLayerPlugin *p = new YoloLayerPlugin(mClassCount);
+    p->setPluginNamespace(mPluginNamespace);
+    return p;
+  }
+
+  __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }
+
+  __global__ void CalDetection(const float *input, float *output, int noElements,
+      int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT*2], int classes, int outputElem) {
+
+    int idx = threadIdx.x + blockDim.x * blockIdx.x;
+    if (idx >= noElements) return;
+
+    int total_grid = yoloWidth * yoloHeight;
+    int bnIdx = idx / total_grid;
+    idx = idx - total_grid * bnIdx;
+    int info_len_i = 5 + classes;
+    const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
+
+    for (int k = 0; k < CHECK_COUNT; ++k) {
+      int class_id = 0;
+      float max_cls_prob = 0.0;
+      for (int i = 5; i < info_len_i; ++i) {
+        float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
+        if (p > max_cls_prob) {
+          max_cls_prob = p;
+          class_id = i - 5;
+        }
+      }
+      float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
+      if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;
+
+      float *res_count = output + bnIdx * outputElem;
+      int count = (int)atomicAdd(res_count, 1);
+      if (count >= MAX_OUTPUT_BBOX_COUNT) return;
+      char* data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
+      Detection* det = (Detection*)(data);
+
+      int row = idx / yoloWidth;
+      int col = idx % yoloWidth;
+
+      // Location
+      det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
+      det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
+      det->bbox[2] = expf(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
+      det->bbox[3] = expf(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
+      det->det_confidence = box_prob;
+      det->class_id = class_id;
+      det->class_confidence = max_cls_prob;
+    }
+  }
+
+  void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
+    void* devAnchor;
+    size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
+    CUDA_CHECK(cudaMalloc(&devAnchor, AnchorLen));
+
+    int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
+
+    for (int idx = 0; idx < batchSize; ++idx) {
+      CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
+    }
+    int numElem = 0;
+    for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
+    {
+      const auto& yolo = mYoloKernel[i];
+      numElem = yolo.width * yolo.height * batchSize;
+      if (numElem < mThreadCount)
+        mThreadCount = numElem;
+      CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+      CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
+        (inputs[i], output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount, outputElem);
+    }
+
+    CUDA_CHECK(cudaFree(devAnchor));
+  }
+
+
+  int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+  {
+    //assert(batchSize == 1);
+    //GPU
+    //CUDA_CHECK(cudaStreamSynchronize(stream));
+    forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
+
+    return 0;
+  }
+
+  PluginFieldCollection YoloPluginCreator::mFC{};
+  std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
+
+  YoloPluginCreator::YoloPluginCreator()
+  {
+    mPluginAttributes.clear();
+    mPluginAttributes.emplace_back(PluginField("numClasses", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("yoloWidth", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("yoloHeight", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("inputWidth", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("inputHeight", nullptr, PluginFieldType::kINT32, 1));
+
+    mFC.nbFields = mPluginAttributes.size();
+    mFC.fields = mPluginAttributes.data();
+  }
+
+  const char* YoloPluginCreator::getPluginName() const
+  {
+    return "YoloLayer_TRT";
+  }
+
+  const char* YoloPluginCreator::getPluginVersion() const
+  {
+    return "1";
+  }
+
+  const PluginFieldCollection* YoloPluginCreator::getFieldNames()
+  {
+    return &mFC;
+  }
+
+  IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
+  {
+    //assert(!strcmp(name, getPluginName()));
+    const PluginField* fields = fc->fields;
+    // default to the compile-time class count; a field may carry null data when the
+    // creator's own field list is passed straight back in (as yolov3-spp.cpp does)
+    int yolo_width = 0, yolo_height = 0, num_anchors = 0;
+    int num_classes = Yolo::CLASS_NUM;
+    int input_width = 0, input_height = 0;
+
+    for (int i = 0; i < fc->nbFields; ++i)
+    {
+      const char* attrName = fields[i].name;
+      if (fields[i].data == nullptr)
+        continue;
+      if (!strcmp(attrName, "yoloWidth"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        yolo_width = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "yoloHeight"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        yolo_height = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "numAnchors"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        num_anchors = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "numClasses"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        num_classes = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "inputWidth"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        input_width = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "inputHeight"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        input_height = *(static_cast<const int*>(fields[i].data));
+      }
+    }
+    //assert(yolo_width > 0 && yolo_height > 0);
+    //assert(num_classes > 0);
+    //assert(input_width > 0 && input_height > 0);
+    YoloLayerPlugin* obj = new YoloLayerPlugin(num_classes);
+    obj->setPluginNamespace(mNamespace.c_str());
+    return obj;
+  }
+
+  IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
+  {
+    // This object will be deleted when the network is destroyed, which will
+    // call YoloLayerPlugin::destroy()
+    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
+    obj->setPluginNamespace(mNamespace.c_str());
+    return obj;
+  }
+
+}
diff --git a/uw_detection/src/yolov3/yolov3-spp.cpp b/uw_detection/src/yolov3/yolov3-spp.cpp
new file mode 100644
index 0000000..24e3aad
--- /dev/null
+++ b/uw_detection/src/yolov3/yolov3-spp.cpp
@@ -0,0 +1,572 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+#include "yololayer.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+
+#define USE_FP16  // comment this out to run inference in FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.4
+#define BBOX_CONF_THRESH 0.5
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int OUTPUT_SIZE = 1000 * 7 + 1;  // we assume the yolo layer outputs at most 1000 boxes with conf >= 0.1
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// letterbox the image into INPUT_W x INPUT_H, padding with gray (128)
+cv::Mat preprocess_img(cv::Mat& img) {
+    int w, h, x, y;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        w = INPUT_W;
+        h = r_w * img.rows;
+        x = 0;
+        y = (INPUT_H - h) / 2;
+    } else {
+        w = r_h * img.cols;
+        h = INPUT_H;
+        x = (INPUT_W - w) / 2;
+        y = 0;
+    }
+    cv::Mat re(h, w, CV_8UC3);
+    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
+    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+    return out;
+}
+
+// map a bbox in network coordinates back onto the original image
+cv::Rect get_rect(const cv::Mat& img, float bbox[4]) {
+    int l, r, t, b;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        l = bbox[0] - bbox[2]/2.f;
+        r = bbox[0] + bbox[2]/2.f;
+        t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        l = l / r_w;
+        r = r / r_w;
+        t = t / r_w;
+        b = b / r_w;
+    } else {
+        l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        t = bbox[1] - bbox[3]/2.f;
+        b = bbox[1] + bbox[3]/2.f;
+        l = l / r_h;
+        r = r / r_h;
+        t = t / r_h;
+        b = b / r_h;
+    }
+    return cv::Rect(l, t, r-l, b-t);
+}
+
+float iou(float lbox[4], float rbox[4]) {
+    float interBox[] = {
+        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
+        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
+        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
+        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
+    };
+
+    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+        return 0.0f;
+
+    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
+    return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+}
+
+bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
+    return a.det_confidence > b.det_confidence;
+}
+
+// greedy per-class NMS over the raw yolo layer output
+void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
+    std::map<float, std::vector<Yolo::Detection>> m;
+    for (int i = 0; i < output[0] && i < 1000; i++) {
+        if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
+        Yolo::Detection det;
+        memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
+        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
+        m[det.class_id].push_back(det);
+    }
+    for (auto it = m.begin(); it != m.end(); it++) {
+        //std::cout << it->second[0].class_id << " --- " << std::endl;
+        auto& dets = it->second;
+        std::sort(dets.begin(), dets.end(), cmp);
+        for (size_t i = 0; i < dets.size(); ++i) {
+            auto& item = dets[i];
+            res.push_back(item);
+            for (size_t n = i + 1; n < dets.size(); ++n) {
+                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
+                    dets.erase(dets.begin() + n);
+                    --n;
+                }
+            }
+        }
+    }
+}
+
+// TensorRT weight files have a simple space-delimited format:
+// a count line, then per blob: [name] [element count] [hex-encoded float words]
+std::map<std::string, Weights> loadWeights(const std::string& file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and size of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + ".weight"].values;
+    float *beta = (float*)weightMap[lname + ".bias"].values;
+    float *mean = (float*)weightMap[lname + ".running_mean"].values;
+    float *var = (float*)weightMap[lname + ".running_var"].values;
+    int len = weightMap[lname + ".running_var"].count;
+
+    // fold batch norm into a per-channel scale layer: y = (x - mean) / sqrt(var + eps) * gamma + beta
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+
+    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5);
+
+    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
+    lr->setAlpha(0.1);
+
+    return lr;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& weights_path) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights(weights_path);
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    // Expand the complete darknet architecture explicitly, layer by layer.
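+    // Each convBnLeaky(network, weightMap, input, out_channels, kernel, stride, pad, idx)
+    // call below emits Conv2d -> BatchNorm2d -> LeakyReLU(0.1) for module_list.<idx> of the
+    // darknet .cfg, and each ew<N> is the residual shortcut addition of a [shortcut] block.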
+ auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); + auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); + auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); + auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); + auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); + auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); + auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); + auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); + auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); + auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); + auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); + auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); + auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); + auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); + auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); + auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); + auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); + auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); + auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); + auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); + auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); + auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); + auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22); + auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); + auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); + auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); + auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); + auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); + auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); + auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); + auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); + auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); + auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); + auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); + auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); + auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); + auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); + auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); + auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 
38); + auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); + auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); + auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); + auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); + auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); + auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44); + auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); + auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); + auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); + auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); + auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); + auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); + auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); + auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); + auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); + auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); + auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); + auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); + auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); + auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); + auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); + auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); + auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); + auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); + auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); + auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); + auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); + auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); + auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); + auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); + auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); + auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); + auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); + auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); + auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); + auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); + auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); + auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); 
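+    // SPP block: three parallel max-pool branches over lr77 (5x5, 9x9 and 13x13 kernels,
+    // stride 1 with matching padding so the spatial size is preserved), concatenated with
+    // lr77 itself before the detection head.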
+    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);
+
+    auto pool78 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{5, 5});
+    pool78->setPaddingNd(DimsHW{2, 2});
+    pool78->setStrideNd(DimsHW{1, 1});
+    auto pool80 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{9, 9});
+    pool80->setPaddingNd(DimsHW{4, 4});
+    pool80->setStrideNd(DimsHW{1, 1});
+    auto pool82 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{13, 13});
+    pool82->setPaddingNd(DimsHW{6, 6});
+    pool82->setStrideNd(DimsHW{1, 1});
+
+    ITensor* inputTensors83[] = {pool82->getOutput(0), pool80->getOutput(0), pool78->getOutput(0), lr77->getOutput(0)};
+    auto cat83 = network->addConcatenation(inputTensors83, 4);
+
+    auto lr84 = convBnLeaky(network, weightMap, *cat83->getOutput(0), 512, 1, 1, 0, 84);
+    auto lr85 = convBnLeaky(network, weightMap, *lr84->getOutput(0), 1024, 3, 1, 1, 85);
+    auto lr86 = convBnLeaky(network, weightMap, *lr85->getOutput(0), 512, 1, 1, 0, 86);
+    auto lr87 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 1024, 3, 1, 1, 87);
+    IConvolutionLayer* conv88 = network->addConvolutionNd(*lr87->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.88.Conv2d.weight"], weightMap["module_list.88.Conv2d.bias"]);
+    assert(conv88);
+    auto lr91 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 256, 1, 1, 0, 91);
+
+    // constant 1.0 weights + grouped 2x2/stride-2 deconvolution = 2x nearest-neighbour upsampling
+    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
+    for (int i = 0; i < 256 * 2 * 2; i++) {
+        deval[i] = 1.0;
+    }
+    Weights deconvwts92{DataType::kFLOAT, deval, 256 * 2 * 2};
+    IDeconvolutionLayer* deconv92 = network->addDeconvolutionNd(*lr91->getOutput(0), 256, DimsHW{2, 2}, deconvwts92, emptywts);
+    assert(deconv92);
+    deconv92->setStrideNd(DimsHW{2, 2});
+    deconv92->setNbGroups(256);
+    weightMap["deconv92"] = deconvwts92;
+
+    ITensor* inputTensors[] = {deconv92->getOutput(0), ew61->getOutput(0)};
+    auto cat93 = network->addConcatenation(inputTensors, 2);
+    auto lr94 = convBnLeaky(network, weightMap, *cat93->getOutput(0), 256, 1, 1, 0, 94);
+    auto lr95 = convBnLeaky(network, weightMap, *lr94->getOutput(0), 512, 3, 1, 1, 95);
+    auto lr96 = convBnLeaky(network, weightMap, *lr95->getOutput(0), 256, 1, 1, 0, 96);
+    auto lr97 = convBnLeaky(network, weightMap, *lr96->getOutput(0), 512, 3, 1, 1, 97);
+    auto lr98 = convBnLeaky(network, weightMap, *lr97->getOutput(0), 256, 1, 1, 0, 98);
+    auto lr99 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 512, 3, 1, 1, 99);
+    IConvolutionLayer* conv100 = network->addConvolutionNd(*lr99->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.100.Conv2d.weight"], weightMap["module_list.100.Conv2d.bias"]);
+    assert(conv100);
+    auto lr103 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 128, 1, 1, 0, 103);
+    // reuse deval; only the first 128 * 2 * 2 entries are read for this layer
+    Weights deconvwts104{DataType::kFLOAT, deval, 128 * 2 * 2};
+    IDeconvolutionLayer* deconv104 = network->addDeconvolutionNd(*lr103->getOutput(0), 128, DimsHW{2, 2}, deconvwts104, emptywts);
+    assert(deconv104);
+    deconv104->setStrideNd(DimsHW{2, 2});
+    deconv104->setNbGroups(128);
+    ITensor* inputTensors1[] = {deconv104->getOutput(0), ew36->getOutput(0)};
+    auto cat105 = network->addConcatenation(inputTensors1, 2);
+    auto lr106 = convBnLeaky(network, weightMap, *cat105->getOutput(0), 128, 1, 1, 0, 106);
+    auto lr107 = convBnLeaky(network, weightMap, *lr106->getOutput(0), 256, 3, 1, 1, 107);
+    auto lr108 = convBnLeaky(network, weightMap, *lr107->getOutput(0), 128, 1, 1, 0, 108);
+    auto lr109 =
convBnLeaky(network, weightMap, *lr108->getOutput(0), 256, 3, 1, 1, 109); + auto lr110 = convBnLeaky(network, weightMap, *lr109->getOutput(0), 128, 1, 1, 0, 110); + auto lr111 = convBnLeaky(network, weightMap, *lr110->getOutput(0), 256, 3, 1, 1, 111); + IConvolutionLayer* conv112 = network->addConvolutionNd(*lr111->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.112.Conv2d.weight"], weightMap["module_list.112.Conv2d.bias"]); + assert(conv112); + + auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); + const PluginFieldCollection* pluginData = creator->getFieldNames(); + IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); + ITensor* inputTensors_yolo[] = {conv88->getOutput(0), conv100->getOutput(0), conv112->getOutput(0)}; + auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); + + yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); + network->markOutput(*yolo->getOutput(0)); + + // Build engine + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB +#ifdef USE_FP16 + config->setFlag(BuilderFlag::kFP16); +#endif + std::cout << "Building engine, please wait for a while..." << std::endl; + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Don't need the network any more + network->destroy(); + + // Release host memory + for (auto& mem : weightMap) + { + free((void*) (mem.second.values)); + } + + return engine; +} + +void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path) { + // Create builder + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + + // Create model to populate the network, then set the outputs and create an engine + ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, weights_path); + assert(engine != nullptr); + + // Serialize the engine + (*modelStream) = engine->serialize(); + + // Close everything down + engine->destroy(); + builder->destroy(); +} + +void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { + const ICudaEngine& engine = context.getEngine(); + + // Pointers to input and output device buffers to pass to engine. + // Engine requires exactly IEngine::getNbBindings() number of buffers. + assert(engine.getNbBindings() == 2); + void* buffers[2]; + + // In order to bind the buffers, we need to know the names of the input and output tensors. 
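+    // getBindingIndex() maps each tensor name to its binding slot, so the buffers array
+    // can be filled in whatever order the engine assigned at build time.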
+    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
+    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
+
+    // Create GPU buffers on device
+    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+
+    // Create stream
+    cudaStream_t stream;
+    CHECK(cudaStreamCreate(&stream));
+
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CHECK(cudaStreamSynchronize(stream));
+
+    // Release stream and buffers
+    cudaStreamDestroy(stream);
+    CHECK(cudaFree(buffers[inputIndex]));
+    CHECK(cudaFree(buffers[outputIndex]));
+}
+
+int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 &&
+            strcmp(p_file->d_name, "..") != 0) {
+            //std::string cur_file_name(p_dir_name);
+            //cur_file_name += "/";
+            //cur_file_name += p_file->d_name;
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // create a model using the API directly and serialize it to a stream
+    char *trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(1, &modelStream, "yolov3-spp_ultralytics68.wts");
+        assert(modelStream != nullptr);
+        std::ofstream p("yolov3-spp.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 3 && std::string(argv[1]) == "-d") {
+        std::ifstream file("yolov3-spp.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "invalid arguments!" << std::endl;
+        std::cerr << "./yolov3-spp -s             // serialize model to plan file" << std::endl;
+        std::cerr << "./yolov3-spp -d ../samples  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    std::vector<std::string> file_names;
+    if (read_files_in_dir(argv[2], file_names) < 0) {
+        std::cout << "read_files_in_dir failed." << std::endl;
+        return -1;
+    }
+
+    // prepare input data ---------------------------
+    static float data[3 * INPUT_H * INPUT_W];  // static: too large for the stack
+    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
+    //    data[i] = 1.0;
+    static float prob[OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    int fcount = 0;
+    for (auto f: file_names) {
+        fcount++;
+        std::cout << fcount << " " << f << std::endl;
+        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
+        if (img.empty()) continue;
+        cv::Mat pr_img = preprocess_img(img);
+        // HWC BGR uint8 -> CHW RGB float in [0, 1]
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
+            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
+            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+        }
+
+        // Run inference
+        auto start = std::chrono::system_clock::now();
+        doInference(*context, data, prob, 1);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+        std::vector<Yolo::Detection> res;
+        nms(res, prob);
+        // debug: dump the first few raw output values
+        for (int i = 0; i < 20; i++) {
+            std::cout << prob[i] << ",";
+        }
+        std::cout << res.size() << std::endl;
+        for (size_t j = 0; j < res.size(); j++) {
+            float *p = (float*)&res[j];
+            for (size_t k = 0; k < 7; k++) {
+                std::cout << p[k] << ", ";
+            }
+            std::cout << std::endl;
+            cv::Rect r = get_rect(img, res[j].bbox);
+            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+        }
+        cv::imwrite("_" + f, img);
+    }
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+    return 0;
+}
diff --git a/uw_detection/src/yolov3/yolov3.cpp b/uw_detection/src/yolov3/yolov3.cpp
new file mode 100644
index 0000000..13e63b8
--- /dev/null
+++ b/uw_detection/src/yolov3/yolov3.cpp
@@ -0,0 +1,567 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+#include "yololayer.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+
+#define USE_FP16  // comment this out to run inference in FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.4
+#define BBOX_CONF_THRESH 0.5
+#define CLASS_NUM 21
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int OUTPUT_SIZE = 1000 * 7 + 1;  // we assume the yolo layer outputs at most 1000 boxes with conf >= 0.1
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// letterbox the image into INPUT_W x INPUT_H, padding with gray (128)
+cv::Mat preprocess_img(cv::Mat& img) {
+    int w, h, x, y;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        w = INPUT_W;
+        h = r_w * img.rows;
+        x = 0;
+        y = (INPUT_H - h) / 2;
+    } else {
+        w = r_h * img.cols;
+        h = INPUT_H;
+        x = (INPUT_W - w) / 2;
+        y = 0;
+    }
+    cv::Mat re(h, w, CV_8UC3);
+    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
+    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+    return out;
+}
+
+// map a bbox in network coordinates back onto the original image
+cv::Rect get_rect(const cv::Mat& img, float bbox[4]) {
+    int l, r, t, b;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        l = bbox[0] - bbox[2]/2.f;
+        r = bbox[0] + bbox[2]/2.f;
+        t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        l = l / r_w;
+        r = r / r_w;
+        t = t / r_w;
+        b = b / r_w;
+    } else {
+        l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        t = bbox[1] - bbox[3]/2.f;
+        b = bbox[1] + bbox[3]/2.f;
+        l = l / r_h;
+        r = r / r_h;
+        t = t / r_h;
+        b = b / r_h;
+    }
+    return cv::Rect(l, t, r-l, b-t);
+}
+
+float iou(float lbox[4], float rbox[4]) {
+    float interBox[] = {
+        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
+        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
+        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
+        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
+    };
+
+    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+        return 0.0f;
+
+    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
+    return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+}
+
+bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
+    return a.det_confidence > b.det_confidence;
+}
+
+// greedy per-class NMS over the raw yolo layer output
+void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
+    std::map<float, std::vector<Yolo::Detection>> m;
+    for (int i = 0; i < output[0] && i < 1000; i++) {
+        if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
+        Yolo::Detection det;
+        memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
+        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
+        m[det.class_id].push_back(det);
+    }
+    for (auto it = m.begin(); it != m.end(); it++) {
+        //std::cout << it->second[0].class_id << " --- " << std::endl;
+        auto& dets = it->second;
+        std::sort(dets.begin(), dets.end(), cmp);
+        for (size_t i = 0; i < dets.size(); ++i) {
+            auto& item = dets[i];
+            res.push_back(item);
+            for (size_t n = i + 1; n < dets.size(); ++n) {
+                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
+                    dets.erase(dets.begin() + n);
+                    --n;
+                }
+            }
+        }
+    }
+}
+
+// TensorRT weight files have a simple space-delimited format:
+// a count line, then per blob: [name] [element count] [hex-encoded float words]
+std::map<std::string, Weights> loadWeights(const std::string& file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and size of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + ".weight"].values;
+    float *beta = (float*)weightMap[lname + ".bias"].values;
+    float *mean = (float*)weightMap[lname + ".running_mean"].values;
+    float *var = (float*)weightMap[lname + ".running_var"].values;
+    int len = weightMap[lname + ".running_var"].count;
+
+    // fold batch norm into a per-channel scale layer: y = (x - mean) / sqrt(var + eps) * gamma + beta
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+
+    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5);
+
+    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
+    lr->setAlpha(0.1);
+
+    return lr;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& weights_path) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights(weights_path);
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    // Expand the complete darknet architecture explicitly, layer by layer.
+ auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); + auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); + auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); + auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); + auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); + auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); + auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); + auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); + auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); + auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); + auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); + auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); + auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); + auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); + auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); + auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); + auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); + auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); + auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); + auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); + auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); + auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); + auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22); + auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); + auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); + auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); + auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); + auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); + auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); + auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); + auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); + auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); + auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); + auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); + auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); + auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); + auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); + auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); + auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 
38); + auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); + auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); + auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); + auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); + auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); + auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44); + auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); + auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); + auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); + auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); + auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); + auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); + auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); + auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); + auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); + auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); + auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); + auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); + auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); + auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); + auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); + auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); + auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); + auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); + auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); + auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); + auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); + auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); + auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); + auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); + auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); + auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); + auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); + auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); + auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); + auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); + auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); + auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); 
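+    // Three detection heads follow: conv81, conv93 and conv105 each produce
+    // 3 * (CLASS_NUM + 5) output channels, and the grouped deconvolutions with
+    // constant 1.0 weights act as 2x nearest-neighbour upsampling between scales.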
+    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);
+    auto lr78 = convBnLeaky(network, weightMap, *lr77->getOutput(0), 1024, 3, 1, 1, 78);
+    auto lr79 = convBnLeaky(network, weightMap, *lr78->getOutput(0), 512, 1, 1, 0, 79);
+    auto lr80 = convBnLeaky(network, weightMap, *lr79->getOutput(0), 1024, 3, 1, 1, 80);
+    IConvolutionLayer* conv81 = network->addConvolutionNd(*lr80->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.81.Conv2d.weight"], weightMap["module_list.81.Conv2d.bias"]);
+    assert(conv81);
+    // 82 is yolo
+    auto l83 = lr79;
+    auto lr84 = convBnLeaky(network, weightMap, *l83->getOutput(0), 256, 1, 1, 0, 84);
+
+    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
+    for (int i = 0; i < 256 * 2 * 2; i++) {
+        deval[i] = 1.0;
+    }
+    Weights deconvwts85{DataType::kFLOAT, deval, 256 * 2 * 2};
+    IDeconvolutionLayer* deconv85 = network->addDeconvolutionNd(*lr84->getOutput(0), 256, DimsHW{2, 2}, deconvwts85, emptywts);
+    assert(deconv85);
+    deconv85->setStrideNd(DimsHW{2, 2});
+    deconv85->setNbGroups(256);
+    weightMap["deconv85"] = deconvwts85;
+
+    ITensor* inputTensors[] = {deconv85->getOutput(0), ew61->getOutput(0)};
+    auto cat86 = network->addConcatenation(inputTensors, 2);
+    auto lr87 = convBnLeaky(network, weightMap, *cat86->getOutput(0), 256, 1, 1, 0, 87);
+    auto lr88 = convBnLeaky(network, weightMap, *lr87->getOutput(0), 512, 3, 1, 1, 88);
+    auto lr89 = convBnLeaky(network, weightMap, *lr88->getOutput(0), 256, 1, 1, 0, 89);
+    auto lr90 = convBnLeaky(network, weightMap, *lr89->getOutput(0), 512, 3, 1, 1, 90);
+    auto lr91 = convBnLeaky(network, weightMap, *lr90->getOutput(0), 256, 1, 1, 0, 91);
+    auto lr92 = convBnLeaky(network, weightMap, *lr91->getOutput(0), 512, 3, 1, 1, 92);
+    IConvolutionLayer* conv93 = network->addConvolutionNd(*lr92->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.93.Conv2d.weight"], weightMap["module_list.93.Conv2d.bias"]);
+    assert(conv93);
+    // 94 is yolo
+    auto l95 = lr91;
+    auto lr96 = convBnLeaky(network, weightMap, *l95->getOutput(0), 128, 1, 1, 0, 96);
+    // reuse deval; only the first 128 * 2 * 2 entries are read for this layer
+    Weights deconvwts97{DataType::kFLOAT, deval, 128 * 2 * 2};
+    IDeconvolutionLayer* deconv97 = network->addDeconvolutionNd(*lr96->getOutput(0), 128, DimsHW{2, 2}, deconvwts97, emptywts);
+    assert(deconv97);
+    deconv97->setStrideNd(DimsHW{2, 2});
+    deconv97->setNbGroups(128);
+    ITensor* inputTensors1[] = {deconv97->getOutput(0), ew36->getOutput(0)};
+    auto cat98 = network->addConcatenation(inputTensors1, 2);
+    auto lr99 = convBnLeaky(network, weightMap, *cat98->getOutput(0), 128, 1, 1, 0, 99);
+    auto lr100 = convBnLeaky(network, weightMap, *lr99->getOutput(0), 256, 3, 1, 1, 100);
+    auto lr101 = convBnLeaky(network, weightMap, *lr100->getOutput(0), 128, 1, 1, 0, 101);
+    auto lr102 = convBnLeaky(network, weightMap, *lr101->getOutput(0), 256, 3, 1, 1, 102);
+    auto lr103 = convBnLeaky(network, weightMap, *lr102->getOutput(0), 128, 1, 1, 0, 103);
+    auto lr104 = convBnLeaky(network, weightMap, *lr103->getOutput(0), 256, 3, 1, 1, 104);
+    IConvolutionLayer* conv105 = network->addConvolutionNd(*lr104->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.105.Conv2d.weight"], weightMap["module_list.105.Conv2d.bias"]);
+    assert(conv105);
+
+    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
+    // pass the class count to the plugin through a PluginField
+    auto class_num = CLASS_NUM;
+    std::vector<PluginField> vpf{
+        PluginField{"numClasses", &class_num, PluginFieldType::kINT32, 1}
+    };
+
+    PluginFieldCollection pfc{ (int)vpf.size(), vpf.data() };
+
IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pfc); + ITensor* inputTensors_yolo[] = {conv81->getOutput(0), conv93->getOutput(0), conv105->getOutput(0)}; + auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); + + yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); + network->markOutput(*yolo->getOutput(0)); + + // Build engine + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB +#ifdef USE_FP16 + config->setFlag(BuilderFlag::kFP16); +#endif + std::cout << "Building engine, please wait for a while..." << std::endl; + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Don't need the network any more + network->destroy(); + + // Release host memory + for (auto& mem : weightMap) + { + free((void*) (mem.second.values)); + } + + return engine; +} + +void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path) { + // Create builder + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + + // Create model to populate the network, then set the outputs and create an engine + ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, weights_path); + assert(engine != nullptr); + + // Serialize the engine + (*modelStream) = engine->serialize(); + + // Close everything down + engine->destroy(); + builder->destroy(); +} + +void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { + const ICudaEngine& engine = context.getEngine(); + + // Pointers to input and output device buffers to pass to engine. + // Engine requires exactly IEngine::getNbBindings() number of buffers. + assert(engine.getNbBindings() == 2); + void* buffers[2]; + + // In order to bind the buffers, we need to know the names of the input and output tensors. 
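+    // getBindingIndex() maps each tensor name to its binding slot, so the buffers array
+    // can be filled in whatever order the engine assigned at build time.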
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); + const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); + + // Create GPU buffers on device + CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); + CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); + + // Create stream + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + + // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host + CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); + context.enqueue(batchSize, buffers, stream, nullptr); + CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); + + // Release stream and buffers + cudaStreamDestroy(stream); + CHECK(cudaFree(buffers[inputIndex])); + CHECK(cudaFree(buffers[outputIndex])); +} + +int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { + DIR *p_dir = opendir(p_dir_name); + if (p_dir == nullptr) { + return -1; + } + + struct dirent* p_file = nullptr; + while ((p_file = readdir(p_dir)) != nullptr) { + if (strcmp(p_file->d_name, ".") != 0 && + strcmp(p_file->d_name, "..") != 0) { + //std::string cur_file_name(p_dir_name); + //cur_file_name += "/"; + //cur_file_name += p_file->d_name; + std::string cur_file_name(p_file->d_name); + file_names.push_back(cur_file_name); + } + } + + closedir(p_dir); + return 0; +} + +int main(int argc, char** argv) { + cudaSetDevice(DEVICE); + // create a model using the API directly and serialize it to a stream + char *trtModelStream{nullptr}; + size_t size{0}; + + if (argc == 2 && std::string(argv[1]) == "-s") { + IHostMemory* modelStream{nullptr}; + APIToModel(1, &modelStream, "yolov3.wts"); + assert(modelStream != nullptr); + std::ofstream p("yolov3.engine", std::ios::binary); + if (!p) { + std::cerr << "could not open plan output file" << std::endl; + return -1; + } + p.write(reinterpret_cast(modelStream->data()), modelStream->size()); + modelStream->destroy(); + return 0; + } else if (argc == 3 && std::string(argv[1]) == "-d") { + std::ifstream file("yolov3.engine", std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + trtModelStream = new char[size]; + assert(trtModelStream); + file.read(trtModelStream, size); + file.close(); + } + } else { + std::cerr << "arguments not right!" << std::endl; + std::cerr << "./yolov3 -s // serialize model to plan file" << std::endl; + std::cerr << "./yolov3 -d ../samples // deserialize plan file and run inference" << std::endl; + return -1; + } + + std::vector file_names; + if (read_files_in_dir(argv[2], file_names) < 0) { + std::cout << "read_files_in_dir failed." 
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // Create a model using the API directly and serialize it to a stream
+    char* trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(1, &modelStream, "yolov3.wts");
+        assert(modelStream != nullptr);
+        std::ofstream p("yolov3.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 3 && std::string(argv[1]) == "-d") {
+        std::ifstream file("yolov3.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "invalid arguments!" << std::endl;
+        std::cerr << "./yolov3 -s             // serialize model to plan file" << std::endl;
+        std::cerr << "./yolov3 -d ../samples  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    std::vector<std::string> file_names;
+    if (read_files_in_dir(argv[2], file_names) < 0) {
+        std::cerr << "read_files_in_dir failed." << std::endl;
+        return -1;
+    }
+
+    // Prepare input data ---------------------------
+    static float data[3 * INPUT_H * INPUT_W];
+    static float prob[OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    int fcount = 0;
+    for (const auto& f : file_names) {
+        fcount++;
+        std::cout << fcount << " " << f << std::endl;
+        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
+        if (img.empty()) continue;
+        cv::Mat pr_img = preprocess_img(img);
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
+            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
+            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+        }
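+
+        // The loop above unpacks the letterboxed BGR HWC image into a CHW
+        // float tensor (plane 0 = R, plane 1 = G, plane 2 = B), with each
+        // channel scaled to [0, 1].
+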
+        // Run inference
+        auto start = std::chrono::system_clock::now();
+        doInference(*context, data, prob, 1);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+        std::vector<Yolo::Detection> res;
+        nms(res, prob);
+        // Debug output: the first 20 raw network outputs, then the detection count
+        for (int i = 0; i < 20; i++) {
+            std::cout << prob[i] << ",";
+        }
+        std::cout << res.size() << std::endl;
+        for (size_t j = 0; j < res.size(); j++) {
+            // Dump the detection's raw floats (bounding box plus confidence/class fields)
+            float* p = (float*) &res[j];
+            for (size_t k = 0; k < 7; k++) {
+                std::cout << p[k] << ", ";
+            }
+            std::cout << std::endl;
+            cv::Rect r = get_rect(img, res[j].bbox);
+            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            cv::putText(img, std::to_string((int) res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+        }
+        cv::imwrite("_" + f, img);
+    }
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+    return 0;
+}
\ No newline at end of file
diff --git a/uw_detection/src/yolov3_prepare_plan.cpp b/uw_detection/src/yolov3_prepare_plan.cpp
new file mode 100644
index 0000000..1ee7079
--- /dev/null
+++ b/uw_detection/src/yolov3_prepare_plan.cpp
@@ -0,0 +1,32 @@
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <ros/package.h>
+
+#define DEVICE 0  // GPU id
+
+using namespace nvinfer1;
+
+// APIToModel is implemented in the yolov3 library this executable links against.
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path);
+
+int main(int argc, char** argv)
+{
+  cudaSetDevice(DEVICE);
+  IHostMemory* modelStream{ nullptr };
+  std::string path = ros::package::getPath("uw_detection");
+  APIToModel(1, &modelStream, path + "/share/yolov3.wts");
+  assert(modelStream != nullptr);
+  std::ofstream p(path + "/share/yolov3.engine", std::ios::binary);
+  if (!p)
+  {
+    std::cerr << "could not open plan output file" << std::endl;
+    return -1;
+  }
+  p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+  modelStream->destroy();
+  return 0;
+}
diff --git a/uw_detection/src/yolov3_spp_prepare_plan.cpp b/uw_detection/src/yolov3_spp_prepare_plan.cpp
new file mode 100644
index 0000000..96df97b
--- /dev/null
+++ b/uw_detection/src/yolov3_spp_prepare_plan.cpp
@@ -0,0 +1,35 @@
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <ros/package.h>
+
+#define DEVICE 0  // GPU id
+
+using namespace nvinfer1;
+
+// APIToModel is implemented in the yolov3 library this executable links against.
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path);
+
+int main(int argc, char** argv)
+{
+  cudaSetDevice(DEVICE);
+  IHostMemory* modelStream{ nullptr };
+  std::string path = ros::package::getPath("uw_detection");
+  APIToModel(1, &modelStream, path + "/share/yolov3-spp_ultralytics68.wts");
+  assert(modelStream != nullptr);
+  std::ofstream p(path + "/share/yolov3-spp.engine", std::ios::binary);
+  if (!p)
+  {
+    std::cerr << "could not open plan output file" << std::endl;
+    return -1;
+  }
+  p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+  modelStream->destroy();
+  return 0;
+}
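+
+// Typical usage: run this executable once after placing the .wts weights in
+// the package share directory; it writes the serialized engine next to them.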