diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..20d75f4 --- /dev/null +++ b/.clang-format @@ -0,0 +1,64 @@ +--- +BasedOnStyle: Google +AccessModifierOffset: -2 +ConstructorInitializerIndentWidth: 2 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AlwaysBreakTemplateDeclarations: true +AlwaysBreakBeforeMultilineStrings: false +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: true +BinPackParameters: true +ColumnLimit: 120 +ConstructorInitializerAllOnOneLineOrOnePerLine: true +DerivePointerBinding: false +PointerBindsToType: true +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 60 +PenaltyBreakString: 100 +PenaltyBreakFirstLessLess: 1000 +PenaltyExcessCharacter: 1000 +PenaltyReturnTypeOnItsOwnLine: 70 +SpacesBeforeTrailingComments: 2 +Cpp11BracedListStyle: false +Standard: Auto +IndentWidth: 2 +TabWidth: 2 +UseTab: Never +IndentFunctionDeclarationAfterType: false +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpaceAfterControlStatementKeyword: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 +SortIncludes: false +SpaceAfterCStyleCast: false + +# Configure each individual brace in BraceWrapping +BreakBeforeBraces: Custom + +# Control of individual brace wrapping cases +BraceWrapping: { + AfterClass: 'true' + AfterControlStatement: 'true' + AfterEnum : 'true' + AfterFunction : 'true' + AfterNamespace : 'true' + AfterStruct : 'true' + AfterUnion : 'true' + BeforeCatch : 'true' + BeforeElse : 'true' + IndentBraces : 'false' +} \ No newline at end of file diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000..6f40a77 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,49 @@ +--- +Checks: '-*, + performance-*, + llvm-namespace-comment, + modernize-redundant-void-arg, + modernize-use-nullptr, + modernize-use-default, + modernize-use-override, + modernize-loop-convert, + readability-named-parameter, + readability-redundant-smartptr-get, + readability-redundant-string-cstr, + readability-simplify-boolean-expr, + readability-container-size-empty, + readability-identifier-naming, + ' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +CheckOptions: + - key: llvm-namespace-comment.ShortNamespaceLines + value: '10' + - key: llvm-namespace-comment.SpacesBeforeComments + value: '2' + - key: readability-braces-around-statements.ShortStatementLines + value: '2' + # type names + - key: readability-identifier-naming.ClassCase + value: CamelCase + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.UnionCase + value: CamelCase + # method names + - key: readability-identifier-naming.MethodCase + value: camelBack + # variable names + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.ClassMemberSuffix + value: '_' + # const static or global variables are UPPER_CASE + - key: readability-identifier-naming.EnumConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.StaticConstantCase + value: UPPER_CASE + - key: 
readability-identifier-naming.ClassConstantCase + value: UPPER_CASE + - key: readability-identifier-naming.GlobalVariableCase + value: UPPER_CASE \ No newline at end of file diff --git a/.flake8 b/.flake8 index 1d0f20d..7877ccc 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 160 -exclude = tests/* +exclude = tests/* uw_detection/src/yolov3 max-complexity = 10 # We import modules here to make the API cleaner per-file-ignores = **/recognize_speech/__init__.py:F401 diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4e0e4d4..0cf493d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -7,6 +7,7 @@ jobs: env: AFTER_SCRIPT: 'cd $target_ws && rosenv && catkin build $(catkin list --depends-on roslint -u) --no-deps --no-status --verbose --make-args roslint' CATKIN_LINT: true + CATKIN_LINT_ARGS: "--ignore missing_directory --ignore external_link_directory" ROS_DISTRO: melodic ROS_REPO: main strategy: @@ -15,7 +16,9 @@ - {} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 + with: + submodules: true - uses: 'ros-industrial/industrial_ci@master' env: ${{matrix.env}} lint: diff --git a/uw_detection/CMakeLists.txt b/uw_detection/CMakeLists.txt new file mode 100644 index 0000000..836d796 --- /dev/null +++ b/uw_detection/CMakeLists.txt @@ -0,0 +1,70 @@ +cmake_minimum_required(VERSION 2.8.12) +project(uw_detection) + +find_package(catkin REQUIRED COMPONENTS + cv_bridge + geometry_msgs + image_transport + message_generation + message_filters + message_runtime + roscpp + sensor_msgs + std_msgs + std_srvs + tf2 + tf2_eigen + visualization_msgs) + +catkin_python_setup() + +catkin_package() + + +option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) +set(CMAKE_CXX_STANDARD 11) + +find_package(OpenCV REQUIRED) +include_directories(${OpenCV_INCLUDE_DIRS} + ${catkin_INCLUDE_DIRS} + include) + +find_package(CUDA) + +if (CUDA_FOUND) +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30) + + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + message("embed_platform on") + include_directories(/usr/local/cuda/targets/aarch64-linux/include) + link_directories(/usr/local/cuda/targets/aarch64-linux/lib) +else() + message("embed_platform off") + include_directories(/usr/local/cuda/include) + link_directories(/usr/local/cuda/lib64) +endif() + + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") + +cuda_add_library(yololayer SHARED src/yolov3/yololayer.cu) + +add_library(yolov3-spp src/yolov3/yolov3-spp.cpp) +target_link_libraries(yolov3-spp nvinfer cudart yololayer ${OpenCV_LIBS}) + +add_library(yolov3 src/yolov3/yolov3.cpp) +target_link_libraries(yolov3 nvinfer cudart yololayer ${OpenCV_LIBS}) + +add_executable(yolov3_spp_prepare_plan src/yolov3_spp_prepare_plan.cpp) +target_link_libraries(yolov3_spp_prepare_plan yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_executable(yolov3_prepare_plan src/yolov3_prepare_plan.cpp) +target_link_libraries(yolov3_prepare_plan yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_executable(yolo_ros_image_node src/yolo_ros_image_node.cpp) +target_link_libraries(yolo_ros_image_node yolov3 nvinfer cudart yololayer ${OpenCV_LIBS} ${catkin_LIBRARIES}) + +add_definitions(-O2 -pthread) +endif() + diff --git a/uw_detection/README.md b/uw_detection/README.md new file mode 100644 index 0000000..67778c5 --- /dev/null +++ b/uw_detection/README.md @@ -0,0 +1,17 @@ +# 
uw_detection + +Wrappers for detection models, plus tools for preparing data and training. + +The TensorRT YOLOv3 implementations are taken from [tensorrtx](https://github.com/wang-xinyu/tensorrtx), with additional modifications based on [this implementation](https://github.com/jkjung-avt/tensorrt_demos/blob/master/plugins/yolo_layer.cu#L39). + +## Usage + +### YOLO + +All of the YOLO targets described here are only available when CMake can detect a working CUDA and TensorRT installation, so ensure these are set up correctly before beginning. They cannot be run on the CPU. + +These implementations come with a helper for loading weights from the Ultralytics PyTorch-trained models. Follow the steps in [their readme](https://github.com/wang-xinyu/tensorrtx/blob/master/yolov3/README.md) (the process involves using one of the `gen_wts.py` scripts to load the Torch graph and dump it into the necessary format). You may need to change the CUDA device set in the script. + +TensorRT requires that you precompute an inference graph ("engine") before you can actually use the weights you've obtained. The `*_prepare_plan.cpp` targets take care of this. They expect the weights file to be placed in the `share` folder at the root of the package. When they've run successfully (only required once), they'll output the serialized engine into the `share` folder. Note that if you're using a customized class list, you'll need to adjust the definition for the number of classes in `yololayer.h`. + +Now you can test the model on a ROS image topic using `yolo_ros_image_node.cpp`. It's configured to take images from Fetch's head camera topic and feed them through YOLOv3, printing the results to the console and drawing them in a CV window. It'll work in simulation without changes, assuming you've followed the previous steps for the Ultralytics-provided yolov3 checkpoint. A rough sketch of the full workflow is shown below. 
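+ +Putting it together, the workflow looks roughly like this (a sketch only: it assumes the targets are run via `rosrun` and that the `.wts` filename matches what `gen_wts.py` produced; adjust names and paths to your setup): + +```sh +# 1. Dump the PyTorch checkpoint to .wts format (inside your ultralytics/yolov3 clone) +python gen_wts.py yolov3-spp-ultralytics.pt +# 2. Copy the weights into this package's share folder +cp yolov3-spp_ultralytics68.wts $(rospack find uw_detection)/share/ +# 3. Serialize the TensorRT engine (one-time step) +rosrun uw_detection yolov3_spp_prepare_plan +# 4. Run the detector against the head camera topic +rosrun uw_detection yolo_ros_image_node +```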
\ No newline at end of file diff --git a/uw_detection/include/uw_detection/Utils.h b/uw_detection/include/uw_detection/Utils.h new file mode 100644 index 0000000..0de663c --- /dev/null +++ b/uw_detection/include/uw_detection/Utils.h @@ -0,0 +1,94 @@ +#ifndef __TRT_UTILS_H_ +#define __TRT_UTILS_H_ + +#include <iostream> +#include <vector> +#include <algorithm> +#include <cudnn.h> + +#ifndef CUDA_CHECK + +#define CUDA_CHECK(callstr) \ + { \ + cudaError_t error_code = callstr; \ + if (error_code != cudaSuccess) { \ + std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ + assert(0); \ + } \ + } + +#endif + +namespace Tn +{ + class Profiler : public nvinfer1::IProfiler + { + public: + void printLayerTimes(int iterationTimes) + { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) + { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), mProfile[i].second / iterationTimes); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime / iterationTimes); + } + private: + typedef std::pair<std::string, float> Record; + std::vector<Record> mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) + { + auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r){ return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + }; + + // Logger for TensorRT info/warning/errors + class Logger : public nvinfer1::ILogger + { + public: + + Logger(): Logger(Severity::kWARNING) {} + + Logger(Severity severity): reportableSeverity(severity) {} + + void log(Severity severity, const char* msg) override + { + // suppress messages with severity enum value greater than the reportable + if (severity > reportableSeverity) return; + + switch (severity) + { + case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break; + case Severity::kERROR: std::cerr << "ERROR: "; break; + case Severity::kWARNING: std::cerr << "WARNING: "; break; + case Severity::kINFO: std::cerr << "INFO: "; break; + default: std::cerr << "UNKNOWN: "; break; + } + std::cerr << msg << std::endl; + } + + Severity reportableSeverity{Severity::kWARNING}; + }; + + template <typename T> + void write(char*& buffer, const T& val) + { + *reinterpret_cast<T*>(buffer) = val; + buffer += sizeof(T); + } + + template <typename T> + void read(const char*& buffer, T& val) + { + val = *reinterpret_cast<const T*>(buffer); + buffer += sizeof(T); + } +} + +#endif \ No newline at end of file diff --git a/uw_detection/include/uw_detection/logging.h b/uw_detection/include/uw_detection/logging.h new file mode 100644 index 0000000..602b69f --- /dev/null +++ b/uw_detection/include/uw_detection/logging.h @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef TENSORRT_LOGGING_H +#define TENSORRT_LOGGING_H + +#include "NvInferRuntimeCommon.h" +#include <cassert> +#include <ctime> +#include <iomanip> +#include <iostream> +#include <ostream> +#include <sstream> +#include <string> + +using Severity = nvinfer1::ILogger::Severity; + +class LogStreamConsumerBuffer : public std::stringbuf +{ +public: + LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mOutput(stream) + , mPrefix(prefix) + , mShouldLog(shouldLog) + { + } + + LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) + : mOutput(other.mOutput) + { + } + + ~LogStreamConsumerBuffer() + { + // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence + // std::streambuf::pptr() gives a pointer to the current position of the output sequence + // if the pointer to the beginning is not equal to the pointer to the current position, + // call putOutput() to log the output to the stream + if (pbase() != pptr()) + { + putOutput(); + } + } + + // synchronizes the stream buffer and returns 0 on success + // synchronizing the stream buffer consists of inserting the buffer contents into the stream, + // resetting the buffer and flushing the stream + virtual int sync() + { + putOutput(); + return 0; + } + + void putOutput() + { + if (mShouldLog) + { + // prepend timestamp + std::time_t timestamp = std::time(nullptr); + tm* tm_local = std::localtime(&timestamp); + std::cout << "["; + std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; + std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; + std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; + // std::stringbuf::str() gets the string contents of the buffer + // insert the buffer contents prepended by the appropriate prefix into the stream + mOutput << mPrefix << str(); + // set the buffer to empty + str(""); + // flush the stream + mOutput.flush(); + } + } + + void setShouldLog(bool shouldLog) + { + mShouldLog = shouldLog; + } + +private: + std::ostream& mOutput; + std::string mPrefix; + bool mShouldLog; +}; + +//! +//! \class LogStreamConsumerBase +//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer +//! +class LogStreamConsumerBase +{ +public: + LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) + : mBuffer(stream, prefix, shouldLog) + { + } + +protected: + LogStreamConsumerBuffer mBuffer; +}; + +//! +//! \class LogStreamConsumer +//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. +//! Order of base classes is LogStreamConsumerBase and then std::ostream. +//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field +//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. +//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. +//! Please do not change the order of the parent classes. +//! +class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream +{ +public: + //! \brief Creates a LogStreamConsumer which logs messages with level severity. + //! 
Reportable severity determines if the messages are severe enough to be logged. + LogStreamConsumer(Severity reportableSeverity, Severity severity) + : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(severity <= reportableSeverity) + , mSeverity(severity) + { + } + + LogStreamConsumer(LogStreamConsumer&& other) + : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) + , std::ostream(&mBuffer) // links the stream buffer with the stream + , mShouldLog(other.mShouldLog) + , mSeverity(other.mSeverity) + { + } + + void setReportableSeverity(Severity reportableSeverity) + { + mShouldLog = mSeverity <= reportableSeverity; + mBuffer.setShouldLog(mShouldLog); + } + +private: + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + static std::string severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + bool mShouldLog; + Severity mSeverity; +}; + +//! \class Logger +//! +//! \brief Class which manages logging of TensorRT tools and samples +//! +//! \details This class provides a common interface for TensorRT tools and samples to log information to the console, +//! and supports logging two types of messages: +//! +//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) +//! - Test pass/fail messages +//! +//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is +//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. +//! +//! In the future, this class could be extended to support dumping test results to a file in some standard format +//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). +//! +//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger +//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT +//! library and messages coming from the sample. +//! +//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the +//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger +//! object. + +class Logger : public nvinfer1::ILogger +{ +public: + Logger(Severity severity = Severity::kWARNING) + : mReportableSeverity(severity) + { + } + + //! + //! \enum TestResult + //! \brief Represents the state of a given test + //! + enum class TestResult + { + kRUNNING, //!< The test is running + kPASSED, //!< The test passed + kFAILED, //!< The test failed + kWAIVED //!< The test was waived + }; + + //! + //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger + //! \return The nvinfer1::ILogger associated with this Logger + //! + //! TODO Once all samples are updated to use this method to register the logger with TensorRT, + //! we can eliminate the inheritance of Logger from ILogger + //! 
+ nvinfer1::ILogger& getTRTLogger() + { + return *this; + } + + //! + //! \brief Implementation of the nvinfer1::ILogger::log() virtual method + //! + //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the + //! inheritance from nvinfer1::ILogger + //! + void log(Severity severity, const char* msg) override + { + LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; + } + + //! + //! \brief Method for controlling the verbosity of logging output + //! + //! \param severity The logger will only emit messages that have severity of this level or higher. + //! + void setReportableSeverity(Severity severity) + { + mReportableSeverity = severity; + } + + //! + //! \brief Opaque handle that holds logging information for a particular test + //! + //! This object is an opaque handle to information used by the Logger to print test results. + //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used + //! with Logger::reportTest{Start,End}(). + //! + class TestAtom + { + public: + TestAtom(TestAtom&&) = default; + + private: + friend class Logger; + + TestAtom(bool started, const std::string& name, const std::string& cmdline) + : mStarted(started) + , mName(name) + , mCmdline(cmdline) + { + } + + bool mStarted; + std::string mName; + std::string mCmdline; + }; + + //! + //! \brief Define a test for logging + //! + //! \param[in] name The name of the test. This should be a string starting with + //! "TensorRT" and containing dot-separated strings containing + //! the characters [A-Za-z0-9_]. + //! For example, "TensorRT.sample_googlenet" + //! \param[in] cmdline The command line used to reproduce the test + // + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + //! + static TestAtom defineTest(const std::string& name, const std::string& cmdline) + { + return TestAtom(false, name, cmdline); + } + + //! + //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments + //! as input + //! + //! \param[in] name The name of the test + //! \param[in] argc The number of command-line arguments + //! \param[in] argv The array of command-line arguments (given as C strings) + //! + //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). + static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) + { + auto cmdline = genCmdlineString(argc, argv); + return defineTest(name, cmdline); + } + + //! + //! \brief Report that a test has started. + //! + //! \pre reportTestStart() has not been called yet for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has started + //! + static void reportTestStart(TestAtom& testAtom) + { + reportTestResult(testAtom, TestResult::kRUNNING); + assert(!testAtom.mStarted); + testAtom.mStarted = true; + } + + //! + //! \brief Report that a test has ended. + //! + //! \pre reportTestStart() has been called for the given testAtom + //! + //! \param[in] testAtom The handle to the test that has ended + //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, + //! TestResult::kFAILED, TestResult::kWAIVED + //! 
+ static void reportTestEnd(const TestAtom& testAtom, TestResult result) + { + assert(result != TestResult::kRUNNING); + assert(testAtom.mStarted); + reportTestResult(testAtom, result); + } + + static int reportPass(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kPASSED); + return EXIT_SUCCESS; + } + + static int reportFail(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kFAILED); + return EXIT_FAILURE; + } + + static int reportWaive(const TestAtom& testAtom) + { + reportTestEnd(testAtom, TestResult::kWAIVED); + return EXIT_SUCCESS; + } + + static int reportTest(const TestAtom& testAtom, bool pass) + { + return pass ? reportPass(testAtom) : reportFail(testAtom); + } + + Severity getReportableSeverity() const + { + return mReportableSeverity; + } + +private: + //! + //! \brief returns an appropriate string for prefixing a log message with the given severity + //! + static const char* severityPrefix(Severity severity) + { + switch (severity) + { + case Severity::kINTERNAL_ERROR: return "[F] "; + case Severity::kERROR: return "[E] "; + case Severity::kWARNING: return "[W] "; + case Severity::kINFO: return "[I] "; + case Severity::kVERBOSE: return "[V] "; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate string for prefixing a test result message with the given result + //! + static const char* testResultString(TestResult result) + { + switch (result) + { + case TestResult::kRUNNING: return "RUNNING"; + case TestResult::kPASSED: return "PASSED"; + case TestResult::kFAILED: return "FAILED"; + case TestResult::kWAIVED: return "WAIVED"; + default: assert(0); return ""; + } + } + + //! + //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity + //! + static std::ostream& severityOstream(Severity severity) + { + return severity >= Severity::kINFO ? std::cout : std::cerr; + } + + //! + //! \brief method that implements logging test results + //! + static void reportTestResult(const TestAtom& testAtom, TestResult result) + { + severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " + << testAtom.mCmdline << std::endl; + } + + //! + //! \brief generate a command line string from the given (argc, argv) values + //! + static std::string genCmdlineString(int argc, char const* const* argv) + { + std::stringstream ss; + for (int i = 0; i < argc; i++) + { + if (i > 0) + ss << " "; + ss << argv[i]; + } + return ss.str(); + } + + Severity mReportableSeverity; +}; + +namespace +{ + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE +//! +//! Example usage: +//! +//! LOG_VERBOSE(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO +//! +//! Example usage: +//! +//! LOG_INFO(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_INFO(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING +//! +//! Example usage: +//! +//! LOG_WARN(logger) << "hello world" << std::endl; +//! 
+inline LogStreamConsumer LOG_WARN(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR +//! +//! Example usage: +//! +//! LOG_ERROR(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_ERROR(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); +} + +//! +//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR +//! ("fatal" severity) +//! +//! Example usage: +//! +//! LOG_FATAL(logger) << "hello world" << std::endl; +//! +inline LogStreamConsumer LOG_FATAL(const Logger& logger) +{ + return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); +} + +} // anonymous namespace + +#endif // TENSORRT_LOGGING_H diff --git a/uw_detection/include/uw_detection/yololayer.h b/uw_detection/include/uw_detection/yololayer.h new file mode 100644 index 0000000..91af21e --- /dev/null +++ b/uw_detection/include/uw_detection/yololayer.h @@ -0,0 +1,158 @@ +#ifndef _YOLO_LAYER_H +#define _YOLO_LAYER_H + +#include <assert.h> +#include <math.h> +#include <string.h> +#include <vector> +#include "NvInfer.h" +#include <iostream> +#include <string> + +namespace Yolo +{ + static constexpr int CHECK_COUNT = 3; + static constexpr float IGNORE_THRESH = 0.1f; + static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000; + static constexpr int INPUT_H = 608; + static constexpr int INPUT_W = 608; + + struct YoloKernel + { + int width; + int height; + float anchors[CHECK_COUNT*2]; + }; + + static constexpr YoloKernel yolo1 = { + INPUT_W / 32, + INPUT_H / 32, + {116,90, 156,198, 373,326} + }; + static constexpr YoloKernel yolo2 = { + INPUT_W / 16, + INPUT_H / 16, + {30,61, 62,45, 59,119} + }; + static constexpr YoloKernel yolo3 = { + INPUT_W / 8, + INPUT_H / 8, + {10,13, 16,30, 33,23} + }; + + static constexpr int LOCATIONS = 4; + struct alignas(float) Detection{ + // x y w h + float bbox[LOCATIONS]; + float det_confidence; + float class_id; + float class_confidence; + }; +} + + +namespace nvinfer1 +{ + class YoloLayerPlugin: public IPluginV2IOExt + { + public: + explicit YoloLayerPlugin(int num_classes); + YoloLayerPlugin(const void* data, size_t length); + + ~YoloLayerPlugin(); + + int getNbOutputs() const override + { + return 1; + } + + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + + int initialize() override; + + virtual void terminate() override {}; + + virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + + virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + + virtual size_t getSerializationSize() const override; + + virtual void serialize(void* buffer) const override; + + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { + return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; + } + + const char* getPluginType() const override; + + const char* getPluginVersion() const override; + + void destroy() override; + + IPluginV2IOExt* clone() const override; + + void setPluginNamespace(const char* pluginNamespace) override; + + const char* getPluginNamespace() const override; + + DataType getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const override; + + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* 
inputIsBroadcasted, int nbInputs) const override; + + bool canBroadcastInputAcrossBatch(int inputIndex) const override; + + void attachToContext( + cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override; + + void detachFromContext() override; + + private: + void forwardGpu(const float *const * inputs,float * output, cudaStream_t stream,int batchSize = 1); + int mClassCount; + int mKernelCount; + std::vector<Yolo::YoloKernel> mYoloKernel; + int mThreadCount = 256; + const char* mPluginNamespace; + }; + + class YoloPluginCreator : public IPluginCreator + { + public: + YoloPluginCreator(); + + ~YoloPluginCreator() override = default; + + const char* getPluginName() const override; + + const char* getPluginVersion() const override; + + const PluginFieldCollection* getFieldNames() override; + + IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; + + IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + + void setPluginNamespace(const char* libNamespace) override + { + mNamespace = libNamespace; + } + + const char* getPluginNamespace() const override + { + return mNamespace.c_str(); + } + + private: + std::string mNamespace; + static PluginFieldCollection mFC; + static std::vector<PluginField> mPluginAttributes; + }; + REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); + + +}; + +#endif diff --git a/uw_detection/include/uw_detection/yolov3-spp.h b/uw_detection/include/uw_detection/yolov3-spp.h new file mode 100644 index 0000000..a39da6a --- /dev/null +++ b/uw_detection/include/uw_detection/yolov3-spp.h @@ -0,0 +1,13 @@ +#pragma once +#include "NvInfer.h" +#include <opencv2/opencv.hpp> +#include <uw_detection/yololayer.h> + +void APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream, const std::string& weights_path); + +void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize); + +cv::Mat preprocess_img(cv::Mat& img); + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector<Yolo::Detection>& res, float* output, float nms_thresh); \ No newline at end of file diff --git a/uw_detection/include/uw_detection/yolov3.h b/uw_detection/include/uw_detection/yolov3.h new file mode 100644 index 0000000..6cdc155 --- /dev/null +++ b/uw_detection/include/uw_detection/yolov3.h @@ -0,0 +1,14 @@ +#pragma once +#include "NvInfer.h" +#include <opencv2/opencv.hpp> +#include <uw_detection/yololayer.h> + +void APIToModel(unsigned int maxBatchSize, nvinfer1::IHostMemory** modelStream, const std::string& weights_path); + +void doInference(nvinfer1::IExecutionContext& context, float* input, float* output, int batchSize); + +cv::Mat preprocess_img(cv::Mat& img); + +cv::Rect get_rect(cv::Mat& img, float bbox[4]); + +void nms(std::vector<Yolo::Detection>& res, float* output, float nms_thresh); \ No newline at end of file diff --git a/uw_detection/package.xml b/uw_detection/package.xml new file mode 100644 index 0000000..772408d --- /dev/null +++ b/uw_detection/package.xml @@ -0,0 +1,33 @@ +<?xml version="1.0"?> +<package format="2"> + <name>uw_detection</name> + <version>0.1.0</version> + <description> + Detection models, as well as data prep, labeling, and training tools + </description> + + <license>MIT</license> + <author>UW RoboCup Contributors</author> + <maintainer email="uw-robocup@example.com">UW RoboCup Contributors</maintainer> + + <buildtool_depend>catkin</buildtool_depend> + + <depend>catkin</depend> + <depend>rospack</depend> + + <depend>cv_bridge</depend> + <depend>roscpp</depend> + <depend>std_msgs</depend> + <depend>sensor_msgs</depend> + <depend>geometry_msgs</depend> + <depend>visualization_msgs</depend> + <depend>message_generation</depend> + <depend>message_runtime</depend> + + <depend>image_transport</depend> + <depend>octomap</depend> + <depend>std_srvs</depend> + <depend>tf2</depend> + <depend>tf2_eigen</depend> + <depend>message_filters</depend> +</package> \ No newline at end of file diff --git a/uw_detection/scripts/camera_detect_demo 
b/uw_detection/scripts/camera_detect_demo new file mode 100755 index 0000000..ba8340c --- /dev/null +++ b/uw_detection/scripts/camera_detect_demo @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +import argparse +from sys import platform + +from yolov3.models import * # set ONNX_EXPORT in models.py +from yolov3.utils.datasets import * +from yolov3.utils.utils import * +import cv2 +import pyrealsense2 as rs +import torchvision.transforms as transforms + + +def detect(pipe=None, save_img=False): + img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) + out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt + webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') + + # Initialize + device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) + if os.path.exists(out): + shutil.rmtree(out) # delete output folder + os.makedirs(out) # make new output folder + + # Initialize model + model = Darknet(opt.cfg, img_size) + + # Load weights + attempt_download(weights) + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) + else: # darknet format + load_darknet_weights(model, weights) + + # Fuse Conv2d + BatchNorm2d layers + # model.fuse() + # torch_utils.model_info(model, report='summary') # 'full' or 'summary' + + # Eval mode + model.to(device).eval() + + # Export mode + if ONNX_EXPORT: + model.fuse() + img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) + torch.onnx.export(model, img, 'weights/export.onnx', verbose=False, opset_version=11) + + # Validate exported model + import onnx + model = onnx.load('weights/export.onnx') # Load the ONNX model + onnx.checker.check_model(model) # Check that the IR is well formed + print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph + return + + # Half precision + half = half and device.type != 'cpu' # half precision only supported on CUDA + if half: + model.half() + + # Set Dataloader + vid_path, vid_writer = None, None + if webcam: + view_img = True + torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference + # dataset = LoadStreams(source, img_size=img_size, half=half) + else: + save_img = True + # dataset = LoadImages(source, img_size=img_size, half=half) + + # Get names and colors + names = load_classes(opt.names) + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] + + # Run inference + t0 = time.time() + # for path, img, im0s, vid_cap in dataset: + while True: + t = time.time() + + # Get detections + img = pipe.wait_for_frames() + + # Align the depth frame to color frame + aligned_frames = align.process(img) + + # Get aligned frames + aligned_depth_frame = aligned_frames.get_depth_frame() # aligned_depth_frame is a 640x480 depth image + aligned_color_frame = aligned_frames.get_color_frame() + + # convert the RealSense frames to numpy arrays before resizing + aligned_color_frame = np.asanyarray(aligned_color_frame.get_data()) + # im0 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + # im0 = cv2.resize(im0, (640,480)) + aligned_color_frame = cv2.resize(aligned_color_frame, (320, 320)) + im0 = cv2.cvtColor(aligned_color_frame, cv2.COLOR_RGB2BGR) # im0 is original image + img = transforms.ToTensor()(im0).to(device) + + aligned_depth_frame = np.asanyarray(aligned_depth_frame.get_data()) + aligned_depth_frame = cv2.resize(aligned_depth_frame, (320, 320)) + + # img = torch.from_numpy(img).to(device) + if img.ndimension() == 3: + img = img.unsqueeze(0) + pred = 
model(img)[0] + + if opt.half: + pred = pred.float() + + # Apply NMS + pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) + + # Apply Classifier + # if classify: + # pred = apply_classifier(pred, modelc, img, im0s) + + # Process detections + for i, det in enumerate(pred): # detections per image + # depth mask for each of the objects + class_mask = np.zeros((320, 320, 21)) + + s = "" + # if webcam: # batch_size >= 1 + # p, s, im0 = " ", '%g: ' % i, im0s[i] + # else: + # p, s, im0 = path, '', im0s + + # save_path = str(Path(out) / Path(p).name) + s += '%gx%g ' % img.shape[2:] # print string + if det is not None and len(det): + # Rescale boxes from img_size to im0 size + det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() + + # Print results + for c in det[:, -1].unique(): + n = (det[:, -1] == c).sum() # detections per class + s += '%g %ss, ' % (n, names[int(c)]) # add to string + + # Write results + for *xyxy, conf, cls in det: + if save_img or view_img: # Add bbox to image + label = '%s %.2f' % (names[int(cls)], conf) + plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) + # print(det) + + # fill in bounding box area for mask + # cv2.contour for each channel + # for each contour -> find the centroid + + # depth + color image combine to overlay + + # Stream results + if True: + print(1 / (time.time() - t)) + # cv2.imshow("output", im0) + # cv2.imshow("resized", cv2.resize(im0, (1080,720))) + + if cv2.waitKey(1) == ord('q'): # q to quit + raise StopIteration + + # Save results (image with detections) + # if save_img: + # if dataset.mode == 'images': + # cv2.imwrite(save_path, im0) + # else: + # if vid_path != save_path: # new video + # vid_path = save_path + # if isinstance(vid_writer, cv2.VideoWriter): + # vid_writer.release() # release previous video writer + + # fps = vid_cap.get(cv2.CAP_PROP_FPS) + # w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + # h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + # vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) + # vid_writer.write(im0) + + # if save_txt or save_img: + # print('Results saved to %s' % os.getcwd() + os.sep + out) + # if platform == 'darwin': # MacOS + # os.system('open ' + out + ' ' + save_path) + + # print('Done. (%.3fs)' % (time.time() - t0)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') + parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path') + parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') + parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam + parser.add_argument('--output', type=str, default='output', help='output folder') # output folder + parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)') + parser.add_argument('--half', action='store_true', help='half precision FP16 inference') + parser.add_argument('--device', default='', help='device id (i.e. 
0 or 0,1) or cpu') + parser.add_argument('--view-img', action='store_true', help='display results') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--classes', nargs='+', type=int, help='filter by class') + parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') + opt = parser.parse_args() + print(opt) + + align_to = rs.stream.color + align = rs.align(align_to) + + pipeline = rs.pipeline() + pipeline.start() + + with torch.no_grad(): + detect(pipe=pipeline) diff --git a/uw_detection/scripts/camera_detect_sim b/uw_detection/scripts/camera_detect_sim new file mode 100644 index 0000000..ba8340c --- /dev/null +++ b/uw_detection/scripts/camera_detect_sim @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +import argparse +from sys import platform + +from yolov3.models import * # set ONNX_EXPORT in models.py +from yolov3.utils.datasets import * +from yolov3.utils.utils import * +import cv2 +import pyrealsense2 as rs +import torchvision.transforms as transforms + + +def detect(pipe=None, save_img=False): + img_size = (320, 192) if ONNX_EXPORT else opt.img_size # (320, 192) or (416, 256) or (608, 352) for (height, width) + out, source, weights, half, view_img, save_txt = opt.output, opt.source, opt.weights, opt.half, opt.view_img, opt.save_txt + webcam = source == '0' or source.startswith('rtsp') or source.startswith('http') or source.endswith('.txt') + + # Initialize + device = torch_utils.select_device(device='cpu' if ONNX_EXPORT else opt.device) + if os.path.exists(out): + shutil.rmtree(out) # delete output folder + os.makedirs(out) # make new output folder + + # Initialize model + model = Darknet(opt.cfg, img_size) + + # Load weights + attempt_download(weights) + if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) + else: # darknet format + load_darknet_weights(model, weights) + + # Fuse Conv2d + BatchNorm2d layers + # model.fuse() + # torch_utils.model_info(model, report='summary') # 'full' or 'summary' + + # Eval mode + model.to(device).eval() + + # Export mode + if ONNX_EXPORT: + model.fuse() + img = torch.zeros((1, 3) + img_size) # (1, 3, 320, 192) + torch.onnx.export(model, img, 'weights/export.onnx', verbose=False, opset_version=11) + + # Validate exported model + import onnx + model = onnx.load('weights/export.onnx') # Load the ONNX model + onnx.checker.check_model(model) # Check that the IR is well formed + print(onnx.helper.printable_graph(model.graph)) # Print a human readable representation of the graph + return + + # Half precision + half = half and device.type != 'cpu' # half precision only supported on CUDA + if half: + model.half() + + # Set Dataloader + vid_path, vid_writer = None, None + if webcam: + view_img = True + torch.backends.cudnn.benchmark = True # set True to speed up constant image size inference + # dataset = LoadStreams(source, img_size=img_size, half=half) + else: + save_img = True + # dataset = LoadImages(source, img_size=img_size, half=half) + + # Get names and colors + names = load_classes(opt.names) + colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))] + + # Run inference + t0 = time.time() + # for path, img, im0s, vid_cap in dataset: + while True: + t = time.time() + + # Get detections + img = pipe.wait_for_frames() + + # Align the depth frame to color frame + aligned_frames = align.process(img) + + # Get aligned frames + aligned_depth_frame = aligned_frames.get_depth_frame() # 
aligned_depth_frame is a 640x480 depth image + aligned_color_frame = aligned_frames.get_color_frame() + + # convert the RealSense frames to numpy arrays before resizing + aligned_color_frame = np.asanyarray(aligned_color_frame.get_data()) + # im0 = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + # im0 = cv2.resize(im0, (640,480)) + aligned_color_frame = cv2.resize(aligned_color_frame, (320, 320)) + im0 = cv2.cvtColor(aligned_color_frame, cv2.COLOR_RGB2BGR) # im0 is original image + img = transforms.ToTensor()(im0).to(device) + + aligned_depth_frame = np.asanyarray(aligned_depth_frame.get_data()) + aligned_depth_frame = cv2.resize(aligned_depth_frame, (320, 320)) + + # img = torch.from_numpy(img).to(device) + if img.ndimension() == 3: + img = img.unsqueeze(0) + pred = model(img)[0] + + if opt.half: + pred = pred.float() + + # Apply NMS + pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms) + + # Apply Classifier + # if classify: + # pred = apply_classifier(pred, modelc, img, im0s) + + # Process detections + for i, det in enumerate(pred): # detections per image + # depth mask for each of the objects + class_mask = np.zeros((320, 320, 21)) + + s = "" + # if webcam: # batch_size >= 1 + # p, s, im0 = " ", '%g: ' % i, im0s[i] + # else: + # p, s, im0 = path, '', im0s + + # save_path = str(Path(out) / Path(p).name) + s += '%gx%g ' % img.shape[2:] # print string + if det is not None and len(det): + # Rescale boxes from img_size to im0 size + det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round() + + # Print results + for c in det[:, -1].unique(): + n = (det[:, -1] == c).sum() # detections per class + s += '%g %ss, ' % (n, names[int(c)]) # add to string + + # Write results + for *xyxy, conf, cls in det: + if save_img or view_img: # Add bbox to image + label = '%s %.2f' % (names[int(cls)], conf) + plot_one_box(xyxy, im0, label=label, color=colors[int(cls)]) + # print(det) + + # fill in bounding box area for mask + # cv2.contour for each channel + # for each contour -> find the centroid + + # depth + color image combine to overlay + + # Stream results + if True: + print(1 / (time.time() - t)) + # cv2.imshow("output", im0) + # cv2.imshow("resized", cv2.resize(im0, (1080,720))) + + if cv2.waitKey(1) == ord('q'): # q to quit + raise StopIteration + + # Save results (image with detections) + # if save_img: + # if dataset.mode == 'images': + # cv2.imwrite(save_path, im0) + # else: + # if vid_path != save_path: # new video + # vid_path = save_path + # if isinstance(vid_writer, cv2.VideoWriter): + # vid_writer.release() # release previous video writer + + # fps = vid_cap.get(cv2.CAP_PROP_FPS) + # w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + # h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + # vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*opt.fourcc), fps, (w, h)) + # vid_writer.write(im0) + + # if save_txt or save_img: + # print('Results saved to %s' % os.getcwd() + os.sep + out) + # if platform == 'darwin': # MacOS + # os.system('open ' + out + ' ' + save_path) + + # print('Done. 
(%.3fs)' % (time.time() - t0)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', type=str, default='cfg/yolov3-spp.cfg', help='*.cfg path') + parser.add_argument('--names', type=str, default='data/coco.names', help='*.names path') + parser.add_argument('--weights', type=str, default='weights/yolov3-spp-ultralytics.pt', help='weights path') + parser.add_argument('--source', type=str, default='data/samples', help='source') # input file/folder, 0 for webcam + parser.add_argument('--output', type=str, default='output', help='output folder') # output folder + parser.add_argument('--img-size', type=int, default=416, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.3, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec (verify ffmpeg support)') + parser.add_argument('--half', action='store_true', help='half precision FP16 inference') + parser.add_argument('--device', default='', help='device id (i.e. 0 or 0,1) or cpu') + parser.add_argument('--view-img', action='store_true', help='display results') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--classes', nargs='+', type=int, help='filter by class') + parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS') + opt = parser.parse_args() + print(opt) + + align_to = rs.stream.color + align = rs.align(align_to) + + pipeline = rs.pipeline() + pipeline.start() + + with torch.no_grad(): + detect(pipe=pipeline) diff --git a/uw_detection/scripts/camera_to_baselink b/uw_detection/scripts/camera_to_baselink new file mode 100755 index 0000000..1032c6f --- /dev/null +++ b/uw_detection/scripts/camera_to_baselink @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +import tf +import rospy +from tf import TransformListener +from std_msgs.msg import Float32MultiArray, MultiArrayDimension +import numpy as np + +position = None + +def callback(data): + centroids_in_baselink = [] + if position is not None: + # Convert each centroid prediction from the realsense frame to the base_link frame + # (translation only; the camera-to-base rotation is ignored here) + #rospy.loginfo("DATA: ", data) + #rospy.loginfo("POSITION: ", position) + # data is a 50x4 array, 50 samples of [x,y,z,class_idx] + data = np.array(data.data).reshape(50, 4) + #print(position) + for i, row in enumerate(data): + new_x = row[0] + position[0] + new_y = row[1] + position[1] + new_z = row[2] + position[2] + data[i,:] = np.array([new_x, new_y, new_z, row[3]]) + # publish converted array + centroid_msg = Float32MultiArray() + centroid_msg.layout.data_offset = 0 + centroid_msg.layout.dim = [MultiArrayDimension(), MultiArrayDimension()] + centroid_msg.layout.dim[0].label = "channels" + centroid_msg.layout.dim[0].size = 4 + centroid_msg.layout.dim[0].stride = 200 + centroid_msg.layout.dim[1].label = "samples" + centroid_msg.layout.dim[1].size = 50 + centroid_msg.layout.dim[1].stride = 50 + centroid_msg.data = data.flatten() + pub.publish(centroid_msg) + rate.sleep() + +if __name__ == "__main__": + rospy.init_node('centroid_listener', anonymous=True) + rospy.Subscriber("object_centroids", Float32MultiArray, callback) + pub = rospy.Publisher('object_centroids_baselink', Float32MultiArray, queue_size=10) + #rospy.init_node("baselink_tf_listener") + listener = TransformListener() + rate = rospy.Rate(100.) 
+ while not rospy.is_shutdown(): + listener.waitForTransform("/base_link", "/head_external_camera_depth_frame", rospy.Time(), rospy.Duration(3.0)) + #t = listener.getLatestCommonTime("/base_link", "/head_external_camera_aligned_depth_to_color_frame") + position, _ = listener.lookupTransform("/base_link", "/head_external_camera_depth_frame", rospy.Time()) + rate.sleep() diff --git a/uw_detection/setup.py b/uw_detection/setup.py new file mode 100644 index 0000000..756d253 --- /dev/null +++ b/uw_detection/setup.py @@ -0,0 +1,11 @@ +# ! DO NOT MANUALLY INVOKE THIS setup.py, USE CATKIN INSTEAD + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +# fetch values from package.xml +setup_args = generate_distutils_setup( + packages=['yolov3'], + package_dir={'': 'src'}) + +setup(**setup_args) diff --git a/uw_detection/share/.gitignore b/uw_detection/share/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/uw_detection/share/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/uw_detection/src/yolo_ros_image_node.cpp b/uw_detection/src/yolo_ros_image_node.cpp new file mode 100644 index 0000000..5166c7f --- /dev/null +++ b/uw_detection/src/yolo_ros_image_node.cpp @@ -0,0 +1,113 @@ +#include <ros/ros.h> +#include <ros/package.h> +#include <image_transport/image_transport.h> +#include <cv_bridge/cv_bridge.h> +#include <opencv2/highgui/highgui.hpp> +#include <chrono> +#include <fstream> +#include <uw_detection/logging.h> +#include <uw_detection/yololayer.h> +#include <uw_detection/yolov3.h> +#define NMS_THRESH 0.4 + +using namespace nvinfer1; + +// stuff we know about the network and the input/output blobs +static const int INPUT_H = Yolo::INPUT_H; +static const int INPUT_W = Yolo::INPUT_W; +static const int OUTPUT_SIZE = 1000 * 7 + 1; // we assume the yololayer outputs + // no more than 1000 boxes that + // conf >= 0.1 + +static Logger gLogger; +// prepare input data --------------------------- +static float data[3 * INPUT_H * INPUT_W]; +// for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++) +// data[i] = 1.0; +static float prob[OUTPUT_SIZE]; + +static IExecutionContext* context; + +void imageCallback(const sensor_msgs::ImageConstPtr& msg) +{ + try + { + auto img = cv_bridge::toCvShare(msg, "bgr8")->image; + + cv::Mat pr_img = preprocess_img(img); + + for (int i = 0; i < INPUT_H * INPUT_W; i++) + { + data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0; + data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0; + data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0; + } + + auto start = std::chrono::system_clock::now(); + doInference(*context, data, prob, 1); + auto end = std::chrono::system_clock::now(); + std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; + std::vector<Yolo::Detection> res; + nms(res, prob, NMS_THRESH); + for (int i = 0; i < 20; i++) + { + std::cout << prob[i] << ","; + } + std::cout << res.size() << std::endl; + for (size_t j = 0; j < res.size(); j++) + { + float* p = (float*)&res[j]; + for (size_t k = 0; k < 7; k++) + { + std::cout << p[k] << ", "; + } + std::cout << std::endl; + cv::Rect r = get_rect(img, res[j].bbox); + cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2); + cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, + cv::Scalar(0xFF, 0xFF, 0xFF), 2); + } + cv::imshow("view", img); + } + catch (cv_bridge::Exception& e) + { + ROS_ERROR("Could not convert from '%s' to 'bgr8'.", msg->encoding.c_str()); + } +} + +int main(int argc, char** argv) +{ + cudaSetDevice(0); + size_t size{ 0 }; + + char* trtModelStream{ nullptr }; + + ros::init(argc, argv, "image_listener"); + std::string path = ros::package::getPath("uw_detection"); + std::ifstream 
file(path + "/share/yolov3.engine", std::ios::binary); + if (file.good()) + { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + trtModelStream = new char[size]; + assert(trtModelStream); + file.read(trtModelStream, size); + file.close(); + } + else + { + // bail out early instead of deserializing a null engine blob + ROS_ERROR("Could not open %s; run the *_prepare_plan target first.", (path + "/share/yolov3.engine").c_str()); + return 1; + } + + IRuntime* runtime = createInferRuntime(gLogger); + assert(runtime != nullptr); + ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); + assert(engine != nullptr); + context = engine->createExecutionContext(); + assert(context != nullptr); + delete[] trtModelStream; + ros::NodeHandle nh; + cv::namedWindow("view"); + cv::startWindowThread(); + image_transport::ImageTransport it(nh); + image_transport::Subscriber sub = it.subscribe("/head_camera/rgb/image_raw", 1, imageCallback); + ros::spin(); + cv::destroyWindow("view"); +} \ No newline at end of file diff --git a/uw_detection/src/yolov3/README.md b/uw_detection/src/yolov3/README.md new file mode 100644 index 0000000..ec38033 --- /dev/null +++ b/uw_detection/src/yolov3/README.md @@ -0,0 +1,64 @@ +This directory contains both the SPP and regular YOLOv3 implementations copied from tensorrtx. The minimum changes required to catkinize the code were applied; headers were moved into the package include directory, and the model API was exposed and modified to accept a path to the weights file. + + + +# yolov3-spp + +yolov4 is [here](../yolov4). + +The PyTorch implementation is [ultralytics/yolov3](https://github.com/ultralytics/yolov3). It provides two trained weights of yolov3-spp, `yolov3-spp.pt` and `yolov3-spp-ultralytics.pt` (originally named `ultralytics68.pt`). + +The following tricks are used in this yolov3-spp: + +- The yololayer plugin is different from the plugin used in [this repo's yolov3](https://github.com/wang-xinyu/tensorrtx/tree/master/yolov3). In this version, all three yolo layers are implemented in one plugin to improve speed, with code derived from [lewes6369/TensorRT-Yolov3](https://github.com/lewes6369/TensorRT-Yolov3). +- The batchnorm layer is implemented with a scale layer. + +## Execute + +``` +1. generate yolov3-spp_ultralytics68.wts from pytorch implementation with yolov3-spp.cfg and yolov3-spp-ultralytics.pt + +git clone https://github.com/wang-xinyu/tensorrtx.git +git clone https://github.com/ultralytics/yolov3.git +// download its weights 'yolov3-spp-ultralytics.pt' +cd yolov3 +cp ../tensorrtx/yolov3-spp/gen_wts.py . +python gen_wts.py yolov3-spp-ultralytics.pt +// a file 'yolov3-spp_ultralytics68.wts' will be generated. +// the master branch of yolov3 should work, if not, you can checkout 4ac60018f6e6c1e24b496485f126a660d9c793d8 + +2. put yolov3-spp_ultralytics68.wts into yolov3-spp, build and run + +mv yolov3-spp_ultralytics68.wts ../tensorrtx/yolov3-spp/ +cd ../tensorrtx/yolov3-spp +mkdir build +cd build +cmake .. +make +sudo ./yolov3-spp -s // serialize model to plan file i.e. 'yolov3-spp.engine' +sudo ./yolov3-spp -d ../samples // deserialize plan file and run inference, the images in samples will be processed. + +3. check the generated images, _zidane.jpg and _bus.jpg +```
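+ +The definition you'll most often need to touch when retraining is the class count mentioned in the config list below. As a hedged sketch, the relevant constants in the upstream tensorrtx `yololayer.h` look like this (in this catkinized copy the class count is instead passed to the `YoloLayerPlugin` constructor, so the exact names may differ): + +```cpp +// namespace Yolo in yololayer.h: detector geometry is fixed at compile time. +// CLASS_NUM must match the class list the .wts checkpoint was trained with. +static constexpr int CLASS_NUM = 80;  // 80 for the stock COCO checkpoints +static constexpr int INPUT_H = 608; +static constexpr int INPUT_W = 608; +```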
+## Config + +- Input shape defined in yololayer.h +- Number of classes defined in yololayer.h +- FP16/FP32 can be selected by the macro in yolov3-spp.cpp +- GPU id can be selected by the macro in yolov3-spp.cpp +- NMS thresh in yolov3-spp.cpp +- BBox confidence thresh in yolov3-spp.cpp + +## More Information + +See the readme on the [home page](https://github.com/wang-xinyu/tensorrtx). + diff --git a/uw_detection/src/yolov3/gen_wts.py b/uw_detection/src/yolov3/gen_wts.py new file mode 100644 index 0000000..52c8e1c --- /dev/null +++ b/uw_detection/src/yolov3/gen_wts.py @@ -0,0 +1,23 @@ +import struct +import sys +from models import * +from utils.utils import * + +model = Darknet('cfg/yolov3.cfg', (608, 608)) +weights = sys.argv[1] +device = torch_utils.select_device('0') +if weights.endswith('.pt'): # pytorch format + model.load_state_dict(torch.load(weights, map_location=device)['model']) +else: # darknet format + load_darknet_weights(model, weights) +model = model.eval() + +f = open('yolov3.wts', 'w') +f.write('{}\n'.format(len(model.state_dict().keys()))) +for k, v in model.state_dict().items(): + vr = v.reshape(-1).cpu().numpy() + f.write('{} {} '.format(k, len(vr))) + for vv in vr: + f.write(' ') + f.write(struct.pack('>f',float(vv)).hex()) + f.write('\n') \ No newline at end of file diff --git a/uw_detection/src/yolov3/gen_wts_spp.py b/uw_detection/src/yolov3/gen_wts_spp.py new file mode 100644 index 0000000..7604c77 --- /dev/null +++ b/uw_detection/src/yolov3/gen_wts_spp.py @@ -0,0 +1,22 @@ +import struct +import sys +from models import * +from utils.utils import * + +model = Darknet('cfg/yolov3-spp.cfg', (416, 416)) +weights = sys.argv[1] +dev = '1' +device = torch_utils.select_device(dev) +model.load_state_dict(torch.load(weights, map_location=device)['model']) + + +f = open('yolov3-spp_ultralytics68.wts', 'w') +f.write('{}\n'.format(len(model.state_dict().keys()))) +for k, v in model.state_dict().items(): + vr = v.reshape(-1).cpu().numpy() + f.write('{} {} '.format(k, len(vr))) + for vv in vr: + f.write(' ') + f.write(struct.pack('>f',float(vv)).hex()) + f.write('\n') + diff --git a/uw_detection/src/yolov3/samples/bus.jpg b/uw_detection/src/yolov3/samples/bus.jpg new file mode 100644 index 0000000..b43e311 Binary files /dev/null and b/uw_detection/src/yolov3/samples/bus.jpg differ diff --git a/uw_detection/src/yolov3/samples/zidane.jpg b/uw_detection/src/yolov3/samples/zidane.jpg new file mode 100644 index 0000000..92d72ea Binary files /dev/null and b/uw_detection/src/yolov3/samples/zidane.jpg differ diff --git a/uw_detection/src/yolov3/yololayer.cu b/uw_detection/src/yolov3/yololayer.cu new file mode 100644 index 0000000..8ff75b6 --- /dev/null +++ b/uw_detection/src/yolov3/yololayer.cu @@ -0,0 +1,307 @@ +#include <uw_detection/yololayer.h> +#include <uw_detection/Utils.h> + +using namespace Yolo; + +namespace nvinfer1 +{ + YoloLayerPlugin::YoloLayerPlugin(int num_classes) + { + mClassCount = num_classes; + mYoloKernel.clear(); + mYoloKernel.push_back(yolo1); + mYoloKernel.push_back(yolo2); + mYoloKernel.push_back(yolo3); + + mKernelCount = mYoloKernel.size(); + } + + YoloLayerPlugin::~YoloLayerPlugin() + { + } + + // create the plugin at runtime from a byte stream + YoloLayerPlugin::YoloLayerPlugin(const void* data, size_t length) + { + using namespace Tn; + const char *d = reinterpret_cast<const char*>(data), *a = d; + read(d, mClassCount); + read(d, mThreadCount); + read(d, mKernelCount); + mYoloKernel.resize(mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(mYoloKernel.data(),d,kernelSize); + d += kernelSize; + + assert(d 
== a + length); + } + + void YoloLayerPlugin::serialize(void* buffer) const + { + using namespace Tn; + char* d = static_cast(buffer), *a = d; + write(d, mClassCount); + write(d, mThreadCount); + write(d, mKernelCount); + auto kernelSize = mKernelCount*sizeof(YoloKernel); + memcpy(d,mYoloKernel.data(),kernelSize); + d += kernelSize; + + assert(d == a + getSerializationSize()); + } + + size_t YoloLayerPlugin::getSerializationSize() const + { + return sizeof(mClassCount) + sizeof(mThreadCount) + sizeof(mKernelCount) + sizeof(Yolo::YoloKernel) * mYoloKernel.size(); + } + + int YoloLayerPlugin::initialize() + { + return 0; + } + + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + { + //output the result to channel + int totalsize = MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float); + + return Dims3(totalsize + 1, 1, 1); + } + + // Set plugin namespace + void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) + { + mPluginNamespace = pluginNamespace; + } + + const char* YoloLayerPlugin::getPluginNamespace() const + { + return mPluginNamespace; + } + + // Return the DataType of the plugin output at the requested index + DataType YoloLayerPlugin::getOutputDataType(int index, const nvinfer1::DataType* inputTypes, int nbInputs) const + { + return DataType::kFLOAT; + } + + // Return true if output tensor is broadcast across a batch. + bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const + { + return false; + } + + // Return true if plugin can use input that is broadcast across batch without replication. + bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const + { + return false; + } + + void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) + { + } + + // Attach the plugin object to an execution context and grant the plugin the access to some context resource. + void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) + { + } + + // Detach the plugin object from its execution context. 
+  void YoloLayerPlugin::detachFromContext() {}
+
+  const char* YoloLayerPlugin::getPluginType() const
+  {
+    return "YoloLayer_TRT";
+  }
+
+  const char* YoloLayerPlugin::getPluginVersion() const
+  {
+    return "1";
+  }
+
+  void YoloLayerPlugin::destroy()
+  {
+    delete this;
+  }
+
+  // Clone the plugin
+  IPluginV2IOExt* YoloLayerPlugin::clone() const
+  {
+    YoloLayerPlugin *p = new YoloLayerPlugin(mClassCount);
+    p->setPluginNamespace(mPluginNamespace);
+    return p;
+  }
+
+  __device__ float Logist(float data) { return 1.0f / (1.0f + expf(-data)); }
+
+  __global__ void CalDetection(const float *input, float *output, int noElements,
+      int yoloWidth, int yoloHeight, const float anchors[CHECK_COUNT*2], int classes, int outputElem) {
+
+    int idx = threadIdx.x + blockDim.x * blockIdx.x;
+    if (idx >= noElements) return;
+
+    int total_grid = yoloWidth * yoloHeight;
+    int bnIdx = idx / total_grid;
+    idx = idx - total_grid * bnIdx;
+    int info_len_i = 5 + classes;
+    const float* curInput = input + bnIdx * (info_len_i * total_grid * CHECK_COUNT);
+
+    for (int k = 0; k < CHECK_COUNT; ++k) {
+      int class_id = 0;
+      float max_cls_prob = 0.0;
+      for (int i = 5; i < info_len_i; ++i) {
+        float p = Logist(curInput[idx + k * info_len_i * total_grid + i * total_grid]);
+        if (p > max_cls_prob) {
+          max_cls_prob = p;
+          class_id = i - 5;
+        }
+      }
+      float box_prob = Logist(curInput[idx + k * info_len_i * total_grid + 4 * total_grid]);
+      if (max_cls_prob < IGNORE_THRESH || box_prob < IGNORE_THRESH) continue;
+
+      float *res_count = output + bnIdx * outputElem;
+      int count = (int)atomicAdd(res_count, 1);
+      if (count >= MAX_OUTPUT_BBOX_COUNT) return;
+      char* data = (char*)res_count + sizeof(float) + count * sizeof(Detection);
+      Detection* det = (Detection*)(data);
+
+      int row = idx / yoloWidth;
+      int col = idx % yoloWidth;
+
+      // Location
+      det->bbox[0] = (col + Logist(curInput[idx + k * info_len_i * total_grid + 0 * total_grid])) * INPUT_W / yoloWidth;
+      det->bbox[1] = (row + Logist(curInput[idx + k * info_len_i * total_grid + 1 * total_grid])) * INPUT_H / yoloHeight;
+      det->bbox[2] = expf(curInput[idx + k * info_len_i * total_grid + 2 * total_grid]) * anchors[2*k];
+      det->bbox[3] = expf(curInput[idx + k * info_len_i * total_grid + 3 * total_grid]) * anchors[2*k + 1];
+      det->det_confidence = box_prob;
+      det->class_id = class_id;
+      det->class_confidence = max_cls_prob;
+    }
+  }
+
+  void YoloLayerPlugin::forwardGpu(const float *const * inputs, float* output, cudaStream_t stream, int batchSize) {
+    void* devAnchor;
+    size_t AnchorLen = sizeof(float) * CHECK_COUNT * 2;
+    CUDA_CHECK(cudaMalloc(&devAnchor, AnchorLen));
+
+    int outputElem = 1 + MAX_OUTPUT_BBOX_COUNT * sizeof(Detection) / sizeof(float);
+
+    for (int idx = 0; idx < batchSize; ++idx) {
+      CUDA_CHECK(cudaMemset(output + idx * outputElem, 0, sizeof(float)));
+    }
+    int numElem = 0;
+    for (unsigned int i = 0; i < mYoloKernel.size(); ++i)
+    {
+      const auto& yolo = mYoloKernel[i];
+      numElem = yolo.width * yolo.height * batchSize;
+      if (numElem < mThreadCount)
+        mThreadCount = numElem;
+      CUDA_CHECK(cudaMemcpy(devAnchor, yolo.anchors, AnchorLen, cudaMemcpyHostToDevice));
+      CalDetection<<< (yolo.width*yolo.height*batchSize + mThreadCount - 1) / mThreadCount, mThreadCount>>>
+        (inputs[i], output, numElem, yolo.width, yolo.height, (float *)devAnchor, mClassCount, outputElem);
+    }
+
+    CUDA_CHECK(cudaFree(devAnchor));
+  }
+
+
+  int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream)
+  {
+    //assert(batchSize == 1);
+    //GPU
+    //CUDA_CHECK(cudaStreamSynchronize(stream));
+    forwardGpu((const float *const *)inputs, (float*)outputs[0], stream, batchSize);
+
+    return 0;
+  }
+
+  PluginFieldCollection YoloPluginCreator::mFC{};
+  std::vector<PluginField> YoloPluginCreator::mPluginAttributes;
+
+  YoloPluginCreator::YoloPluginCreator()
+  {
+    mPluginAttributes.clear();
+    mPluginAttributes.emplace_back(PluginField("numClasses", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("yoloWidth", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("yoloHeight", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("inputWidth", nullptr, PluginFieldType::kINT32, 1));
+    mPluginAttributes.emplace_back(PluginField("inputHeight", nullptr, PluginFieldType::kINT32, 1));
+
+    mFC.nbFields = mPluginAttributes.size();
+    mFC.fields = mPluginAttributes.data();
+  }
+
+  const char* YoloPluginCreator::getPluginName() const
+  {
+    return "YoloLayer_TRT";
+  }
+
+  const char* YoloPluginCreator::getPluginVersion() const
+  {
+    return "1";
+  }
+
+  const PluginFieldCollection* YoloPluginCreator::getFieldNames()
+  {
+    return &mFC;
+  }
+
+  IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc)
+  {
+    //assert(!strcmp(name, getPluginName()));
+    const PluginField* fields = fc->fields;
+    // default to the compile-time class count; a field may carry null data when the
+    // creator's own field list is passed straight back in (as yolov3-spp.cpp does)
+    int yolo_width = 0, yolo_height = 0, num_anchors = 0;
+    int num_classes = Yolo::CLASS_NUM;
+    int input_width = 0, input_height = 0;
+
+    for (int i = 0; i < fc->nbFields; ++i)
+    {
+      const char* attrName = fields[i].name;
+      if (fields[i].data == nullptr)
+        continue;
+      if (!strcmp(attrName, "yoloWidth"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        yolo_width = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "yoloHeight"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        yolo_height = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "numAnchors"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        num_anchors = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "numClasses"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        num_classes = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "inputWidth"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        input_width = *(static_cast<const int*>(fields[i].data));
+      }
+      else if (!strcmp(attrName, "inputHeight"))
+      {
+        assert(fields[i].type == PluginFieldType::kINT32);
+        input_height = *(static_cast<const int*>(fields[i].data));
+      }
+    }
+    //assert(yolo_width > 0 && yolo_height > 0);
+    //assert(num_classes > 0);
+    //assert(input_width > 0 && input_height > 0);
+    YoloLayerPlugin* obj = new YoloLayerPlugin(num_classes);
+    obj->setPluginNamespace(mNamespace.c_str());
+    return obj;
+  }
+
+  IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength)
+  {
+    // This object will be deleted when the network is destroyed, which will
+    // call YoloLayerPlugin::destroy()
+    YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength);
+    obj->setPluginNamespace(mNamespace.c_str());
+    return obj;
+  }
+
+}
diff --git a/uw_detection/src/yolov3/yolov3-spp.cpp b/uw_detection/src/yolov3/yolov3-spp.cpp
new file mode 100644
index 0000000..24e3aad
--- /dev/null
+++ b/uw_detection/src/yolov3/yolov3-spp.cpp
@@ -0,0 +1,572 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+#include "yololayer.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+
+#define USE_FP16  // comment this out to run inference in FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.4
+#define BBOX_CONF_THRESH 0.5
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int OUTPUT_SIZE = 1000 * 7 + 1;  // we assume the yolo layer outputs at most 1000 boxes with conf >= 0.1
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// letterbox the image into INPUT_W x INPUT_H, padding with gray (128)
+cv::Mat preprocess_img(cv::Mat& img) {
+    int w, h, x, y;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        w = INPUT_W;
+        h = r_w * img.rows;
+        x = 0;
+        y = (INPUT_H - h) / 2;
+    } else {
+        w = r_h * img.cols;
+        h = INPUT_H;
+        x = (INPUT_W - w) / 2;
+        y = 0;
+    }
+    cv::Mat re(h, w, CV_8UC3);
+    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
+    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+    return out;
+}
+
+// map a bbox in network coordinates back onto the original image
+cv::Rect get_rect(const cv::Mat& img, float bbox[4]) {
+    int l, r, t, b;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        l = bbox[0] - bbox[2]/2.f;
+        r = bbox[0] + bbox[2]/2.f;
+        t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        l = l / r_w;
+        r = r / r_w;
+        t = t / r_w;
+        b = b / r_w;
+    } else {
+        l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        t = bbox[1] - bbox[3]/2.f;
+        b = bbox[1] + bbox[3]/2.f;
+        l = l / r_h;
+        r = r / r_h;
+        t = t / r_h;
+        b = b / r_h;
+    }
+    return cv::Rect(l, t, r-l, b-t);
+}
+
+float iou(float lbox[4], float rbox[4]) {
+    float interBox[] = {
+        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
+        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
+        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
+        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
+    };
+
+    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+        return 0.0f;
+
+    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
+    return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+}
+
+bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
+    return a.det_confidence > b.det_confidence;
+}
+
+// greedy per-class NMS over the raw yolo layer output
+void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
+    std::map<float, std::vector<Yolo::Detection>> m;
+    for (int i = 0; i < output[0] && i < 1000; i++) {
+        if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
+        Yolo::Detection det;
+        memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
+        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
+        m[det.class_id].push_back(det);
+    }
+    for (auto it = m.begin(); it != m.end(); it++) {
+        //std::cout << it->second[0].class_id << " --- " << std::endl;
+        auto& dets = it->second;
+        std::sort(dets.begin(), dets.end(), cmp);
+        for (size_t i = 0; i < dets.size(); ++i) {
+            auto& item = dets[i];
+            res.push_back(item);
+            for (size_t n = i + 1; n < dets.size(); ++n) {
+                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
+                    dets.erase(dets.begin() + n);
+                    --n;
+                }
+            }
+        }
+    }
+}
+
+// TensorRT weight files have a simple space-delimited format:
+// a count line, then per blob: [name] [element count] [hex-encoded float words]
+std::map<std::string, Weights> loadWeights(const std::string& file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and size of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + ".weight"].values;
+    float *beta = (float*)weightMap[lname + ".bias"].values;
+    float *mean = (float*)weightMap[lname + ".running_mean"].values;
+    float *var = (float*)weightMap[lname + ".running_var"].values;
+    int len = weightMap[lname + ".running_var"].count;
+
+    // fold batch norm into a per-channel scale layer: y = (x - mean) / sqrt(var + eps) * gamma + beta
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+
+    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5);
+
+    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
+    lr->setAlpha(0.1);
+
+    return lr;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& weights_path) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights(weights_path);
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    // Expand the complete darknet architecture explicitly, layer by layer.
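+    // Each convBnLeaky(network, weightMap, input, out_channels, kernel, stride, pad, idx)
+    // call below emits Conv2d -> BatchNorm2d -> LeakyReLU(0.1) for module_list.<idx> of the
+    // darknet .cfg, and each ew<N> is the residual shortcut addition of a [shortcut] block.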
+ auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); + auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); + auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); + auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); + auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); + auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); + auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); + auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); + auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); + auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); + auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); + auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); + auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); + auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); + auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); + auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); + auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); + auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); + auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); + auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); + auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); + auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); + auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22); + auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); + auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); + auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); + auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); + auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); + auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); + auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); + auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); + auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); + auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); + auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); + auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); + auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); + auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); + auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); + auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 
38); + auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); + auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); + auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); + auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); + auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); + auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44); + auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); + auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); + auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); + auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); + auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); + auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); + auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); + auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); + auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); + auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); + auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); + auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); + auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); + auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); + auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); + auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); + auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); + auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); + auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); + auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); + auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); + auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); + auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); + auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); + auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); + auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); + auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); + auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); + auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); + auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); + auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); + auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); 
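+    // SPP block: three parallel max-pool branches over lr77 (5x5, 9x9 and 13x13 kernels,
+    // stride 1 with matching padding so the spatial size is preserved), concatenated with
+    // lr77 itself before the detection head.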
+    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);
+
+    auto pool78 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{5, 5});
+    pool78->setPaddingNd(DimsHW{2, 2});
+    pool78->setStrideNd(DimsHW{1, 1});
+    auto pool80 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{9, 9});
+    pool80->setPaddingNd(DimsHW{4, 4});
+    pool80->setStrideNd(DimsHW{1, 1});
+    auto pool82 = network->addPoolingNd(*lr77->getOutput(0), PoolingType::kMAX, DimsHW{13, 13});
+    pool82->setPaddingNd(DimsHW{6, 6});
+    pool82->setStrideNd(DimsHW{1, 1});
+
+    ITensor* inputTensors83[] = {pool82->getOutput(0), pool80->getOutput(0), pool78->getOutput(0), lr77->getOutput(0)};
+    auto cat83 = network->addConcatenation(inputTensors83, 4);
+
+    auto lr84 = convBnLeaky(network, weightMap, *cat83->getOutput(0), 512, 1, 1, 0, 84);
+    auto lr85 = convBnLeaky(network, weightMap, *lr84->getOutput(0), 1024, 3, 1, 1, 85);
+    auto lr86 = convBnLeaky(network, weightMap, *lr85->getOutput(0), 512, 1, 1, 0, 86);
+    auto lr87 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 1024, 3, 1, 1, 87);
+    IConvolutionLayer* conv88 = network->addConvolutionNd(*lr87->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.88.Conv2d.weight"], weightMap["module_list.88.Conv2d.bias"]);
+    assert(conv88);
+    auto lr91 = convBnLeaky(network, weightMap, *lr86->getOutput(0), 256, 1, 1, 0, 91);
+
+    // constant 1.0 weights + grouped 2x2/stride-2 deconvolution = 2x nearest-neighbour upsampling
+    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
+    for (int i = 0; i < 256 * 2 * 2; i++) {
+        deval[i] = 1.0;
+    }
+    Weights deconvwts92{DataType::kFLOAT, deval, 256 * 2 * 2};
+    IDeconvolutionLayer* deconv92 = network->addDeconvolutionNd(*lr91->getOutput(0), 256, DimsHW{2, 2}, deconvwts92, emptywts);
+    assert(deconv92);
+    deconv92->setStrideNd(DimsHW{2, 2});
+    deconv92->setNbGroups(256);
+    weightMap["deconv92"] = deconvwts92;
+
+    ITensor* inputTensors[] = {deconv92->getOutput(0), ew61->getOutput(0)};
+    auto cat93 = network->addConcatenation(inputTensors, 2);
+    auto lr94 = convBnLeaky(network, weightMap, *cat93->getOutput(0), 256, 1, 1, 0, 94);
+    auto lr95 = convBnLeaky(network, weightMap, *lr94->getOutput(0), 512, 3, 1, 1, 95);
+    auto lr96 = convBnLeaky(network, weightMap, *lr95->getOutput(0), 256, 1, 1, 0, 96);
+    auto lr97 = convBnLeaky(network, weightMap, *lr96->getOutput(0), 512, 3, 1, 1, 97);
+    auto lr98 = convBnLeaky(network, weightMap, *lr97->getOutput(0), 256, 1, 1, 0, 98);
+    auto lr99 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 512, 3, 1, 1, 99);
+    IConvolutionLayer* conv100 = network->addConvolutionNd(*lr99->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.100.Conv2d.weight"], weightMap["module_list.100.Conv2d.bias"]);
+    assert(conv100);
+    auto lr103 = convBnLeaky(network, weightMap, *lr98->getOutput(0), 128, 1, 1, 0, 103);
+    // reuse deval; only the first 128 * 2 * 2 entries are read for this layer
+    Weights deconvwts104{DataType::kFLOAT, deval, 128 * 2 * 2};
+    IDeconvolutionLayer* deconv104 = network->addDeconvolutionNd(*lr103->getOutput(0), 128, DimsHW{2, 2}, deconvwts104, emptywts);
+    assert(deconv104);
+    deconv104->setStrideNd(DimsHW{2, 2});
+    deconv104->setNbGroups(128);
+    ITensor* inputTensors1[] = {deconv104->getOutput(0), ew36->getOutput(0)};
+    auto cat105 = network->addConcatenation(inputTensors1, 2);
+    auto lr106 = convBnLeaky(network, weightMap, *cat105->getOutput(0), 128, 1, 1, 0, 106);
+    auto lr107 = convBnLeaky(network, weightMap, *lr106->getOutput(0), 256, 3, 1, 1, 107);
+    auto lr108 = convBnLeaky(network, weightMap, *lr107->getOutput(0), 128, 1, 1, 0, 108);
+    auto lr109 =
convBnLeaky(network, weightMap, *lr108->getOutput(0), 256, 3, 1, 1, 109); + auto lr110 = convBnLeaky(network, weightMap, *lr109->getOutput(0), 128, 1, 1, 0, 110); + auto lr111 = convBnLeaky(network, weightMap, *lr110->getOutput(0), 256, 3, 1, 1, 111); + IConvolutionLayer* conv112 = network->addConvolutionNd(*lr111->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.112.Conv2d.weight"], weightMap["module_list.112.Conv2d.bias"]); + assert(conv112); + + auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1"); + const PluginFieldCollection* pluginData = creator->getFieldNames(); + IPluginV2 *pluginObj = creator->createPlugin("yololayer", pluginData); + ITensor* inputTensors_yolo[] = {conv88->getOutput(0), conv100->getOutput(0), conv112->getOutput(0)}; + auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); + + yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); + network->markOutput(*yolo->getOutput(0)); + + // Build engine + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB +#ifdef USE_FP16 + config->setFlag(BuilderFlag::kFP16); +#endif + std::cout << "Building engine, please wait for a while..." << std::endl; + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Don't need the network any more + network->destroy(); + + // Release host memory + for (auto& mem : weightMap) + { + free((void*) (mem.second.values)); + } + + return engine; +} + +void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path) { + // Create builder + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + + // Create model to populate the network, then set the outputs and create an engine + ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, weights_path); + assert(engine != nullptr); + + // Serialize the engine + (*modelStream) = engine->serialize(); + + // Close everything down + engine->destroy(); + builder->destroy(); +} + +void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { + const ICudaEngine& engine = context.getEngine(); + + // Pointers to input and output device buffers to pass to engine. + // Engine requires exactly IEngine::getNbBindings() number of buffers. + assert(engine.getNbBindings() == 2); + void* buffers[2]; + + // In order to bind the buffers, we need to know the names of the input and output tensors. 
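+    // getBindingIndex() maps each tensor name to its binding slot, so the buffers array
+    // can be filled in whatever order the engine assigned at build time.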
+    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
+    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
+    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
+
+    // Create GPU buffers on device
+    CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
+    CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
+
+    // Create stream
+    cudaStream_t stream;
+    CHECK(cudaStreamCreate(&stream));
+
+    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
+    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
+    context.enqueue(batchSize, buffers, stream, nullptr);
+    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
+    CHECK(cudaStreamSynchronize(stream));
+
+    // Release stream and buffers
+    cudaStreamDestroy(stream);
+    CHECK(cudaFree(buffers[inputIndex]));
+    CHECK(cudaFree(buffers[outputIndex]));
+}
+
+int read_files_in_dir(const char *p_dir_name, std::vector<std::string> &file_names) {
+    DIR *p_dir = opendir(p_dir_name);
+    if (p_dir == nullptr) {
+        return -1;
+    }
+
+    struct dirent* p_file = nullptr;
+    while ((p_file = readdir(p_dir)) != nullptr) {
+        if (strcmp(p_file->d_name, ".") != 0 &&
+            strcmp(p_file->d_name, "..") != 0) {
+            //std::string cur_file_name(p_dir_name);
+            //cur_file_name += "/";
+            //cur_file_name += p_file->d_name;
+            std::string cur_file_name(p_file->d_name);
+            file_names.push_back(cur_file_name);
+        }
+    }
+
+    closedir(p_dir);
+    return 0;
+}
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // create a model using the API directly and serialize it to a stream
+    char *trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(1, &modelStream, "yolov3-spp_ultralytics68.wts");
+        assert(modelStream != nullptr);
+        std::ofstream p("yolov3-spp.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 3 && std::string(argv[1]) == "-d") {
+        std::ifstream file("yolov3-spp.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "invalid arguments!" << std::endl;
+        std::cerr << "./yolov3-spp -s             // serialize model to plan file" << std::endl;
+        std::cerr << "./yolov3-spp -d ../samples  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    std::vector<std::string> file_names;
+    if (read_files_in_dir(argv[2], file_names) < 0) {
+        std::cout << "read_files_in_dir failed." << std::endl;
+        return -1;
+    }
+
+    // prepare input data ---------------------------
+    static float data[3 * INPUT_H * INPUT_W];  // static: too large for the stack
+    //for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
+    //    data[i] = 1.0;
+    static float prob[OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    int fcount = 0;
+    for (auto f: file_names) {
+        fcount++;
+        std::cout << fcount << " " << f << std::endl;
+        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
+        if (img.empty()) continue;
+        cv::Mat pr_img = preprocess_img(img);
+        // HWC BGR uint8 -> CHW RGB float in [0, 1]
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
+            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
+            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+        }
+
+        // Run inference
+        auto start = std::chrono::system_clock::now();
+        doInference(*context, data, prob, 1);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+        std::vector<Yolo::Detection> res;
+        nms(res, prob);
+        // debug: dump the first few raw output values
+        for (int i = 0; i < 20; i++) {
+            std::cout << prob[i] << ",";
+        }
+        std::cout << res.size() << std::endl;
+        for (size_t j = 0; j < res.size(); j++) {
+            float *p = (float*)&res[j];
+            for (size_t k = 0; k < 7; k++) {
+                std::cout << p[k] << ", ";
+            }
+            std::cout << std::endl;
+            cv::Rect r = get_rect(img, res[j].bbox);
+            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+        }
+        cv::imwrite("_" + f, img);
+    }
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+    return 0;
+}
diff --git a/uw_detection/src/yolov3/yolov3.cpp b/uw_detection/src/yolov3/yolov3.cpp
new file mode 100644
index 0000000..13e63b8
--- /dev/null
+++ b/uw_detection/src/yolov3/yolov3.cpp
@@ -0,0 +1,567 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <vector>
+#include <chrono>
+#include <opencv2/opencv.hpp>
+#include <dirent.h>
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include "logging.h"
+#include "yololayer.h"
+
+#define CHECK(status) \
+    do\
+    {\
+        auto ret = (status);\
+        if (ret != 0)\
+        {\
+            std::cerr << "Cuda failure: " << ret << std::endl;\
+            abort();\
+        }\
+    } while (0)
+
+
+#define USE_FP16  // comment this out to run inference in FP32
+#define DEVICE 0  // GPU id
+#define NMS_THRESH 0.4
+#define BBOX_CONF_THRESH 0.5
+#define CLASS_NUM 21
+
+using namespace nvinfer1;
+
+// stuff we know about the network and the input/output blobs
+static const int INPUT_H = Yolo::INPUT_H;
+static const int INPUT_W = Yolo::INPUT_W;
+static const int OUTPUT_SIZE = 1000 * 7 + 1;  // we assume the yolo layer outputs at most 1000 boxes with conf >= 0.1
+const char* INPUT_BLOB_NAME = "data";
+const char* OUTPUT_BLOB_NAME = "prob";
+static Logger gLogger;
+
+// letterbox the image into INPUT_W x INPUT_H, padding with gray (128)
+cv::Mat preprocess_img(cv::Mat& img) {
+    int w, h, x, y;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        w = INPUT_W;
+        h = r_w * img.rows;
+        x = 0;
+        y = (INPUT_H - h) / 2;
+    } else {
+        w = r_h * img.cols;
+        h = INPUT_H;
+        x = (INPUT_W - w) / 2;
+        y = 0;
+    }
+    cv::Mat re(h, w, CV_8UC3);
+    cv::resize(img, re, re.size(), 0, 0, cv::INTER_CUBIC);
+    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(128, 128, 128));
+    re.copyTo(out(cv::Rect(x, y, re.cols, re.rows)));
+    return out;
+}
+
+// map a bbox in network coordinates back onto the original image
+cv::Rect get_rect(const cv::Mat& img, float bbox[4]) {
+    int l, r, t, b;
+    float r_w = INPUT_W / (img.cols * 1.0);
+    float r_h = INPUT_H / (img.rows * 1.0);
+    if (r_h > r_w) {
+        l = bbox[0] - bbox[2]/2.f;
+        r = bbox[0] + bbox[2]/2.f;
+        t = bbox[1] - bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        b = bbox[1] + bbox[3]/2.f - (INPUT_H - r_w * img.rows) / 2;
+        l = l / r_w;
+        r = r / r_w;
+        t = t / r_w;
+        b = b / r_w;
+    } else {
+        l = bbox[0] - bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        r = bbox[0] + bbox[2]/2.f - (INPUT_W - r_h * img.cols) / 2;
+        t = bbox[1] - bbox[3]/2.f;
+        b = bbox[1] + bbox[3]/2.f;
+        l = l / r_h;
+        r = r / r_h;
+        t = t / r_h;
+        b = b / r_h;
+    }
+    return cv::Rect(l, t, r-l, b-t);
+}
+
+float iou(float lbox[4], float rbox[4]) {
+    float interBox[] = {
+        std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
+        std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
+        std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
+        std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
+    };
+
+    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
+        return 0.0f;
+
+    float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
+    return interBoxS / (lbox[2] * lbox[3] + rbox[2] * rbox[3] - interBoxS);
+}
+
+bool cmp(Yolo::Detection& a, Yolo::Detection& b) {
+    return a.det_confidence > b.det_confidence;
+}
+
+// greedy per-class NMS over the raw yolo layer output
+void nms(std::vector<Yolo::Detection>& res, float *output, float nms_thresh = NMS_THRESH) {
+    std::map<float, std::vector<Yolo::Detection>> m;
+    for (int i = 0; i < output[0] && i < 1000; i++) {
+        if (output[1 + 7 * i + 4] <= BBOX_CONF_THRESH) continue;
+        Yolo::Detection det;
+        memcpy(&det, &output[1 + 7 * i], 7 * sizeof(float));
+        if (m.count(det.class_id) == 0) m.emplace(det.class_id, std::vector<Yolo::Detection>());
+        m[det.class_id].push_back(det);
+    }
+    for (auto it = m.begin(); it != m.end(); it++) {
+        //std::cout << it->second[0].class_id << " --- " << std::endl;
+        auto& dets = it->second;
+        std::sort(dets.begin(), dets.end(), cmp);
+        for (size_t i = 0; i < dets.size(); ++i) {
+            auto& item = dets[i];
+            res.push_back(item);
+            for (size_t n = i + 1; n < dets.size(); ++n) {
+                if (iou(item.bbox, dets[n].bbox) > nms_thresh) {
+                    dets.erase(dets.begin() + n);
+                    --n;
+                }
+            }
+        }
+    }
+}
+
+// TensorRT weight files have a simple space-delimited format:
+// a count line, then per blob: [name] [element count] [hex-encoded float words]
+std::map<std::string, Weights> loadWeights(const std::string& file) {
+    std::cout << "Loading weights: " << file << std::endl;
+    std::map<std::string, Weights> weightMap;
+
+    // Open weights file
+    std::ifstream input(file);
+    assert(input.is_open() && "Unable to load weight file.");
+
+    // Read number of weight blobs
+    int32_t count;
+    input >> count;
+    assert(count > 0 && "Invalid weight map file.");
+
+    while (count--)
+    {
+        Weights wt{DataType::kFLOAT, nullptr, 0};
+        uint32_t size;
+
+        // Read name and size of blob
+        std::string name;
+        input >> name >> std::dec >> size;
+        wt.type = DataType::kFLOAT;
+
+        // Load blob
+        uint32_t* val = reinterpret_cast<uint32_t*>(malloc(sizeof(uint32_t) * size));
+        for (uint32_t x = 0, y = size; x < y; ++x)
+        {
+            input >> std::hex >> val[x];
+        }
+        wt.values = val;
+
+        wt.count = size;
+        weightMap[name] = wt;
+    }
+
+    return weightMap;
+}
+
+IScaleLayer* addBatchNorm2d(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, std::string lname, float eps) {
+    float *gamma = (float*)weightMap[lname + ".weight"].values;
+    float *beta = (float*)weightMap[lname + ".bias"].values;
+    float *mean = (float*)weightMap[lname + ".running_mean"].values;
+    float *var = (float*)weightMap[lname + ".running_var"].values;
+    int len = weightMap[lname + ".running_var"].count;
+
+    // fold batch norm into a per-channel scale layer: y = (x - mean) / sqrt(var + eps) * gamma + beta
+    float *scval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        scval[i] = gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights scale{DataType::kFLOAT, scval, len};
+
+    float *shval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        shval[i] = beta[i] - mean[i] * gamma[i] / sqrt(var[i] + eps);
+    }
+    Weights shift{DataType::kFLOAT, shval, len};
+
+    float *pval = reinterpret_cast<float*>(malloc(sizeof(float) * len));
+    for (int i = 0; i < len; i++) {
+        pval[i] = 1.0;
+    }
+    Weights power{DataType::kFLOAT, pval, len};
+
+    weightMap[lname + ".scale"] = scale;
+    weightMap[lname + ".shift"] = shift;
+    weightMap[lname + ".power"] = power;
+    IScaleLayer* scale_1 = network->addScale(input, ScaleMode::kCHANNEL, shift, scale, power);
+    assert(scale_1);
+    return scale_1;
+}
+
+ILayer* convBnLeaky(INetworkDefinition *network, std::map<std::string, Weights>& weightMap, ITensor& input, int outch, int ksize, int s, int p, int linx) {
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+    IConvolutionLayer* conv1 = network->addConvolutionNd(input, outch, DimsHW{ksize, ksize}, weightMap["module_list." + std::to_string(linx) + ".Conv2d.weight"], emptywts);
+    assert(conv1);
+    conv1->setStrideNd(DimsHW{s, s});
+    conv1->setPaddingNd(DimsHW{p, p});
+
+    IScaleLayer* bn1 = addBatchNorm2d(network, weightMap, *conv1->getOutput(0), "module_list." + std::to_string(linx) + ".BatchNorm2d", 1e-5);
+
+    auto lr = network->addActivation(*bn1->getOutput(0), ActivationType::kLEAKY_RELU);
+    lr->setAlpha(0.1);
+
+    return lr;
+}
+
+// Create the engine using only the API and not any parser.
+ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt, const std::string& weights_path) {
+    INetworkDefinition* network = builder->createNetworkV2(0U);
+
+    // Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
+    ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{3, INPUT_H, INPUT_W});
+    assert(data);
+
+    std::map<std::string, Weights> weightMap = loadWeights(weights_path);
+    Weights emptywts{DataType::kFLOAT, nullptr, 0};
+
+    // Expand the complete darknet architecture explicitly, layer by layer.
+ auto lr0 = convBnLeaky(network, weightMap, *data, 32, 3, 1, 1, 0); + auto lr1 = convBnLeaky(network, weightMap, *lr0->getOutput(0), 64, 3, 2, 1, 1); + auto lr2 = convBnLeaky(network, weightMap, *lr1->getOutput(0), 32, 1, 1, 0, 2); + auto lr3 = convBnLeaky(network, weightMap, *lr2->getOutput(0), 64, 3, 1, 1, 3); + auto ew4 = network->addElementWise(*lr3->getOutput(0), *lr1->getOutput(0), ElementWiseOperation::kSUM); + auto lr5 = convBnLeaky(network, weightMap, *ew4->getOutput(0), 128, 3, 2, 1, 5); + auto lr6 = convBnLeaky(network, weightMap, *lr5->getOutput(0), 64, 1, 1, 0, 6); + auto lr7 = convBnLeaky(network, weightMap, *lr6->getOutput(0), 128, 3, 1, 1, 7); + auto ew8 = network->addElementWise(*lr7->getOutput(0), *lr5->getOutput(0), ElementWiseOperation::kSUM); + auto lr9 = convBnLeaky(network, weightMap, *ew8->getOutput(0), 64, 1, 1, 0, 9); + auto lr10 = convBnLeaky(network, weightMap, *lr9->getOutput(0), 128, 3, 1, 1, 10); + auto ew11 = network->addElementWise(*lr10->getOutput(0), *ew8->getOutput(0), ElementWiseOperation::kSUM); + auto lr12 = convBnLeaky(network, weightMap, *ew11->getOutput(0), 256, 3, 2, 1, 12); + auto lr13 = convBnLeaky(network, weightMap, *lr12->getOutput(0), 128, 1, 1, 0, 13); + auto lr14 = convBnLeaky(network, weightMap, *lr13->getOutput(0), 256, 3, 1, 1, 14); + auto ew15 = network->addElementWise(*lr14->getOutput(0), *lr12->getOutput(0), ElementWiseOperation::kSUM); + auto lr16 = convBnLeaky(network, weightMap, *ew15->getOutput(0), 128, 1, 1, 0, 16); + auto lr17 = convBnLeaky(network, weightMap, *lr16->getOutput(0), 256, 3, 1, 1, 17); + auto ew18 = network->addElementWise(*lr17->getOutput(0), *ew15->getOutput(0), ElementWiseOperation::kSUM); + auto lr19 = convBnLeaky(network, weightMap, *ew18->getOutput(0), 128, 1, 1, 0, 19); + auto lr20 = convBnLeaky(network, weightMap, *lr19->getOutput(0), 256, 3, 1, 1, 20); + auto ew21 = network->addElementWise(*lr20->getOutput(0), *ew18->getOutput(0), ElementWiseOperation::kSUM); + auto lr22 = convBnLeaky(network, weightMap, *ew21->getOutput(0), 128, 1, 1, 0, 22); + auto lr23 = convBnLeaky(network, weightMap, *lr22->getOutput(0), 256, 3, 1, 1, 23); + auto ew24 = network->addElementWise(*lr23->getOutput(0), *ew21->getOutput(0), ElementWiseOperation::kSUM); + auto lr25 = convBnLeaky(network, weightMap, *ew24->getOutput(0), 128, 1, 1, 0, 25); + auto lr26 = convBnLeaky(network, weightMap, *lr25->getOutput(0), 256, 3, 1, 1, 26); + auto ew27 = network->addElementWise(*lr26->getOutput(0), *ew24->getOutput(0), ElementWiseOperation::kSUM); + auto lr28 = convBnLeaky(network, weightMap, *ew27->getOutput(0), 128, 1, 1, 0, 28); + auto lr29 = convBnLeaky(network, weightMap, *lr28->getOutput(0), 256, 3, 1, 1, 29); + auto ew30 = network->addElementWise(*lr29->getOutput(0), *ew27->getOutput(0), ElementWiseOperation::kSUM); + auto lr31 = convBnLeaky(network, weightMap, *ew30->getOutput(0), 128, 1, 1, 0, 31); + auto lr32 = convBnLeaky(network, weightMap, *lr31->getOutput(0), 256, 3, 1, 1, 32); + auto ew33 = network->addElementWise(*lr32->getOutput(0), *ew30->getOutput(0), ElementWiseOperation::kSUM); + auto lr34 = convBnLeaky(network, weightMap, *ew33->getOutput(0), 128, 1, 1, 0, 34); + auto lr35 = convBnLeaky(network, weightMap, *lr34->getOutput(0), 256, 3, 1, 1, 35); + auto ew36 = network->addElementWise(*lr35->getOutput(0), *ew33->getOutput(0), ElementWiseOperation::kSUM); + auto lr37 = convBnLeaky(network, weightMap, *ew36->getOutput(0), 512, 3, 2, 1, 37); + auto lr38 = convBnLeaky(network, weightMap, *lr37->getOutput(0), 256, 1, 1, 0, 
38); + auto lr39 = convBnLeaky(network, weightMap, *lr38->getOutput(0), 512, 3, 1, 1, 39); + auto ew40 = network->addElementWise(*lr39->getOutput(0), *lr37->getOutput(0), ElementWiseOperation::kSUM); + auto lr41 = convBnLeaky(network, weightMap, *ew40->getOutput(0), 256, 1, 1, 0, 41); + auto lr42 = convBnLeaky(network, weightMap, *lr41->getOutput(0), 512, 3, 1, 1, 42); + auto ew43 = network->addElementWise(*lr42->getOutput(0), *ew40->getOutput(0), ElementWiseOperation::kSUM); + auto lr44 = convBnLeaky(network, weightMap, *ew43->getOutput(0), 256, 1, 1, 0, 44); + auto lr45 = convBnLeaky(network, weightMap, *lr44->getOutput(0), 512, 3, 1, 1, 45); + auto ew46 = network->addElementWise(*lr45->getOutput(0), *ew43->getOutput(0), ElementWiseOperation::kSUM); + auto lr47 = convBnLeaky(network, weightMap, *ew46->getOutput(0), 256, 1, 1, 0, 47); + auto lr48 = convBnLeaky(network, weightMap, *lr47->getOutput(0), 512, 3, 1, 1, 48); + auto ew49 = network->addElementWise(*lr48->getOutput(0), *ew46->getOutput(0), ElementWiseOperation::kSUM); + auto lr50 = convBnLeaky(network, weightMap, *ew49->getOutput(0), 256, 1, 1, 0, 50); + auto lr51 = convBnLeaky(network, weightMap, *lr50->getOutput(0), 512, 3, 1, 1, 51); + auto ew52 = network->addElementWise(*lr51->getOutput(0), *ew49->getOutput(0), ElementWiseOperation::kSUM); + auto lr53 = convBnLeaky(network, weightMap, *ew52->getOutput(0), 256, 1, 1, 0, 53); + auto lr54 = convBnLeaky(network, weightMap, *lr53->getOutput(0), 512, 3, 1, 1, 54); + auto ew55 = network->addElementWise(*lr54->getOutput(0), *ew52->getOutput(0), ElementWiseOperation::kSUM); + auto lr56 = convBnLeaky(network, weightMap, *ew55->getOutput(0), 256, 1, 1, 0, 56); + auto lr57 = convBnLeaky(network, weightMap, *lr56->getOutput(0), 512, 3, 1, 1, 57); + auto ew58 = network->addElementWise(*lr57->getOutput(0), *ew55->getOutput(0), ElementWiseOperation::kSUM); + auto lr59 = convBnLeaky(network, weightMap, *ew58->getOutput(0), 256, 1, 1, 0, 59); + auto lr60 = convBnLeaky(network, weightMap, *lr59->getOutput(0), 512, 3, 1, 1, 60); + auto ew61 = network->addElementWise(*lr60->getOutput(0), *ew58->getOutput(0), ElementWiseOperation::kSUM); + auto lr62 = convBnLeaky(network, weightMap, *ew61->getOutput(0), 1024, 3, 2, 1, 62); + auto lr63 = convBnLeaky(network, weightMap, *lr62->getOutput(0), 512, 1, 1, 0, 63); + auto lr64 = convBnLeaky(network, weightMap, *lr63->getOutput(0), 1024, 3, 1, 1, 64); + auto ew65 = network->addElementWise(*lr64->getOutput(0), *lr62->getOutput(0), ElementWiseOperation::kSUM); + auto lr66 = convBnLeaky(network, weightMap, *ew65->getOutput(0), 512, 1, 1, 0, 66); + auto lr67 = convBnLeaky(network, weightMap, *lr66->getOutput(0), 1024, 3, 1, 1, 67); + auto ew68 = network->addElementWise(*lr67->getOutput(0), *ew65->getOutput(0), ElementWiseOperation::kSUM); + auto lr69 = convBnLeaky(network, weightMap, *ew68->getOutput(0), 512, 1, 1, 0, 69); + auto lr70 = convBnLeaky(network, weightMap, *lr69->getOutput(0), 1024, 3, 1, 1, 70); + auto ew71 = network->addElementWise(*lr70->getOutput(0), *ew68->getOutput(0), ElementWiseOperation::kSUM); + auto lr72 = convBnLeaky(network, weightMap, *ew71->getOutput(0), 512, 1, 1, 0, 72); + auto lr73 = convBnLeaky(network, weightMap, *lr72->getOutput(0), 1024, 3, 1, 1, 73); + auto ew74 = network->addElementWise(*lr73->getOutput(0), *ew71->getOutput(0), ElementWiseOperation::kSUM); + auto lr75 = convBnLeaky(network, weightMap, *ew74->getOutput(0), 512, 1, 1, 0, 75); + auto lr76 = convBnLeaky(network, weightMap, *lr75->getOutput(0), 1024, 3, 1, 1, 76); 
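+    // Three detection heads follow: conv81, conv93 and conv105 each produce
+    // 3 * (CLASS_NUM + 5) output channels, and the grouped deconvolutions with
+    // constant 1.0 weights act as 2x nearest-neighbour upsampling between scales.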
+    auto lr77 = convBnLeaky(network, weightMap, *lr76->getOutput(0), 512, 1, 1, 0, 77);
+    auto lr78 = convBnLeaky(network, weightMap, *lr77->getOutput(0), 1024, 3, 1, 1, 78);
+    auto lr79 = convBnLeaky(network, weightMap, *lr78->getOutput(0), 512, 1, 1, 0, 79);
+    auto lr80 = convBnLeaky(network, weightMap, *lr79->getOutput(0), 1024, 3, 1, 1, 80);
+    IConvolutionLayer* conv81 = network->addConvolutionNd(*lr80->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.81.Conv2d.weight"], weightMap["module_list.81.Conv2d.bias"]);
+    assert(conv81);
+    // 82 is yolo
+    auto l83 = lr79;
+    auto lr84 = convBnLeaky(network, weightMap, *l83->getOutput(0), 256, 1, 1, 0, 84);
+
+    float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
+    for (int i = 0; i < 256 * 2 * 2; i++) {
+        deval[i] = 1.0;
+    }
+    Weights deconvwts85{DataType::kFLOAT, deval, 256 * 2 * 2};
+    IDeconvolutionLayer* deconv85 = network->addDeconvolutionNd(*lr84->getOutput(0), 256, DimsHW{2, 2}, deconvwts85, emptywts);
+    assert(deconv85);
+    deconv85->setStrideNd(DimsHW{2, 2});
+    deconv85->setNbGroups(256);
+    weightMap["deconv85"] = deconvwts85;
+
+    ITensor* inputTensors[] = {deconv85->getOutput(0), ew61->getOutput(0)};
+    auto cat86 = network->addConcatenation(inputTensors, 2);
+    auto lr87 = convBnLeaky(network, weightMap, *cat86->getOutput(0), 256, 1, 1, 0, 87);
+    auto lr88 = convBnLeaky(network, weightMap, *lr87->getOutput(0), 512, 3, 1, 1, 88);
+    auto lr89 = convBnLeaky(network, weightMap, *lr88->getOutput(0), 256, 1, 1, 0, 89);
+    auto lr90 = convBnLeaky(network, weightMap, *lr89->getOutput(0), 512, 3, 1, 1, 90);
+    auto lr91 = convBnLeaky(network, weightMap, *lr90->getOutput(0), 256, 1, 1, 0, 91);
+    auto lr92 = convBnLeaky(network, weightMap, *lr91->getOutput(0), 512, 3, 1, 1, 92);
+    IConvolutionLayer* conv93 = network->addConvolutionNd(*lr92->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.93.Conv2d.weight"], weightMap["module_list.93.Conv2d.bias"]);
+    assert(conv93);
+    // 94 is yolo
+    auto l95 = lr91;
+    auto lr96 = convBnLeaky(network, weightMap, *l95->getOutput(0), 128, 1, 1, 0, 96);
+    // reuse deval; only the first 128 * 2 * 2 entries are read for this layer
+    Weights deconvwts97{DataType::kFLOAT, deval, 128 * 2 * 2};
+    IDeconvolutionLayer* deconv97 = network->addDeconvolutionNd(*lr96->getOutput(0), 128, DimsHW{2, 2}, deconvwts97, emptywts);
+    assert(deconv97);
+    deconv97->setStrideNd(DimsHW{2, 2});
+    deconv97->setNbGroups(128);
+    ITensor* inputTensors1[] = {deconv97->getOutput(0), ew36->getOutput(0)};
+    auto cat98 = network->addConcatenation(inputTensors1, 2);
+    auto lr99 = convBnLeaky(network, weightMap, *cat98->getOutput(0), 128, 1, 1, 0, 99);
+    auto lr100 = convBnLeaky(network, weightMap, *lr99->getOutput(0), 256, 3, 1, 1, 100);
+    auto lr101 = convBnLeaky(network, weightMap, *lr100->getOutput(0), 128, 1, 1, 0, 101);
+    auto lr102 = convBnLeaky(network, weightMap, *lr101->getOutput(0), 256, 3, 1, 1, 102);
+    auto lr103 = convBnLeaky(network, weightMap, *lr102->getOutput(0), 128, 1, 1, 0, 103);
+    auto lr104 = convBnLeaky(network, weightMap, *lr103->getOutput(0), 256, 3, 1, 1, 104);
+    IConvolutionLayer* conv105 = network->addConvolutionNd(*lr104->getOutput(0), 3 * (CLASS_NUM + 5), DimsHW{1, 1}, weightMap["module_list.105.Conv2d.weight"], weightMap["module_list.105.Conv2d.bias"]);
+    assert(conv105);
+
+    auto creator = getPluginRegistry()->getPluginCreator("YoloLayer_TRT", "1");
+    // pass the class count to the plugin through a PluginField
+    auto class_num = CLASS_NUM;
+    std::vector<PluginField> vpf{
+        PluginField{"numClasses", &class_num, PluginFieldType::kINT32, 1}
+    };
+
+    PluginFieldCollection pfc{ (int)vpf.size(), vpf.data() };
+
IPluginV2 *pluginObj = creator->createPlugin("yololayer", &pfc); + ITensor* inputTensors_yolo[] = {conv81->getOutput(0), conv93->getOutput(0), conv105->getOutput(0)}; + auto yolo = network->addPluginV2(inputTensors_yolo, 3, *pluginObj); + + yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME); + network->markOutput(*yolo->getOutput(0)); + + // Build engine + builder->setMaxBatchSize(maxBatchSize); + config->setMaxWorkspaceSize(16 * (1 << 20)); // 16MB +#ifdef USE_FP16 + config->setFlag(BuilderFlag::kFP16); +#endif + std::cout << "Building engine, please wait for a while..." << std::endl; + ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config); + std::cout << "Build engine successfully!" << std::endl; + + // Don't need the network any more + network->destroy(); + + // Release host memory + for (auto& mem : weightMap) + { + free((void*) (mem.second.values)); + } + + return engine; +} + +void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path) { + // Create builder + IBuilder* builder = createInferBuilder(gLogger); + IBuilderConfig* config = builder->createBuilderConfig(); + + // Create model to populate the network, then set the outputs and create an engine + ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT, weights_path); + assert(engine != nullptr); + + // Serialize the engine + (*modelStream) = engine->serialize(); + + // Close everything down + engine->destroy(); + builder->destroy(); +} + +void doInference(IExecutionContext& context, float* input, float* output, int batchSize) { + const ICudaEngine& engine = context.getEngine(); + + // Pointers to input and output device buffers to pass to engine. + // Engine requires exactly IEngine::getNbBindings() number of buffers. + assert(engine.getNbBindings() == 2); + void* buffers[2]; + + // In order to bind the buffers, we need to know the names of the input and output tensors. 
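+    // getBindingIndex() maps each tensor name to its binding slot, so the buffers array
+    // can be filled in whatever order the engine assigned at build time.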
+ // Note that indices are guaranteed to be less than IEngine::getNbBindings() + const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); + const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); + + // Create GPU buffers on device + CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float))); + CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float))); + + // Create stream + cudaStream_t stream; + CHECK(cudaStreamCreate(&stream)); + + // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host + CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); + context.enqueue(batchSize, buffers, stream, nullptr); + CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); + + // Release stream and buffers + cudaStreamDestroy(stream); + CHECK(cudaFree(buffers[inputIndex])); + CHECK(cudaFree(buffers[outputIndex])); +} + +int read_files_in_dir(const char *p_dir_name, std::vector &file_names) { + DIR *p_dir = opendir(p_dir_name); + if (p_dir == nullptr) { + return -1; + } + + struct dirent* p_file = nullptr; + while ((p_file = readdir(p_dir)) != nullptr) { + if (strcmp(p_file->d_name, ".") != 0 && + strcmp(p_file->d_name, "..") != 0) { + //std::string cur_file_name(p_dir_name); + //cur_file_name += "/"; + //cur_file_name += p_file->d_name; + std::string cur_file_name(p_file->d_name); + file_names.push_back(cur_file_name); + } + } + + closedir(p_dir); + return 0; +} + +int main(int argc, char** argv) { + cudaSetDevice(DEVICE); + // create a model using the API directly and serialize it to a stream + char *trtModelStream{nullptr}; + size_t size{0}; + + if (argc == 2 && std::string(argv[1]) == "-s") { + IHostMemory* modelStream{nullptr}; + APIToModel(1, &modelStream, "yolov3.wts"); + assert(modelStream != nullptr); + std::ofstream p("yolov3.engine", std::ios::binary); + if (!p) { + std::cerr << "could not open plan output file" << std::endl; + return -1; + } + p.write(reinterpret_cast(modelStream->data()), modelStream->size()); + modelStream->destroy(); + return 0; + } else if (argc == 3 && std::string(argv[1]) == "-d") { + std::ifstream file("yolov3.engine", std::ios::binary); + if (file.good()) { + file.seekg(0, file.end); + size = file.tellg(); + file.seekg(0, file.beg); + trtModelStream = new char[size]; + assert(trtModelStream); + file.read(trtModelStream, size); + file.close(); + } + } else { + std::cerr << "arguments not right!" << std::endl; + std::cerr << "./yolov3 -s // serialize model to plan file" << std::endl; + std::cerr << "./yolov3 -d ../samples // deserialize plan file and run inference" << std::endl; + return -1; + } + + std::vector file_names; + if (read_files_in_dir(argv[2], file_names) < 0) { + std::cout << "read_files_in_dir failed." 
+
+int main(int argc, char** argv) {
+    cudaSetDevice(DEVICE);
+    // Create a model using the API directly and serialize it to a stream
+    char* trtModelStream{nullptr};
+    size_t size{0};
+
+    if (argc == 2 && std::string(argv[1]) == "-s") {
+        IHostMemory* modelStream{nullptr};
+        APIToModel(1, &modelStream, "yolov3.wts");
+        assert(modelStream != nullptr);
+        std::ofstream p("yolov3.engine", std::ios::binary);
+        if (!p) {
+            std::cerr << "could not open plan output file" << std::endl;
+            return -1;
+        }
+        p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+        modelStream->destroy();
+        return 0;
+    } else if (argc == 3 && std::string(argv[1]) == "-d") {
+        std::ifstream file("yolov3.engine", std::ios::binary);
+        if (file.good()) {
+            file.seekg(0, file.end);
+            size = file.tellg();
+            file.seekg(0, file.beg);
+            trtModelStream = new char[size];
+            assert(trtModelStream);
+            file.read(trtModelStream, size);
+            file.close();
+        }
+    } else {
+        std::cerr << "invalid arguments!" << std::endl;
+        std::cerr << "./yolov3 -s             // serialize model to plan file" << std::endl;
+        std::cerr << "./yolov3 -d ../samples  // deserialize plan file and run inference" << std::endl;
+        return -1;
+    }
+
+    std::vector<std::string> file_names;
+    if (read_files_in_dir(argv[2], file_names) < 0) {
+        std::cerr << "read_files_in_dir failed." << std::endl;
+        return -1;
+    }
+
+    // Prepare input data ---------------------------
+    static float data[3 * INPUT_H * INPUT_W];
+    static float prob[OUTPUT_SIZE];
+    IRuntime* runtime = createInferRuntime(gLogger);
+    assert(runtime != nullptr);
+    ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
+    assert(engine != nullptr);
+    IExecutionContext* context = engine->createExecutionContext();
+    assert(context != nullptr);
+    delete[] trtModelStream;
+
+    int fcount = 0;
+    for (const auto& f : file_names) {
+        fcount++;
+        std::cout << fcount << " " << f << std::endl;
+        cv::Mat img = cv::imread(std::string(argv[2]) + "/" + f);
+        if (img.empty()) continue;
+        cv::Mat pr_img = preprocess_img(img);
+        for (int i = 0; i < INPUT_H * INPUT_W; i++) {
+            data[i] = pr_img.at<cv::Vec3b>(i)[2] / 255.0;
+            data[i + INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[1] / 255.0;
+            data[i + 2 * INPUT_H * INPUT_W] = pr_img.at<cv::Vec3b>(i)[0] / 255.0;
+        }
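+
+        // The loop above unpacks the letterboxed BGR HWC image into a CHW
+        // float tensor (plane 0 = R, plane 1 = G, plane 2 = B), with each
+        // channel scaled to [0, 1].
+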
+        // Run inference
+        auto start = std::chrono::system_clock::now();
+        doInference(*context, data, prob, 1);
+        auto end = std::chrono::system_clock::now();
+        std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
+        std::vector<Yolo::Detection> res;
+        nms(res, prob);
+        // Debug output: the first 20 raw network outputs, then the detection count
+        for (int i = 0; i < 20; i++) {
+            std::cout << prob[i] << ",";
+        }
+        std::cout << res.size() << std::endl;
+        for (size_t j = 0; j < res.size(); j++) {
+            // Dump the detection's raw floats (bounding box plus confidence/class fields)
+            float* p = (float*) &res[j];
+            for (size_t k = 0; k < 7; k++) {
+                std::cout << p[k] << ", ";
+            }
+            std::cout << std::endl;
+            cv::Rect r = get_rect(img, res[j].bbox);
+            cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
+            cv::putText(img, std::to_string((int) res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
+        }
+        cv::imwrite("_" + f, img);
+    }
+
+    // Destroy the engine
+    context->destroy();
+    engine->destroy();
+    runtime->destroy();
+    return 0;
+}
\ No newline at end of file
diff --git a/uw_detection/src/yolov3_prepare_plan.cpp b/uw_detection/src/yolov3_prepare_plan.cpp
new file mode 100644
index 0000000..1ee7079
--- /dev/null
+++ b/uw_detection/src/yolov3_prepare_plan.cpp
@@ -0,0 +1,32 @@
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <ros/package.h>
+
+#define DEVICE 0  // GPU id
+
+using namespace nvinfer1;
+
+// APIToModel is implemented in the yolov3 library this executable links against.
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path);
+
+int main(int argc, char** argv)
+{
+  cudaSetDevice(DEVICE);
+  IHostMemory* modelStream{ nullptr };
+  std::string path = ros::package::getPath("uw_detection");
+  APIToModel(1, &modelStream, path + "/share/yolov3.wts");
+  assert(modelStream != nullptr);
+  std::ofstream p(path + "/share/yolov3.engine", std::ios::binary);
+  if (!p)
+  {
+    std::cerr << "could not open plan output file" << std::endl;
+    return -1;
+  }
+  p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+  modelStream->destroy();
+  return 0;
+}
diff --git a/uw_detection/src/yolov3_spp_prepare_plan.cpp b/uw_detection/src/yolov3_spp_prepare_plan.cpp
new file mode 100644
index 0000000..96df97b
--- /dev/null
+++ b/uw_detection/src/yolov3_spp_prepare_plan.cpp
@@ -0,0 +1,35 @@
+#include "NvInfer.h"
+#include "cuda_runtime_api.h"
+#include <cassert>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <ros/package.h>
+
+#define DEVICE 0  // GPU id
+
+using namespace nvinfer1;
+
+// APIToModel is implemented in the yolov3 library this executable links against.
+void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, const std::string& weights_path);
+
+int main(int argc, char** argv)
+{
+  cudaSetDevice(DEVICE);
+  IHostMemory* modelStream{ nullptr };
+  std::string path = ros::package::getPath("uw_detection");
+  APIToModel(1, &modelStream, path + "/share/yolov3-spp_ultralytics68.wts");
+  assert(modelStream != nullptr);
+  std::ofstream p(path + "/share/yolov3-spp.engine", std::ios::binary);
+  if (!p)
+  {
+    std::cerr << "could not open plan output file" << std::endl;
+    return -1;
+  }
+  p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
+  modelStream->destroy();
+  return 0;
+}
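+
+// Typical usage: run this executable once after placing the .wts weights in
+// the package share directory; it writes the serialized engine next to them.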